dsl_dataset.c revision 226707
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2011 by Delphix. All rights reserved. 24 * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>. 25 * All rights reserved. 26 */ 27 28#include <sys/dmu_objset.h> 29#include <sys/dsl_dataset.h> 30#include <sys/dsl_dir.h> 31#include <sys/dsl_prop.h> 32#include <sys/dsl_synctask.h> 33#include <sys/dmu_traverse.h> 34#include <sys/dmu_tx.h> 35#include <sys/arc.h> 36#include <sys/zio.h> 37#include <sys/zap.h> 38#include <sys/unique.h> 39#include <sys/zfs_context.h> 40#include <sys/zfs_ioctl.h> 41#include <sys/spa.h> 42#include <sys/zfs_znode.h> 43#include <sys/zfs_onexit.h> 44#include <sys/zvol.h> 45#include <sys/dsl_scan.h> 46#include <sys/dsl_deadlist.h> 47 48static char *dsl_reaper = "the grim reaper"; 49 50static dsl_checkfunc_t dsl_dataset_destroy_begin_check; 51static dsl_syncfunc_t dsl_dataset_destroy_begin_sync; 52static dsl_syncfunc_t dsl_dataset_set_reservation_sync; 53 54#define SWITCH64(x, y) \ 55 { \ 56 uint64_t __tmp = (x); \ 57 (x) = (y); \ 58 (y) = __tmp; \ 59 } 60 61#define DS_REF_MAX (1ULL << 62) 62 63#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE 64 65#define DSL_DATASET_IS_DESTROYED(ds) ((ds)->ds_owner == dsl_reaper) 66 67 68/* 69 * Figure out how much of this delta should be propogated to the dsl_dir 70 * layer. If there's a refreservation, that space has already been 71 * partially accounted for in our ancestors. 72 */ 73static int64_t 74parent_delta(dsl_dataset_t *ds, int64_t delta) 75{ 76 uint64_t old_bytes, new_bytes; 77 78 if (ds->ds_reserved == 0) 79 return (delta); 80 81 old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); 82 new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved); 83 84 ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta)); 85 return (new_bytes - old_bytes); 86} 87 88void 89dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) 90{ 91 int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); 92 int compressed = BP_GET_PSIZE(bp); 93 int uncompressed = BP_GET_UCSIZE(bp); 94 int64_t delta; 95 96 dprintf_bp(bp, "ds=%p", ds); 97 98 ASSERT(dmu_tx_is_syncing(tx)); 99 /* It could have been compressed away to nothing */ 100 if (BP_IS_HOLE(bp)) 101 return; 102 ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); 103 ASSERT3U(BP_GET_TYPE(bp), <, DMU_OT_NUMTYPES); 104 if (ds == NULL) { 105 /* 106 * Account for the meta-objset space in its placeholder 107 * dsl_dir. 108 */ 109 ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */ 110 dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, 111 used, compressed, uncompressed, tx); 112 dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); 113 return; 114 } 115 dmu_buf_will_dirty(ds->ds_dbuf, tx); 116 117 mutex_enter(&ds->ds_dir->dd_lock); 118 mutex_enter(&ds->ds_lock); 119 delta = parent_delta(ds, used); 120 ds->ds_phys->ds_used_bytes += used; 121 ds->ds_phys->ds_compressed_bytes += compressed; 122 ds->ds_phys->ds_uncompressed_bytes += uncompressed; 123 ds->ds_phys->ds_unique_bytes += used; 124 mutex_exit(&ds->ds_lock); 125 dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, 126 compressed, uncompressed, tx); 127 dsl_dir_transfer_space(ds->ds_dir, used - delta, 128 DD_USED_REFRSRV, DD_USED_HEAD, tx); 129 mutex_exit(&ds->ds_dir->dd_lock); 130} 131 132int 133dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, 134 boolean_t async) 135{ 136 if (BP_IS_HOLE(bp)) 137 return (0); 138 139 ASSERT(dmu_tx_is_syncing(tx)); 140 ASSERT(bp->blk_birth <= tx->tx_txg); 141 142 int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); 143 int compressed = BP_GET_PSIZE(bp); 144 int uncompressed = BP_GET_UCSIZE(bp); 145 146 ASSERT(used > 0); 147 if (ds == NULL) { 148 /* 149 * Account for the meta-objset space in its placeholder 150 * dataset. 151 */ 152 dsl_free(tx->tx_pool, tx->tx_txg, bp); 153 154 dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, 155 -used, -compressed, -uncompressed, tx); 156 dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); 157 return (used); 158 } 159 ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); 160 161 ASSERT(!dsl_dataset_is_snapshot(ds)); 162 dmu_buf_will_dirty(ds->ds_dbuf, tx); 163 164 if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { 165 int64_t delta; 166 167 dprintf_bp(bp, "freeing ds=%llu", ds->ds_object); 168 dsl_free(tx->tx_pool, tx->tx_txg, bp); 169 170 mutex_enter(&ds->ds_dir->dd_lock); 171 mutex_enter(&ds->ds_lock); 172 ASSERT(ds->ds_phys->ds_unique_bytes >= used || 173 !DS_UNIQUE_IS_ACCURATE(ds)); 174 delta = parent_delta(ds, -used); 175 ds->ds_phys->ds_unique_bytes -= used; 176 mutex_exit(&ds->ds_lock); 177 dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, 178 delta, -compressed, -uncompressed, tx); 179 dsl_dir_transfer_space(ds->ds_dir, -used - delta, 180 DD_USED_REFRSRV, DD_USED_HEAD, tx); 181 mutex_exit(&ds->ds_dir->dd_lock); 182 } else { 183 dprintf_bp(bp, "putting on dead list: %s", ""); 184 if (async) { 185 /* 186 * We are here as part of zio's write done callback, 187 * which means we're a zio interrupt thread. We can't 188 * call dsl_deadlist_insert() now because it may block 189 * waiting for I/O. Instead, put bp on the deferred 190 * queue and let dsl_pool_sync() finish the job. 191 */ 192 bplist_append(&ds->ds_pending_deadlist, bp); 193 } else { 194 dsl_deadlist_insert(&ds->ds_deadlist, bp, tx); 195 } 196 ASSERT3U(ds->ds_prev->ds_object, ==, 197 ds->ds_phys->ds_prev_snap_obj); 198 ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0); 199 /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ 200 if (ds->ds_prev->ds_phys->ds_next_snap_obj == 201 ds->ds_object && bp->blk_birth > 202 ds->ds_prev->ds_phys->ds_prev_snap_txg) { 203 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); 204 mutex_enter(&ds->ds_prev->ds_lock); 205 ds->ds_prev->ds_phys->ds_unique_bytes += used; 206 mutex_exit(&ds->ds_prev->ds_lock); 207 } 208 if (bp->blk_birth > ds->ds_dir->dd_origin_txg) { 209 dsl_dir_transfer_space(ds->ds_dir, used, 210 DD_USED_HEAD, DD_USED_SNAP, tx); 211 } 212 } 213 mutex_enter(&ds->ds_lock); 214 ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used); 215 ds->ds_phys->ds_used_bytes -= used; 216 ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed); 217 ds->ds_phys->ds_compressed_bytes -= compressed; 218 ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed); 219 ds->ds_phys->ds_uncompressed_bytes -= uncompressed; 220 mutex_exit(&ds->ds_lock); 221 222 return (used); 223} 224 225uint64_t 226dsl_dataset_prev_snap_txg(dsl_dataset_t *ds) 227{ 228 uint64_t trysnap = 0; 229 230 if (ds == NULL) 231 return (0); 232 /* 233 * The snapshot creation could fail, but that would cause an 234 * incorrect FALSE return, which would only result in an 235 * overestimation of the amount of space that an operation would 236 * consume, which is OK. 237 * 238 * There's also a small window where we could miss a pending 239 * snapshot, because we could set the sync task in the quiescing 240 * phase. So this should only be used as a guess. 241 */ 242 if (ds->ds_trysnap_txg > 243 spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa)) 244 trysnap = ds->ds_trysnap_txg; 245 return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap)); 246} 247 248boolean_t 249dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp, 250 uint64_t blk_birth) 251{ 252 if (blk_birth <= dsl_dataset_prev_snap_txg(ds)) 253 return (B_FALSE); 254 255 ddt_prefetch(dsl_dataset_get_spa(ds), bp); 256 257 return (B_TRUE); 258} 259 260/* ARGSUSED */ 261static void 262dsl_dataset_evict(dmu_buf_t *db, void *dsv) 263{ 264 dsl_dataset_t *ds = dsv; 265 266 ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds)); 267 268 unique_remove(ds->ds_fsid_guid); 269 270 if (ds->ds_objset != NULL) 271 dmu_objset_evict(ds->ds_objset); 272 273 if (ds->ds_prev) { 274 dsl_dataset_drop_ref(ds->ds_prev, ds); 275 ds->ds_prev = NULL; 276 } 277 278 bplist_destroy(&ds->ds_pending_deadlist); 279 if (db != NULL) { 280 dsl_deadlist_close(&ds->ds_deadlist); 281 } else { 282 ASSERT(ds->ds_deadlist.dl_dbuf == NULL); 283 ASSERT(!ds->ds_deadlist.dl_oldfmt); 284 } 285 if (ds->ds_dir) 286 dsl_dir_close(ds->ds_dir, ds); 287 288 ASSERT(!list_link_active(&ds->ds_synced_link)); 289 290 if (mutex_owned(&ds->ds_lock)) 291 mutex_exit(&ds->ds_lock); 292 mutex_destroy(&ds->ds_lock); 293 mutex_destroy(&ds->ds_recvlock); 294 if (mutex_owned(&ds->ds_opening_lock)) 295 mutex_exit(&ds->ds_opening_lock); 296 mutex_destroy(&ds->ds_opening_lock); 297 rw_destroy(&ds->ds_rwlock); 298 cv_destroy(&ds->ds_exclusive_cv); 299 300 kmem_free(ds, sizeof (dsl_dataset_t)); 301} 302 303static int 304dsl_dataset_get_snapname(dsl_dataset_t *ds) 305{ 306 dsl_dataset_phys_t *headphys; 307 int err; 308 dmu_buf_t *headdbuf; 309 dsl_pool_t *dp = ds->ds_dir->dd_pool; 310 objset_t *mos = dp->dp_meta_objset; 311 312 if (ds->ds_snapname[0]) 313 return (0); 314 if (ds->ds_phys->ds_next_snap_obj == 0) 315 return (0); 316 317 err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj, 318 FTAG, &headdbuf); 319 if (err) 320 return (err); 321 headphys = headdbuf->db_data; 322 err = zap_value_search(dp->dp_meta_objset, 323 headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname); 324 dmu_buf_rele(headdbuf, FTAG); 325 return (err); 326} 327 328static int 329dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) 330{ 331 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 332 uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; 333 matchtype_t mt; 334 int err; 335 336 if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) 337 mt = MT_FIRST; 338 else 339 mt = MT_EXACT; 340 341 err = zap_lookup_norm(mos, snapobj, name, 8, 1, 342 value, mt, NULL, 0, NULL); 343 if (err == ENOTSUP && mt == MT_FIRST) 344 err = zap_lookup(mos, snapobj, name, 8, 1, value); 345 return (err); 346} 347 348static int 349dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx) 350{ 351 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 352 uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj; 353 matchtype_t mt; 354 int err; 355 356 dsl_dir_snap_cmtime_update(ds->ds_dir); 357 358 if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) 359 mt = MT_FIRST; 360 else 361 mt = MT_EXACT; 362 363 err = zap_remove_norm(mos, snapobj, name, mt, tx); 364 if (err == ENOTSUP && mt == MT_FIRST) 365 err = zap_remove(mos, snapobj, name, tx); 366 return (err); 367} 368 369static int 370dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, 371 dsl_dataset_t **dsp) 372{ 373 objset_t *mos = dp->dp_meta_objset; 374 dmu_buf_t *dbuf; 375 dsl_dataset_t *ds; 376 int err; 377 dmu_object_info_t doi; 378 379 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || 380 dsl_pool_sync_context(dp)); 381 382 err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); 383 if (err) 384 return (err); 385 386 /* Make sure dsobj has the correct object type. */ 387 dmu_object_info_from_db(dbuf, &doi); 388 if (doi.doi_type != DMU_OT_DSL_DATASET) 389 return (EINVAL); 390 391 ds = dmu_buf_get_user(dbuf); 392 if (ds == NULL) { 393 dsl_dataset_t *winner; 394 395 ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); 396 ds->ds_dbuf = dbuf; 397 ds->ds_object = dsobj; 398 ds->ds_phys = dbuf->db_data; 399 400 mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); 401 mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL); 402 mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); 403 rw_init(&ds->ds_rwlock, 0, 0, 0); 404 cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL); 405 406 bplist_create(&ds->ds_pending_deadlist); 407 dsl_deadlist_open(&ds->ds_deadlist, 408 mos, ds->ds_phys->ds_deadlist_obj); 409 410 if (err == 0) { 411 err = dsl_dir_open_obj(dp, 412 ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir); 413 } 414 if (err) { 415 mutex_destroy(&ds->ds_lock); 416 mutex_destroy(&ds->ds_recvlock); 417 mutex_destroy(&ds->ds_opening_lock); 418 rw_destroy(&ds->ds_rwlock); 419 cv_destroy(&ds->ds_exclusive_cv); 420 bplist_destroy(&ds->ds_pending_deadlist); 421 dsl_deadlist_close(&ds->ds_deadlist); 422 kmem_free(ds, sizeof (dsl_dataset_t)); 423 dmu_buf_rele(dbuf, tag); 424 return (err); 425 } 426 427 if (!dsl_dataset_is_snapshot(ds)) { 428 ds->ds_snapname[0] = '\0'; 429 if (ds->ds_phys->ds_prev_snap_obj) { 430 err = dsl_dataset_get_ref(dp, 431 ds->ds_phys->ds_prev_snap_obj, 432 ds, &ds->ds_prev); 433 } 434 } else { 435 if (zfs_flags & ZFS_DEBUG_SNAPNAMES) 436 err = dsl_dataset_get_snapname(ds); 437 if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) { 438 err = zap_count( 439 ds->ds_dir->dd_pool->dp_meta_objset, 440 ds->ds_phys->ds_userrefs_obj, 441 &ds->ds_userrefs); 442 } 443 } 444 445 if (err == 0 && !dsl_dataset_is_snapshot(ds)) { 446 /* 447 * In sync context, we're called with either no lock 448 * or with the write lock. If we're not syncing, 449 * we're always called with the read lock held. 450 */ 451 boolean_t need_lock = 452 !RW_WRITE_HELD(&dp->dp_config_rwlock) && 453 dsl_pool_sync_context(dp); 454 455 if (need_lock) 456 rw_enter(&dp->dp_config_rwlock, RW_READER); 457 458 err = dsl_prop_get_ds(ds, 459 "refreservation", sizeof (uint64_t), 1, 460 &ds->ds_reserved, NULL); 461 if (err == 0) { 462 err = dsl_prop_get_ds(ds, 463 "refquota", sizeof (uint64_t), 1, 464 &ds->ds_quota, NULL); 465 } 466 467 if (need_lock) 468 rw_exit(&dp->dp_config_rwlock); 469 } else { 470 ds->ds_reserved = ds->ds_quota = 0; 471 } 472 473 if (err == 0) { 474 winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys, 475 dsl_dataset_evict); 476 } 477 if (err || winner) { 478 bplist_destroy(&ds->ds_pending_deadlist); 479 dsl_deadlist_close(&ds->ds_deadlist); 480 if (ds->ds_prev) 481 dsl_dataset_drop_ref(ds->ds_prev, ds); 482 dsl_dir_close(ds->ds_dir, ds); 483 mutex_destroy(&ds->ds_lock); 484 mutex_destroy(&ds->ds_recvlock); 485 mutex_destroy(&ds->ds_opening_lock); 486 rw_destroy(&ds->ds_rwlock); 487 cv_destroy(&ds->ds_exclusive_cv); 488 kmem_free(ds, sizeof (dsl_dataset_t)); 489 if (err) { 490 dmu_buf_rele(dbuf, tag); 491 return (err); 492 } 493 ds = winner; 494 } else { 495 ds->ds_fsid_guid = 496 unique_insert(ds->ds_phys->ds_fsid_guid); 497 } 498 } 499 ASSERT3P(ds->ds_dbuf, ==, dbuf); 500 ASSERT3P(ds->ds_phys, ==, dbuf->db_data); 501 ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 || 502 spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || 503 dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); 504 mutex_enter(&ds->ds_lock); 505 if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) { 506 mutex_exit(&ds->ds_lock); 507 dmu_buf_rele(ds->ds_dbuf, tag); 508 return (ENOENT); 509 } 510 mutex_exit(&ds->ds_lock); 511 *dsp = ds; 512 return (0); 513} 514 515static int 516dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag) 517{ 518 dsl_pool_t *dp = ds->ds_dir->dd_pool; 519 520 /* 521 * In syncing context we don't want the rwlock lock: there 522 * may be an existing writer waiting for sync phase to 523 * finish. We don't need to worry about such writers, since 524 * sync phase is single-threaded, so the writer can't be 525 * doing anything while we are active. 526 */ 527 if (dsl_pool_sync_context(dp)) { 528 ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); 529 return (0); 530 } 531 532 /* 533 * Normal users will hold the ds_rwlock as a READER until they 534 * are finished (i.e., call dsl_dataset_rele()). "Owners" will 535 * drop their READER lock after they set the ds_owner field. 536 * 537 * If the dataset is being destroyed, the destroy thread will 538 * obtain a WRITER lock for exclusive access after it's done its 539 * open-context work and then change the ds_owner to 540 * dsl_reaper once destruction is assured. So threads 541 * may block here temporarily, until the "destructability" of 542 * the dataset is determined. 543 */ 544 ASSERT(!RW_WRITE_HELD(&dp->dp_config_rwlock)); 545 mutex_enter(&ds->ds_lock); 546 while (!rw_tryenter(&ds->ds_rwlock, RW_READER)) { 547 rw_exit(&dp->dp_config_rwlock); 548 cv_wait(&ds->ds_exclusive_cv, &ds->ds_lock); 549 if (DSL_DATASET_IS_DESTROYED(ds)) { 550 mutex_exit(&ds->ds_lock); 551 dsl_dataset_drop_ref(ds, tag); 552 rw_enter(&dp->dp_config_rwlock, RW_READER); 553 return (ENOENT); 554 } 555 /* 556 * The dp_config_rwlock lives above the ds_lock. And 557 * we need to check DSL_DATASET_IS_DESTROYED() while 558 * holding the ds_lock, so we have to drop and reacquire 559 * the ds_lock here. 560 */ 561 mutex_exit(&ds->ds_lock); 562 rw_enter(&dp->dp_config_rwlock, RW_READER); 563 mutex_enter(&ds->ds_lock); 564 } 565 mutex_exit(&ds->ds_lock); 566 return (0); 567} 568 569int 570dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, 571 dsl_dataset_t **dsp) 572{ 573 int err = dsl_dataset_get_ref(dp, dsobj, tag, dsp); 574 575 if (err) 576 return (err); 577 return (dsl_dataset_hold_ref(*dsp, tag)); 578} 579 580int 581dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok, 582 void *tag, dsl_dataset_t **dsp) 583{ 584 int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); 585 if (err) 586 return (err); 587 if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { 588 dsl_dataset_rele(*dsp, tag); 589 *dsp = NULL; 590 return (EBUSY); 591 } 592 return (0); 593} 594 595int 596dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) 597{ 598 dsl_dir_t *dd; 599 dsl_pool_t *dp; 600 const char *snapname; 601 uint64_t obj; 602 int err = 0; 603 604 err = dsl_dir_open_spa(NULL, name, FTAG, &dd, &snapname); 605 if (err) 606 return (err); 607 608 dp = dd->dd_pool; 609 obj = dd->dd_phys->dd_head_dataset_obj; 610 rw_enter(&dp->dp_config_rwlock, RW_READER); 611 if (obj) 612 err = dsl_dataset_get_ref(dp, obj, tag, dsp); 613 else 614 err = ENOENT; 615 if (err) 616 goto out; 617 618 err = dsl_dataset_hold_ref(*dsp, tag); 619 620 /* we may be looking for a snapshot */ 621 if (err == 0 && snapname != NULL) { 622 dsl_dataset_t *ds = NULL; 623 624 if (*snapname++ != '@') { 625 dsl_dataset_rele(*dsp, tag); 626 err = ENOENT; 627 goto out; 628 } 629 630 dprintf("looking for snapshot '%s'\n", snapname); 631 err = dsl_dataset_snap_lookup(*dsp, snapname, &obj); 632 if (err == 0) 633 err = dsl_dataset_get_ref(dp, obj, tag, &ds); 634 dsl_dataset_rele(*dsp, tag); 635 636 ASSERT3U((err == 0), ==, (ds != NULL)); 637 638 if (ds) { 639 mutex_enter(&ds->ds_lock); 640 if (ds->ds_snapname[0] == 0) 641 (void) strlcpy(ds->ds_snapname, snapname, 642 sizeof (ds->ds_snapname)); 643 mutex_exit(&ds->ds_lock); 644 err = dsl_dataset_hold_ref(ds, tag); 645 *dsp = err ? NULL : ds; 646 } 647 } 648out: 649 rw_exit(&dp->dp_config_rwlock); 650 dsl_dir_close(dd, FTAG); 651 return (err); 652} 653 654int 655dsl_dataset_own(const char *name, boolean_t inconsistentok, 656 void *tag, dsl_dataset_t **dsp) 657{ 658 int err = dsl_dataset_hold(name, tag, dsp); 659 if (err) 660 return (err); 661 if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { 662 dsl_dataset_rele(*dsp, tag); 663 return (EBUSY); 664 } 665 return (0); 666} 667 668void 669dsl_dataset_name(dsl_dataset_t *ds, char *name) 670{ 671 if (ds == NULL) { 672 (void) strcpy(name, "mos"); 673 } else { 674 dsl_dir_name(ds->ds_dir, name); 675 VERIFY(0 == dsl_dataset_get_snapname(ds)); 676 if (ds->ds_snapname[0]) { 677 (void) strcat(name, "@"); 678 /* 679 * We use a "recursive" mutex so that we 680 * can call dprintf_ds() with ds_lock held. 681 */ 682 if (!MUTEX_HELD(&ds->ds_lock)) { 683 mutex_enter(&ds->ds_lock); 684 (void) strcat(name, ds->ds_snapname); 685 mutex_exit(&ds->ds_lock); 686 } else { 687 (void) strcat(name, ds->ds_snapname); 688 } 689 } 690 } 691} 692 693static int 694dsl_dataset_namelen(dsl_dataset_t *ds) 695{ 696 int result; 697 698 if (ds == NULL) { 699 result = 3; /* "mos" */ 700 } else { 701 result = dsl_dir_namelen(ds->ds_dir); 702 VERIFY(0 == dsl_dataset_get_snapname(ds)); 703 if (ds->ds_snapname[0]) { 704 ++result; /* adding one for the @-sign */ 705 if (!MUTEX_HELD(&ds->ds_lock)) { 706 mutex_enter(&ds->ds_lock); 707 result += strlen(ds->ds_snapname); 708 mutex_exit(&ds->ds_lock); 709 } else { 710 result += strlen(ds->ds_snapname); 711 } 712 } 713 } 714 715 return (result); 716} 717 718void 719dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag) 720{ 721 dmu_buf_rele(ds->ds_dbuf, tag); 722} 723 724void 725dsl_dataset_rele(dsl_dataset_t *ds, void *tag) 726{ 727 if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) { 728 rw_exit(&ds->ds_rwlock); 729 } 730 dsl_dataset_drop_ref(ds, tag); 731} 732 733void 734dsl_dataset_disown(dsl_dataset_t *ds, void *tag) 735{ 736 ASSERT((ds->ds_owner == tag && ds->ds_dbuf) || 737 (DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL)); 738 739 mutex_enter(&ds->ds_lock); 740 ds->ds_owner = NULL; 741 if (RW_WRITE_HELD(&ds->ds_rwlock)) { 742 rw_exit(&ds->ds_rwlock); 743 cv_broadcast(&ds->ds_exclusive_cv); 744 } 745 mutex_exit(&ds->ds_lock); 746 if (ds->ds_dbuf) 747 dsl_dataset_drop_ref(ds, tag); 748 else 749 dsl_dataset_evict(NULL, ds); 750} 751 752boolean_t 753dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag) 754{ 755 boolean_t gotit = FALSE; 756 757 mutex_enter(&ds->ds_lock); 758 if (ds->ds_owner == NULL && 759 (!DS_IS_INCONSISTENT(ds) || inconsistentok)) { 760 ds->ds_owner = tag; 761 if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) 762 rw_exit(&ds->ds_rwlock); 763 gotit = TRUE; 764 } 765 mutex_exit(&ds->ds_lock); 766 return (gotit); 767} 768 769void 770dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner) 771{ 772 ASSERT3P(owner, ==, ds->ds_owner); 773 if (!RW_WRITE_HELD(&ds->ds_rwlock)) 774 rw_enter(&ds->ds_rwlock, RW_WRITER); 775} 776 777uint64_t 778dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, 779 uint64_t flags, dmu_tx_t *tx) 780{ 781 dsl_pool_t *dp = dd->dd_pool; 782 dmu_buf_t *dbuf; 783 dsl_dataset_phys_t *dsphys; 784 uint64_t dsobj; 785 objset_t *mos = dp->dp_meta_objset; 786 787 if (origin == NULL) 788 origin = dp->dp_origin_snap; 789 790 ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp); 791 ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0); 792 ASSERT(dmu_tx_is_syncing(tx)); 793 ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); 794 795 dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, 796 DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); 797 VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); 798 dmu_buf_will_dirty(dbuf, tx); 799 dsphys = dbuf->db_data; 800 bzero(dsphys, sizeof (dsl_dataset_phys_t)); 801 dsphys->ds_dir_obj = dd->dd_object; 802 dsphys->ds_flags = flags; 803 dsphys->ds_fsid_guid = unique_create(); 804 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, 805 sizeof (dsphys->ds_guid)); 806 dsphys->ds_snapnames_zapobj = 807 zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP, 808 DMU_OT_NONE, 0, tx); 809 dsphys->ds_creation_time = gethrestime_sec(); 810 dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg; 811 812 if (origin == NULL) { 813 dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx); 814 } else { 815 dsl_dataset_t *ohds; 816 817 dsphys->ds_prev_snap_obj = origin->ds_object; 818 dsphys->ds_prev_snap_txg = 819 origin->ds_phys->ds_creation_txg; 820 dsphys->ds_used_bytes = 821 origin->ds_phys->ds_used_bytes; 822 dsphys->ds_compressed_bytes = 823 origin->ds_phys->ds_compressed_bytes; 824 dsphys->ds_uncompressed_bytes = 825 origin->ds_phys->ds_uncompressed_bytes; 826 dsphys->ds_bp = origin->ds_phys->ds_bp; 827 dsphys->ds_flags |= origin->ds_phys->ds_flags; 828 829 dmu_buf_will_dirty(origin->ds_dbuf, tx); 830 origin->ds_phys->ds_num_children++; 831 832 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, 833 origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds)); 834 dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist, 835 dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx); 836 dsl_dataset_rele(ohds, FTAG); 837 838 if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) { 839 if (origin->ds_phys->ds_next_clones_obj == 0) { 840 origin->ds_phys->ds_next_clones_obj = 841 zap_create(mos, 842 DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); 843 } 844 VERIFY(0 == zap_add_int(mos, 845 origin->ds_phys->ds_next_clones_obj, 846 dsobj, tx)); 847 } 848 849 dmu_buf_will_dirty(dd->dd_dbuf, tx); 850 dd->dd_phys->dd_origin_obj = origin->ds_object; 851 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { 852 if (origin->ds_dir->dd_phys->dd_clones == 0) { 853 dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); 854 origin->ds_dir->dd_phys->dd_clones = 855 zap_create(mos, 856 DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); 857 } 858 VERIFY3U(0, ==, zap_add_int(mos, 859 origin->ds_dir->dd_phys->dd_clones, dsobj, tx)); 860 } 861 } 862 863 if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) 864 dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; 865 866 dmu_buf_rele(dbuf, FTAG); 867 868 dmu_buf_will_dirty(dd->dd_dbuf, tx); 869 dd->dd_phys->dd_head_dataset_obj = dsobj; 870 871 return (dsobj); 872} 873 874uint64_t 875dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, 876 dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx) 877{ 878 dsl_pool_t *dp = pdd->dd_pool; 879 uint64_t dsobj, ddobj; 880 dsl_dir_t *dd; 881 882 ASSERT(lastname[0] != '@'); 883 884 ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); 885 VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd)); 886 887 dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx); 888 889 dsl_deleg_set_create_perms(dd, tx, cr); 890 891 dsl_dir_close(dd, FTAG); 892 893 /* 894 * If we are creating a clone, make sure we zero out any stale 895 * data from the origin snapshots zil header. 896 */ 897 if (origin != NULL) { 898 dsl_dataset_t *ds; 899 objset_t *os; 900 901 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); 902 VERIFY3U(0, ==, dmu_objset_from_ds(ds, &os)); 903 bzero(&os->os_zil_header, sizeof (os->os_zil_header)); 904 dsl_dataset_dirty(ds, tx); 905 dsl_dataset_rele(ds, FTAG); 906 } 907 908 return (dsobj); 909} 910 911struct destroyarg { 912 dsl_sync_task_group_t *dstg; 913 char *snapname; 914 char *failed; 915 boolean_t defer; 916}; 917 918static int 919dsl_snapshot_destroy_one(const char *name, void *arg) 920{ 921 struct destroyarg *da = arg; 922 dsl_dataset_t *ds; 923 int err; 924 char *dsname; 925 926 dsname = kmem_asprintf("%s@%s", name, da->snapname); 927 err = dsl_dataset_own(dsname, B_TRUE, da->dstg, &ds); 928 strfree(dsname); 929 if (err == 0) { 930 struct dsl_ds_destroyarg *dsda; 931 932 dsl_dataset_make_exclusive(ds, da->dstg); 933 dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP); 934 dsda->ds = ds; 935 dsda->defer = da->defer; 936 dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check, 937 dsl_dataset_destroy_sync, dsda, da->dstg, 0); 938 } else if (err == ENOENT) { 939 err = 0; 940 } else { 941 (void) strcpy(da->failed, name); 942 } 943 return (err); 944} 945 946/* 947 * Destroy 'snapname' in all descendants of 'fsname'. 948 */ 949#pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy 950int 951dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer) 952{ 953 int err; 954 struct destroyarg da; 955 dsl_sync_task_t *dst; 956 spa_t *spa; 957 958 err = spa_open(fsname, &spa, FTAG); 959 if (err) 960 return (err); 961 da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 962 da.snapname = snapname; 963 da.failed = fsname; 964 da.defer = defer; 965 966 err = dmu_objset_find(fsname, 967 dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN); 968 969 if (err == 0) 970 err = dsl_sync_task_group_wait(da.dstg); 971 972 for (dst = list_head(&da.dstg->dstg_tasks); dst; 973 dst = list_next(&da.dstg->dstg_tasks, dst)) { 974 struct dsl_ds_destroyarg *dsda = dst->dst_arg1; 975 dsl_dataset_t *ds = dsda->ds; 976 977 /* 978 * Return the file system name that triggered the error 979 */ 980 if (dst->dst_err) { 981 dsl_dataset_name(ds, fsname); 982 *strchr(fsname, '@') = '\0'; 983 } 984 ASSERT3P(dsda->rm_origin, ==, NULL); 985 dsl_dataset_disown(ds, da.dstg); 986 kmem_free(dsda, sizeof (struct dsl_ds_destroyarg)); 987 } 988 989 dsl_sync_task_group_destroy(da.dstg); 990 spa_close(spa, FTAG); 991 return (err); 992} 993 994static boolean_t 995dsl_dataset_might_destroy_origin(dsl_dataset_t *ds) 996{ 997 boolean_t might_destroy = B_FALSE; 998 999 mutex_enter(&ds->ds_lock); 1000 if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 && 1001 DS_IS_DEFER_DESTROY(ds)) 1002 might_destroy = B_TRUE; 1003 mutex_exit(&ds->ds_lock); 1004 1005 return (might_destroy); 1006} 1007 1008/* 1009 * If we're removing a clone, and these three conditions are true: 1010 * 1) the clone's origin has no other children 1011 * 2) the clone's origin has no user references 1012 * 3) the clone's origin has been marked for deferred destruction 1013 * Then, prepare to remove the origin as part of this sync task group. 1014 */ 1015static int 1016dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag) 1017{ 1018 dsl_dataset_t *ds = dsda->ds; 1019 dsl_dataset_t *origin = ds->ds_prev; 1020 1021 if (dsl_dataset_might_destroy_origin(origin)) { 1022 char *name; 1023 int namelen; 1024 int error; 1025 1026 namelen = dsl_dataset_namelen(origin) + 1; 1027 name = kmem_alloc(namelen, KM_SLEEP); 1028 dsl_dataset_name(origin, name); 1029#ifdef _KERNEL 1030 error = zfs_unmount_snap(name, NULL); 1031 if (error) { 1032 kmem_free(name, namelen); 1033 return (error); 1034 } 1035#endif 1036 error = dsl_dataset_own(name, B_TRUE, tag, &origin); 1037 kmem_free(name, namelen); 1038 if (error) 1039 return (error); 1040 dsda->rm_origin = origin; 1041 dsl_dataset_make_exclusive(origin, tag); 1042 } 1043 1044 return (0); 1045} 1046 1047/* 1048 * ds must be opened as OWNER. On return (whether successful or not), 1049 * ds will be closed and caller can no longer dereference it. 1050 */ 1051int 1052dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) 1053{ 1054 int err; 1055 dsl_sync_task_group_t *dstg; 1056 objset_t *os; 1057 dsl_dir_t *dd; 1058 uint64_t obj; 1059 struct dsl_ds_destroyarg dsda = { 0 }; 1060 dsl_dataset_t dummy_ds = { 0 }; 1061 1062 dsda.ds = ds; 1063 1064 if (dsl_dataset_is_snapshot(ds)) { 1065 /* Destroying a snapshot is simpler */ 1066 dsl_dataset_make_exclusive(ds, tag); 1067 1068 dsda.defer = defer; 1069 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 1070 dsl_dataset_destroy_check, dsl_dataset_destroy_sync, 1071 &dsda, tag, 0); 1072 ASSERT3P(dsda.rm_origin, ==, NULL); 1073 goto out; 1074 } else if (defer) { 1075 err = EINVAL; 1076 goto out; 1077 } 1078 1079 dd = ds->ds_dir; 1080 dummy_ds.ds_dir = dd; 1081 dummy_ds.ds_object = ds->ds_object; 1082 1083 /* 1084 * Check for errors and mark this ds as inconsistent, in 1085 * case we crash while freeing the objects. 1086 */ 1087 err = dsl_sync_task_do(dd->dd_pool, dsl_dataset_destroy_begin_check, 1088 dsl_dataset_destroy_begin_sync, ds, NULL, 0); 1089 if (err) 1090 goto out; 1091 1092 err = dmu_objset_from_ds(ds, &os); 1093 if (err) 1094 goto out; 1095 1096 /* 1097 * remove the objects in open context, so that we won't 1098 * have too much to do in syncing context. 1099 */ 1100 for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 1101 ds->ds_phys->ds_prev_snap_txg)) { 1102 /* 1103 * Ignore errors, if there is not enough disk space 1104 * we will deal with it in dsl_dataset_destroy_sync(). 1105 */ 1106 (void) dmu_free_object(os, obj); 1107 } 1108 if (err != ESRCH) 1109 goto out; 1110 1111 /* 1112 * Only the ZIL knows how to free log blocks. 1113 */ 1114 zil_destroy(dmu_objset_zil(os), B_FALSE); 1115 1116 /* 1117 * Sync out all in-flight IO. 1118 */ 1119 txg_wait_synced(dd->dd_pool, 0); 1120 1121 /* 1122 * If we managed to free all the objects in open 1123 * context, the user space accounting should be zero. 1124 */ 1125 if (ds->ds_phys->ds_bp.blk_fill == 0 && 1126 dmu_objset_userused_enabled(os)) { 1127 uint64_t count; 1128 1129 ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 || 1130 count == 0); 1131 ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 || 1132 count == 0); 1133 } 1134 1135 rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); 1136 err = dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, FTAG, &dd); 1137 rw_exit(&dd->dd_pool->dp_config_rwlock); 1138 1139 if (err) 1140 goto out; 1141 1142 /* 1143 * Blow away the dsl_dir + head dataset. 1144 */ 1145 dsl_dataset_make_exclusive(ds, tag); 1146 /* 1147 * If we're removing a clone, we might also need to remove its 1148 * origin. 1149 */ 1150 do { 1151 dsda.need_prep = B_FALSE; 1152 if (dsl_dir_is_clone(dd)) { 1153 err = dsl_dataset_origin_rm_prep(&dsda, tag); 1154 if (err) { 1155 dsl_dir_close(dd, FTAG); 1156 goto out; 1157 } 1158 } 1159 1160 dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); 1161 dsl_sync_task_create(dstg, dsl_dataset_destroy_check, 1162 dsl_dataset_destroy_sync, &dsda, tag, 0); 1163 dsl_sync_task_create(dstg, dsl_dir_destroy_check, 1164 dsl_dir_destroy_sync, &dummy_ds, FTAG, 0); 1165 err = dsl_sync_task_group_wait(dstg); 1166 dsl_sync_task_group_destroy(dstg); 1167 1168 /* 1169 * We could be racing against 'zfs release' or 'zfs destroy -d' 1170 * on the origin snap, in which case we can get EBUSY if we 1171 * needed to destroy the origin snap but were not ready to 1172 * do so. 1173 */ 1174 if (dsda.need_prep) { 1175 ASSERT(err == EBUSY); 1176 ASSERT(dsl_dir_is_clone(dd)); 1177 ASSERT(dsda.rm_origin == NULL); 1178 } 1179 } while (dsda.need_prep); 1180 1181 if (dsda.rm_origin != NULL) 1182 dsl_dataset_disown(dsda.rm_origin, tag); 1183 1184 /* if it is successful, dsl_dir_destroy_sync will close the dd */ 1185 if (err) 1186 dsl_dir_close(dd, FTAG); 1187out: 1188 dsl_dataset_disown(ds, tag); 1189 return (err); 1190} 1191 1192blkptr_t * 1193dsl_dataset_get_blkptr(dsl_dataset_t *ds) 1194{ 1195 return (&ds->ds_phys->ds_bp); 1196} 1197 1198void 1199dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) 1200{ 1201 ASSERT(dmu_tx_is_syncing(tx)); 1202 /* If it's the meta-objset, set dp_meta_rootbp */ 1203 if (ds == NULL) { 1204 tx->tx_pool->dp_meta_rootbp = *bp; 1205 } else { 1206 dmu_buf_will_dirty(ds->ds_dbuf, tx); 1207 ds->ds_phys->ds_bp = *bp; 1208 } 1209} 1210 1211spa_t * 1212dsl_dataset_get_spa(dsl_dataset_t *ds) 1213{ 1214 return (ds->ds_dir->dd_pool->dp_spa); 1215} 1216 1217void 1218dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) 1219{ 1220 dsl_pool_t *dp; 1221 1222 if (ds == NULL) /* this is the meta-objset */ 1223 return; 1224 1225 ASSERT(ds->ds_objset != NULL); 1226 1227 if (ds->ds_phys->ds_next_snap_obj != 0) 1228 panic("dirtying snapshot!"); 1229 1230 dp = ds->ds_dir->dd_pool; 1231 1232 if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg) == 0) { 1233 /* up the hold count until we can be written out */ 1234 dmu_buf_add_ref(ds->ds_dbuf, ds); 1235 } 1236} 1237 1238/* 1239 * The unique space in the head dataset can be calculated by subtracting 1240 * the space used in the most recent snapshot, that is still being used 1241 * in this file system, from the space currently in use. To figure out 1242 * the space in the most recent snapshot still in use, we need to take 1243 * the total space used in the snapshot and subtract out the space that 1244 * has been freed up since the snapshot was taken. 1245 */ 1246static void 1247dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) 1248{ 1249 uint64_t mrs_used; 1250 uint64_t dlused, dlcomp, dluncomp; 1251 1252 ASSERT(!dsl_dataset_is_snapshot(ds)); 1253 1254 if (ds->ds_phys->ds_prev_snap_obj != 0) 1255 mrs_used = ds->ds_prev->ds_phys->ds_used_bytes; 1256 else 1257 mrs_used = 0; 1258 1259 dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); 1260 1261 ASSERT3U(dlused, <=, mrs_used); 1262 ds->ds_phys->ds_unique_bytes = 1263 ds->ds_phys->ds_used_bytes - (mrs_used - dlused); 1264 1265 if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= 1266 SPA_VERSION_UNIQUE_ACCURATE) 1267 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; 1268} 1269 1270struct killarg { 1271 dsl_dataset_t *ds; 1272 dmu_tx_t *tx; 1273}; 1274 1275/* ARGSUSED */ 1276static int 1277kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, 1278 const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) 1279{ 1280 struct killarg *ka = arg; 1281 dmu_tx_t *tx = ka->tx; 1282 1283 if (bp == NULL) 1284 return (0); 1285 1286 if (zb->zb_level == ZB_ZIL_LEVEL) { 1287 ASSERT(zilog != NULL); 1288 /* 1289 * It's a block in the intent log. It has no 1290 * accounting, so just free it. 1291 */ 1292 dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); 1293 } else { 1294 ASSERT(zilog == NULL); 1295 ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); 1296 (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); 1297 } 1298 1299 return (0); 1300} 1301 1302/* ARGSUSED */ 1303static int 1304dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) 1305{ 1306 dsl_dataset_t *ds = arg1; 1307 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 1308 uint64_t count; 1309 int err; 1310 1311 /* 1312 * Can't delete a head dataset if there are snapshots of it. 1313 * (Except if the only snapshots are from the branch we cloned 1314 * from.) 1315 */ 1316 if (ds->ds_prev != NULL && 1317 ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) 1318 return (EBUSY); 1319 1320 /* 1321 * This is really a dsl_dir thing, but check it here so that 1322 * we'll be less likely to leave this dataset inconsistent & 1323 * nearly destroyed. 1324 */ 1325 err = zap_count(mos, ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count); 1326 if (err) 1327 return (err); 1328 if (count != 0) 1329 return (EEXIST); 1330 1331 return (0); 1332} 1333 1334/* ARGSUSED */ 1335static void 1336dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, dmu_tx_t *tx) 1337{ 1338 dsl_dataset_t *ds = arg1; 1339 dsl_pool_t *dp = ds->ds_dir->dd_pool; 1340 1341 /* Mark it as inconsistent on-disk, in case we crash */ 1342 dmu_buf_will_dirty(ds->ds_dbuf, tx); 1343 ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; 1344 1345 spa_history_log_internal(LOG_DS_DESTROY_BEGIN, dp->dp_spa, tx, 1346 "dataset = %llu", ds->ds_object); 1347} 1348 1349static int 1350dsl_dataset_origin_check(struct dsl_ds_destroyarg *dsda, void *tag, 1351 dmu_tx_t *tx) 1352{ 1353 dsl_dataset_t *ds = dsda->ds; 1354 dsl_dataset_t *ds_prev = ds->ds_prev; 1355 1356 if (dsl_dataset_might_destroy_origin(ds_prev)) { 1357 struct dsl_ds_destroyarg ndsda = {0}; 1358 1359 /* 1360 * If we're not prepared to remove the origin, don't remove 1361 * the clone either. 1362 */ 1363 if (dsda->rm_origin == NULL) { 1364 dsda->need_prep = B_TRUE; 1365 return (EBUSY); 1366 } 1367 1368 ndsda.ds = ds_prev; 1369 ndsda.is_origin_rm = B_TRUE; 1370 return (dsl_dataset_destroy_check(&ndsda, tag, tx)); 1371 } 1372 1373 /* 1374 * If we're not going to remove the origin after all, 1375 * undo the open context setup. 1376 */ 1377 if (dsda->rm_origin != NULL) { 1378 dsl_dataset_disown(dsda->rm_origin, tag); 1379 dsda->rm_origin = NULL; 1380 } 1381 1382 return (0); 1383} 1384 1385/* 1386 * If you add new checks here, you may need to add 1387 * additional checks to the "temporary" case in 1388 * snapshot_check() in dmu_objset.c. 1389 */ 1390/* ARGSUSED */ 1391int 1392dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) 1393{ 1394 struct dsl_ds_destroyarg *dsda = arg1; 1395 dsl_dataset_t *ds = dsda->ds; 1396 1397 /* we have an owner hold, so noone else can destroy us */ 1398 ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); 1399 1400 /* 1401 * Only allow deferred destroy on pools that support it. 1402 * NOTE: deferred destroy is only supported on snapshots. 1403 */ 1404 if (dsda->defer) { 1405 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < 1406 SPA_VERSION_USERREFS) 1407 return (ENOTSUP); 1408 ASSERT(dsl_dataset_is_snapshot(ds)); 1409 return (0); 1410 } 1411 1412 /* 1413 * Can't delete a head dataset if there are snapshots of it. 1414 * (Except if the only snapshots are from the branch we cloned 1415 * from.) 1416 */ 1417 if (ds->ds_prev != NULL && 1418 ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) 1419 return (EBUSY); 1420 1421 /* 1422 * If we made changes this txg, traverse_dsl_dataset won't find 1423 * them. Try again. 1424 */ 1425 if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) 1426 return (EAGAIN); 1427 1428 if (dsl_dataset_is_snapshot(ds)) { 1429 /* 1430 * If this snapshot has an elevated user reference count, 1431 * we can't destroy it yet. 1432 */ 1433 if (ds->ds_userrefs > 0 && !dsda->releasing) 1434 return (EBUSY); 1435 1436 mutex_enter(&ds->ds_lock); 1437 /* 1438 * Can't delete a branch point. However, if we're destroying 1439 * a clone and removing its origin due to it having a user 1440 * hold count of 0 and having been marked for deferred destroy, 1441 * it's OK for the origin to have a single clone. 1442 */ 1443 if (ds->ds_phys->ds_num_children > 1444 (dsda->is_origin_rm ? 2 : 1)) { 1445 mutex_exit(&ds->ds_lock); 1446 return (EEXIST); 1447 } 1448 mutex_exit(&ds->ds_lock); 1449 } else if (dsl_dir_is_clone(ds->ds_dir)) { 1450 return (dsl_dataset_origin_check(dsda, arg2, tx)); 1451 } 1452 1453 /* XXX we should do some i/o error checking... */ 1454 return (0); 1455} 1456 1457struct refsarg { 1458 kmutex_t lock; 1459 boolean_t gone; 1460 kcondvar_t cv; 1461}; 1462 1463/* ARGSUSED */ 1464static void 1465dsl_dataset_refs_gone(dmu_buf_t *db, void *argv) 1466{ 1467 struct refsarg *arg = argv; 1468 1469 mutex_enter(&arg->lock); 1470 arg->gone = TRUE; 1471 cv_signal(&arg->cv); 1472 mutex_exit(&arg->lock); 1473} 1474 1475static void 1476dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag) 1477{ 1478 struct refsarg arg; 1479 1480 bzero(&arg, sizeof(arg)); 1481 mutex_init(&arg.lock, NULL, MUTEX_DEFAULT, NULL); 1482 cv_init(&arg.cv, NULL, CV_DEFAULT, NULL); 1483 arg.gone = FALSE; 1484 (void) dmu_buf_update_user(ds->ds_dbuf, ds, &arg, &ds->ds_phys, 1485 dsl_dataset_refs_gone); 1486 dmu_buf_rele(ds->ds_dbuf, tag); 1487 mutex_enter(&arg.lock); 1488 while (!arg.gone) 1489 cv_wait(&arg.cv, &arg.lock); 1490 ASSERT(arg.gone); 1491 mutex_exit(&arg.lock); 1492 ds->ds_dbuf = NULL; 1493 ds->ds_phys = NULL; 1494 mutex_destroy(&arg.lock); 1495 cv_destroy(&arg.cv); 1496} 1497 1498static void 1499remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) 1500{ 1501 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 1502 uint64_t count; 1503 int err; 1504 1505 ASSERT(ds->ds_phys->ds_num_children >= 2); 1506 err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx); 1507 /* 1508 * The err should not be ENOENT, but a bug in a previous version 1509 * of the code could cause upgrade_clones_cb() to not set 1510 * ds_next_snap_obj when it should, leading to a missing entry. 1511 * If we knew that the pool was created after 1512 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't 1513 * ENOENT. However, at least we can check that we don't have 1514 * too many entries in the next_clones_obj even after failing to 1515 * remove this one. 1516 */ 1517 if (err != ENOENT) { 1518 VERIFY3U(err, ==, 0); 1519 } 1520 ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, 1521 &count)); 1522 ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2); 1523} 1524 1525static void 1526dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) 1527{ 1528 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 1529 zap_cursor_t zc; 1530 zap_attribute_t za; 1531 1532 /* 1533 * If it is the old version, dd_clones doesn't exist so we can't 1534 * find the clones, but deadlist_remove_key() is a no-op so it 1535 * doesn't matter. 1536 */ 1537 if (ds->ds_dir->dd_phys->dd_clones == 0) 1538 return; 1539 1540 for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones); 1541 zap_cursor_retrieve(&zc, &za) == 0; 1542 zap_cursor_advance(&zc)) { 1543 dsl_dataset_t *clone; 1544 1545 VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool, 1546 za.za_first_integer, FTAG, &clone)); 1547 if (clone->ds_dir->dd_origin_txg > mintxg) { 1548 dsl_deadlist_remove_key(&clone->ds_deadlist, 1549 mintxg, tx); 1550 dsl_dataset_remove_clones_key(clone, mintxg, tx); 1551 } 1552 dsl_dataset_rele(clone, FTAG); 1553 } 1554 zap_cursor_fini(&zc); 1555} 1556 1557struct process_old_arg { 1558 dsl_dataset_t *ds; 1559 dsl_dataset_t *ds_prev; 1560 boolean_t after_branch_point; 1561 zio_t *pio; 1562 uint64_t used, comp, uncomp; 1563}; 1564 1565static int 1566process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 1567{ 1568 struct process_old_arg *poa = arg; 1569 dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; 1570 1571 if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) { 1572 dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); 1573 if (poa->ds_prev && !poa->after_branch_point && 1574 bp->blk_birth > 1575 poa->ds_prev->ds_phys->ds_prev_snap_txg) { 1576 poa->ds_prev->ds_phys->ds_unique_bytes += 1577 bp_get_dsize_sync(dp->dp_spa, bp); 1578 } 1579 } else { 1580 poa->used += bp_get_dsize_sync(dp->dp_spa, bp); 1581 poa->comp += BP_GET_PSIZE(bp); 1582 poa->uncomp += BP_GET_UCSIZE(bp); 1583 dsl_free_sync(poa->pio, dp, tx->tx_txg, bp); 1584 } 1585 return (0); 1586} 1587 1588static void 1589process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, 1590 dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx) 1591{ 1592 struct process_old_arg poa = { 0 }; 1593 dsl_pool_t *dp = ds->ds_dir->dd_pool; 1594 objset_t *mos = dp->dp_meta_objset; 1595 1596 ASSERT(ds->ds_deadlist.dl_oldfmt); 1597 ASSERT(ds_next->ds_deadlist.dl_oldfmt); 1598 1599 poa.ds = ds; 1600 poa.ds_prev = ds_prev; 1601 poa.after_branch_point = after_branch_point; 1602 poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); 1603 VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj, 1604 process_old_cb, &poa, tx)); 1605 VERIFY3U(zio_wait(poa.pio), ==, 0); 1606 ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes); 1607 1608 /* change snapused */ 1609 dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, 1610 -poa.used, -poa.comp, -poa.uncomp, tx); 1611 1612 /* swap next's deadlist to our deadlist */ 1613 dsl_deadlist_close(&ds->ds_deadlist); 1614 dsl_deadlist_close(&ds_next->ds_deadlist); 1615 SWITCH64(ds_next->ds_phys->ds_deadlist_obj, 1616 ds->ds_phys->ds_deadlist_obj); 1617 dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); 1618 dsl_deadlist_open(&ds_next->ds_deadlist, mos, 1619 ds_next->ds_phys->ds_deadlist_obj); 1620} 1621 1622void 1623dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx) 1624{ 1625 struct dsl_ds_destroyarg *dsda = arg1; 1626 dsl_dataset_t *ds = dsda->ds; 1627 int err; 1628 int after_branch_point = FALSE; 1629 dsl_pool_t *dp = ds->ds_dir->dd_pool; 1630 objset_t *mos = dp->dp_meta_objset; 1631 dsl_dataset_t *ds_prev = NULL; 1632 boolean_t wont_destroy; 1633 uint64_t obj; 1634 1635 wont_destroy = (dsda->defer && 1636 (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)); 1637 1638 ASSERT(ds->ds_owner || wont_destroy); 1639 ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1); 1640 ASSERT(ds->ds_prev == NULL || 1641 ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); 1642 ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); 1643 1644 if (wont_destroy) { 1645 ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); 1646 dmu_buf_will_dirty(ds->ds_dbuf, tx); 1647 ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; 1648 return; 1649 } 1650 1651 /* signal any waiters that this dataset is going away */ 1652 mutex_enter(&ds->ds_lock); 1653 ds->ds_owner = dsl_reaper; 1654 cv_broadcast(&ds->ds_exclusive_cv); 1655 mutex_exit(&ds->ds_lock); 1656 1657 /* Remove our reservation */ 1658 if (ds->ds_reserved != 0) { 1659 dsl_prop_setarg_t psa; 1660 uint64_t value = 0; 1661 1662 dsl_prop_setarg_init_uint64(&psa, "refreservation", 1663 (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), 1664 &value); 1665 psa.psa_effective_value = 0; /* predict default value */ 1666 1667 dsl_dataset_set_reservation_sync(ds, &psa, tx); 1668 ASSERT3U(ds->ds_reserved, ==, 0); 1669 } 1670 1671 ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); 1672 1673 dsl_scan_ds_destroyed(ds, tx); 1674 1675 obj = ds->ds_object; 1676 1677 if (ds->ds_phys->ds_prev_snap_obj != 0) { 1678 if (ds->ds_prev) { 1679 ds_prev = ds->ds_prev; 1680 } else { 1681 VERIFY(0 == dsl_dataset_hold_obj(dp, 1682 ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev)); 1683 } 1684 after_branch_point = 1685 (ds_prev->ds_phys->ds_next_snap_obj != obj); 1686 1687 dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); 1688 if (after_branch_point && 1689 ds_prev->ds_phys->ds_next_clones_obj != 0) { 1690 remove_from_next_clones(ds_prev, obj, tx); 1691 if (ds->ds_phys->ds_next_snap_obj != 0) { 1692 VERIFY(0 == zap_add_int(mos, 1693 ds_prev->ds_phys->ds_next_clones_obj, 1694 ds->ds_phys->ds_next_snap_obj, tx)); 1695 } 1696 } 1697 if (after_branch_point && 1698 ds->ds_phys->ds_next_snap_obj == 0) { 1699 /* This clone is toast. */ 1700 ASSERT(ds_prev->ds_phys->ds_num_children > 1); 1701 ds_prev->ds_phys->ds_num_children--; 1702 1703 /* 1704 * If the clone's origin has no other clones, no 1705 * user holds, and has been marked for deferred 1706 * deletion, then we should have done the necessary 1707 * destroy setup for it. 1708 */ 1709 if (ds_prev->ds_phys->ds_num_children == 1 && 1710 ds_prev->ds_userrefs == 0 && 1711 DS_IS_DEFER_DESTROY(ds_prev)) { 1712 ASSERT3P(dsda->rm_origin, !=, NULL); 1713 } else { 1714 ASSERT3P(dsda->rm_origin, ==, NULL); 1715 } 1716 } else if (!after_branch_point) { 1717 ds_prev->ds_phys->ds_next_snap_obj = 1718 ds->ds_phys->ds_next_snap_obj; 1719 } 1720 } 1721 1722 if (dsl_dataset_is_snapshot(ds)) { 1723 dsl_dataset_t *ds_next; 1724 uint64_t old_unique; 1725 uint64_t used = 0, comp = 0, uncomp = 0; 1726 1727 VERIFY(0 == dsl_dataset_hold_obj(dp, 1728 ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); 1729 ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); 1730 1731 old_unique = ds_next->ds_phys->ds_unique_bytes; 1732 1733 dmu_buf_will_dirty(ds_next->ds_dbuf, tx); 1734 ds_next->ds_phys->ds_prev_snap_obj = 1735 ds->ds_phys->ds_prev_snap_obj; 1736 ds_next->ds_phys->ds_prev_snap_txg = 1737 ds->ds_phys->ds_prev_snap_txg; 1738 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, 1739 ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0); 1740 1741 1742 if (ds_next->ds_deadlist.dl_oldfmt) { 1743 process_old_deadlist(ds, ds_prev, ds_next, 1744 after_branch_point, tx); 1745 } else { 1746 /* Adjust prev's unique space. */ 1747 if (ds_prev && !after_branch_point) { 1748 dsl_deadlist_space_range(&ds_next->ds_deadlist, 1749 ds_prev->ds_phys->ds_prev_snap_txg, 1750 ds->ds_phys->ds_prev_snap_txg, 1751 &used, &comp, &uncomp); 1752 ds_prev->ds_phys->ds_unique_bytes += used; 1753 } 1754 1755 /* Adjust snapused. */ 1756 dsl_deadlist_space_range(&ds_next->ds_deadlist, 1757 ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, 1758 &used, &comp, &uncomp); 1759 dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, 1760 -used, -comp, -uncomp, tx); 1761 1762 /* Move blocks to be freed to pool's free list. */ 1763 dsl_deadlist_move_bpobj(&ds_next->ds_deadlist, 1764 &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg, 1765 tx); 1766 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, 1767 DD_USED_HEAD, used, comp, uncomp, tx); 1768 dsl_dir_dirty(tx->tx_pool->dp_free_dir, tx); 1769 1770 /* Merge our deadlist into next's and free it. */ 1771 dsl_deadlist_merge(&ds_next->ds_deadlist, 1772 ds->ds_phys->ds_deadlist_obj, tx); 1773 } 1774 dsl_deadlist_close(&ds->ds_deadlist); 1775 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); 1776 1777 /* Collapse range in clone heads */ 1778 dsl_dataset_remove_clones_key(ds, 1779 ds->ds_phys->ds_creation_txg, tx); 1780 1781 if (dsl_dataset_is_snapshot(ds_next)) { 1782 dsl_dataset_t *ds_nextnext; 1783 1784 /* 1785 * Update next's unique to include blocks which 1786 * were previously shared by only this snapshot 1787 * and it. Those blocks will be born after the 1788 * prev snap and before this snap, and will have 1789 * died after the next snap and before the one 1790 * after that (ie. be on the snap after next's 1791 * deadlist). 1792 */ 1793 VERIFY(0 == dsl_dataset_hold_obj(dp, 1794 ds_next->ds_phys->ds_next_snap_obj, 1795 FTAG, &ds_nextnext)); 1796 dsl_deadlist_space_range(&ds_nextnext->ds_deadlist, 1797 ds->ds_phys->ds_prev_snap_txg, 1798 ds->ds_phys->ds_creation_txg, 1799 &used, &comp, &uncomp); 1800 ds_next->ds_phys->ds_unique_bytes += used; 1801 dsl_dataset_rele(ds_nextnext, FTAG); 1802 ASSERT3P(ds_next->ds_prev, ==, NULL); 1803 1804 /* Collapse range in this head. */ 1805 dsl_dataset_t *hds; 1806 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, 1807 ds->ds_dir->dd_phys->dd_head_dataset_obj, 1808 FTAG, &hds)); 1809 dsl_deadlist_remove_key(&hds->ds_deadlist, 1810 ds->ds_phys->ds_creation_txg, tx); 1811 dsl_dataset_rele(hds, FTAG); 1812 1813 } else { 1814 ASSERT3P(ds_next->ds_prev, ==, ds); 1815 dsl_dataset_drop_ref(ds_next->ds_prev, ds_next); 1816 ds_next->ds_prev = NULL; 1817 if (ds_prev) { 1818 VERIFY(0 == dsl_dataset_get_ref(dp, 1819 ds->ds_phys->ds_prev_snap_obj, 1820 ds_next, &ds_next->ds_prev)); 1821 } 1822 1823 dsl_dataset_recalc_head_uniq(ds_next); 1824 1825 /* 1826 * Reduce the amount of our unconsmed refreservation 1827 * being charged to our parent by the amount of 1828 * new unique data we have gained. 1829 */ 1830 if (old_unique < ds_next->ds_reserved) { 1831 int64_t mrsdelta; 1832 uint64_t new_unique = 1833 ds_next->ds_phys->ds_unique_bytes; 1834 1835 ASSERT(old_unique <= new_unique); 1836 mrsdelta = MIN(new_unique - old_unique, 1837 ds_next->ds_reserved - old_unique); 1838 dsl_dir_diduse_space(ds->ds_dir, 1839 DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); 1840 } 1841 } 1842 dsl_dataset_rele(ds_next, FTAG); 1843 } else { 1844 /* 1845 * There's no next snapshot, so this is a head dataset. 1846 * Destroy the deadlist. Unless it's a clone, the 1847 * deadlist should be empty. (If it's a clone, it's 1848 * safe to ignore the deadlist contents.) 1849 */ 1850 struct killarg ka; 1851 1852 dsl_deadlist_close(&ds->ds_deadlist); 1853 dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx); 1854 ds->ds_phys->ds_deadlist_obj = 0; 1855 1856 /* 1857 * Free everything that we point to (that's born after 1858 * the previous snapshot, if we are a clone) 1859 * 1860 * NB: this should be very quick, because we already 1861 * freed all the objects in open context. 1862 */ 1863 ka.ds = ds; 1864 ka.tx = tx; 1865 err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, 1866 TRAVERSE_POST, kill_blkptr, &ka); 1867 ASSERT3U(err, ==, 0); 1868 ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || 1869 ds->ds_phys->ds_unique_bytes == 0); 1870 1871 if (ds->ds_prev != NULL) { 1872 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { 1873 VERIFY3U(0, ==, zap_remove_int(mos, 1874 ds->ds_prev->ds_dir->dd_phys->dd_clones, 1875 ds->ds_object, tx)); 1876 } 1877 dsl_dataset_rele(ds->ds_prev, ds); 1878 ds->ds_prev = ds_prev = NULL; 1879 } 1880 } 1881 1882 /* 1883 * This must be done after the dsl_traverse(), because it will 1884 * re-open the objset. 1885 */ 1886 if (ds->ds_objset) { 1887 dmu_objset_evict(ds->ds_objset); 1888 ds->ds_objset = NULL; 1889 } 1890 1891 if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { 1892 /* Erase the link in the dir */ 1893 dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); 1894 ds->ds_dir->dd_phys->dd_head_dataset_obj = 0; 1895 ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0); 1896 err = zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx); 1897 ASSERT(err == 0); 1898 } else { 1899 /* remove from snapshot namespace */ 1900 dsl_dataset_t *ds_head; 1901 ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0); 1902 VERIFY(0 == dsl_dataset_hold_obj(dp, 1903 ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head)); 1904 VERIFY(0 == dsl_dataset_get_snapname(ds)); 1905#ifdef ZFS_DEBUG 1906 { 1907 uint64_t val; 1908 1909 err = dsl_dataset_snap_lookup(ds_head, 1910 ds->ds_snapname, &val); 1911 ASSERT3U(err, ==, 0); 1912 ASSERT3U(val, ==, obj); 1913 } 1914#endif 1915 err = dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx); 1916 ASSERT(err == 0); 1917 dsl_dataset_rele(ds_head, FTAG); 1918 } 1919 1920 if (ds_prev && ds->ds_prev != ds_prev) 1921 dsl_dataset_rele(ds_prev, FTAG); 1922 1923 spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); 1924 spa_history_log_internal(LOG_DS_DESTROY, dp->dp_spa, tx, 1925 "dataset = %llu", ds->ds_object); 1926 1927 if (ds->ds_phys->ds_next_clones_obj != 0) { 1928 uint64_t count; 1929 ASSERT(0 == zap_count(mos, 1930 ds->ds_phys->ds_next_clones_obj, &count) && count == 0); 1931 VERIFY(0 == dmu_object_free(mos, 1932 ds->ds_phys->ds_next_clones_obj, tx)); 1933 } 1934 if (ds->ds_phys->ds_props_obj != 0) 1935 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); 1936 if (ds->ds_phys->ds_userrefs_obj != 0) 1937 VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); 1938 dsl_dir_close(ds->ds_dir, ds); 1939 ds->ds_dir = NULL; 1940 dsl_dataset_drain_refs(ds, tag); 1941 VERIFY(0 == dmu_object_free(mos, obj, tx)); 1942 1943 if (dsda->rm_origin) { 1944 /* 1945 * Remove the origin of the clone we just destroyed. 1946 */ 1947 struct dsl_ds_destroyarg ndsda = {0}; 1948 1949 ndsda.ds = dsda->rm_origin; 1950 dsl_dataset_destroy_sync(&ndsda, tag, tx); 1951 } 1952} 1953 1954static int 1955dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) 1956{ 1957 uint64_t asize; 1958 1959 if (!dmu_tx_is_syncing(tx)) 1960 return (0); 1961 1962 /* 1963 * If there's an fs-only reservation, any blocks that might become 1964 * owned by the snapshot dataset must be accommodated by space 1965 * outside of the reservation. 1966 */ 1967 ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds)); 1968 asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); 1969 if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) 1970 return (ENOSPC); 1971 1972 /* 1973 * Propogate any reserved space for this snapshot to other 1974 * snapshot checks in this sync group. 1975 */ 1976 if (asize > 0) 1977 dsl_dir_willuse_space(ds->ds_dir, asize, tx); 1978 1979 return (0); 1980} 1981 1982int 1983dsl_dataset_snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) 1984{ 1985 dsl_dataset_t *ds = arg1; 1986 const char *snapname = arg2; 1987 int err; 1988 uint64_t value; 1989 1990 /* 1991 * We don't allow multiple snapshots of the same txg. If there 1992 * is already one, try again. 1993 */ 1994 if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg) 1995 return (EAGAIN); 1996 1997 /* 1998 * Check for conflicting name snapshot name. 1999 */ 2000 err = dsl_dataset_snap_lookup(ds, snapname, &value); 2001 if (err == 0) 2002 return (EEXIST); 2003 if (err != ENOENT) 2004 return (err); 2005 2006 /* 2007 * Check that the dataset's name is not too long. Name consists 2008 * of the dataset's length + 1 for the @-sign + snapshot name's length 2009 */ 2010 if (dsl_dataset_namelen(ds) + 1 + strlen(snapname) >= MAXNAMELEN) 2011 return (ENAMETOOLONG); 2012 2013 err = dsl_dataset_snapshot_reserve_space(ds, tx); 2014 if (err) 2015 return (err); 2016 2017 ds->ds_trysnap_txg = tx->tx_txg; 2018 return (0); 2019} 2020 2021void 2022dsl_dataset_snapshot_sync(void *arg1, void *arg2, dmu_tx_t *tx) 2023{ 2024 dsl_dataset_t *ds = arg1; 2025 const char *snapname = arg2; 2026 dsl_pool_t *dp = ds->ds_dir->dd_pool; 2027 dmu_buf_t *dbuf; 2028 dsl_dataset_phys_t *dsphys; 2029 uint64_t dsobj, crtxg; 2030 objset_t *mos = dp->dp_meta_objset; 2031 int err; 2032 2033 ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); 2034 2035 /* 2036 * The origin's ds_creation_txg has to be < TXG_INITIAL 2037 */ 2038 if (strcmp(snapname, ORIGIN_DIR_NAME) == 0) 2039 crtxg = 1; 2040 else 2041 crtxg = tx->tx_txg; 2042 2043 dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, 2044 DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); 2045 VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); 2046 dmu_buf_will_dirty(dbuf, tx); 2047 dsphys = dbuf->db_data; 2048 bzero(dsphys, sizeof (dsl_dataset_phys_t)); 2049 dsphys->ds_dir_obj = ds->ds_dir->dd_object; 2050 dsphys->ds_fsid_guid = unique_create(); 2051 (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, 2052 sizeof (dsphys->ds_guid)); 2053 dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj; 2054 dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg; 2055 dsphys->ds_next_snap_obj = ds->ds_object; 2056 dsphys->ds_num_children = 1; 2057 dsphys->ds_creation_time = gethrestime_sec(); 2058 dsphys->ds_creation_txg = crtxg; 2059 dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; 2060 dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes; 2061 dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes; 2062 dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; 2063 dsphys->ds_flags = ds->ds_phys->ds_flags; 2064 dsphys->ds_bp = ds->ds_phys->ds_bp; 2065 dmu_buf_rele(dbuf, FTAG); 2066 2067 ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0); 2068 if (ds->ds_prev) { 2069 uint64_t next_clones_obj = 2070 ds->ds_prev->ds_phys->ds_next_clones_obj; 2071 ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj == 2072 ds->ds_object || 2073 ds->ds_prev->ds_phys->ds_num_children > 1); 2074 if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { 2075 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); 2076 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==, 2077 ds->ds_prev->ds_phys->ds_creation_txg); 2078 ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj; 2079 } else if (next_clones_obj != 0) { 2080 remove_from_next_clones(ds->ds_prev, 2081 dsphys->ds_next_snap_obj, tx); 2082 VERIFY3U(0, ==, zap_add_int(mos, 2083 next_clones_obj, dsobj, tx)); 2084 } 2085 } 2086 2087 /* 2088 * If we have a reference-reservation on this dataset, we will 2089 * need to increase the amount of refreservation being charged 2090 * since our unique space is going to zero. 2091 */ 2092 if (ds->ds_reserved) { 2093 int64_t delta; 2094 ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); 2095 delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved); 2096 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, 2097 delta, 0, 0, tx); 2098 } 2099 2100 dmu_buf_will_dirty(ds->ds_dbuf, tx); 2101 zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu", 2102 ds->ds_dir->dd_myname, snapname, dsobj, 2103 ds->ds_phys->ds_prev_snap_txg); 2104 ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist, 2105 UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx); 2106 dsl_deadlist_close(&ds->ds_deadlist); 2107 dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); 2108 dsl_deadlist_add_key(&ds->ds_deadlist, 2109 ds->ds_phys->ds_prev_snap_txg, tx); 2110 2111 ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg); 2112 ds->ds_phys->ds_prev_snap_obj = dsobj; 2113 ds->ds_phys->ds_prev_snap_txg = crtxg; 2114 ds->ds_phys->ds_unique_bytes = 0; 2115 if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) 2116 ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; 2117 2118 err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, 2119 snapname, 8, 1, &dsobj, tx); 2120 ASSERT(err == 0); 2121 2122 if (ds->ds_prev) 2123 dsl_dataset_drop_ref(ds->ds_prev, ds); 2124 VERIFY(0 == dsl_dataset_get_ref(dp, 2125 ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev)); 2126 2127 dsl_scan_ds_snapshotted(ds, tx); 2128 2129 dsl_dir_snap_cmtime_update(ds->ds_dir); 2130 2131 spa_history_log_internal(LOG_DS_SNAPSHOT, dp->dp_spa, tx, 2132 "dataset = %llu", dsobj); 2133} 2134 2135void 2136dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) 2137{ 2138 ASSERT(dmu_tx_is_syncing(tx)); 2139 ASSERT(ds->ds_objset != NULL); 2140 ASSERT(ds->ds_phys->ds_next_snap_obj == 0); 2141 2142 /* 2143 * in case we had to change ds_fsid_guid when we opened it, 2144 * sync it out now. 2145 */ 2146 dmu_buf_will_dirty(ds->ds_dbuf, tx); 2147 ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid; 2148 2149 dsl_dir_dirty(ds->ds_dir, tx); 2150 dmu_objset_sync(ds->ds_objset, zio, tx); 2151} 2152 2153void 2154dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) 2155{ 2156 uint64_t refd, avail, uobjs, aobjs, ratio; 2157 2158 dsl_dir_stats(ds->ds_dir, nv); 2159 2160 dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs); 2161 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail); 2162 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd); 2163 2164 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, 2165 ds->ds_phys->ds_creation_time); 2166 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, 2167 ds->ds_phys->ds_creation_txg); 2168 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, 2169 ds->ds_quota); 2170 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, 2171 ds->ds_reserved); 2172 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, 2173 ds->ds_phys->ds_guid); 2174 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE, 2175 ds->ds_phys->ds_unique_bytes); 2176 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, 2177 ds->ds_object); 2178 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, 2179 ds->ds_userrefs); 2180 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, 2181 DS_IS_DEFER_DESTROY(ds) ? 1 : 0); 2182 2183 ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 : 2184 (ds->ds_phys->ds_uncompressed_bytes * 100 / 2185 ds->ds_phys->ds_compressed_bytes); 2186 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio); 2187 2188 if (ds->ds_phys->ds_next_snap_obj) { 2189 /* 2190 * This is a snapshot; override the dd's space used with 2191 * our unique space and compression ratio. 2192 */ 2193 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, 2194 ds->ds_phys->ds_unique_bytes); 2195 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); 2196 } 2197} 2198 2199void 2200dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) 2201{ 2202 stat->dds_creation_txg = ds->ds_phys->ds_creation_txg; 2203 stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT; 2204 stat->dds_guid = ds->ds_phys->ds_guid; 2205 if (ds->ds_phys->ds_next_snap_obj) { 2206 stat->dds_is_snapshot = B_TRUE; 2207 stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; 2208 } else { 2209 stat->dds_is_snapshot = B_FALSE; 2210 stat->dds_num_clones = 0; 2211 } 2212 2213 /* clone origin is really a dsl_dir thing... */ 2214 rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); 2215 if (dsl_dir_is_clone(ds->ds_dir)) { 2216 dsl_dataset_t *ods; 2217 2218 VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool, 2219 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); 2220 dsl_dataset_name(ods, stat->dds_origin); 2221 dsl_dataset_drop_ref(ods, FTAG); 2222 } else { 2223 stat->dds_origin[0] = '\0'; 2224 } 2225 rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); 2226} 2227 2228uint64_t 2229dsl_dataset_fsid_guid(dsl_dataset_t *ds) 2230{ 2231 return (ds->ds_fsid_guid); 2232} 2233 2234void 2235dsl_dataset_space(dsl_dataset_t *ds, 2236 uint64_t *refdbytesp, uint64_t *availbytesp, 2237 uint64_t *usedobjsp, uint64_t *availobjsp) 2238{ 2239 *refdbytesp = ds->ds_phys->ds_used_bytes; 2240 *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); 2241 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) 2242 *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes; 2243 if (ds->ds_quota != 0) { 2244 /* 2245 * Adjust available bytes according to refquota 2246 */ 2247 if (*refdbytesp < ds->ds_quota) 2248 *availbytesp = MIN(*availbytesp, 2249 ds->ds_quota - *refdbytesp); 2250 else 2251 *availbytesp = 0; 2252 } 2253 *usedobjsp = ds->ds_phys->ds_bp.blk_fill; 2254 *availobjsp = DN_MAX_OBJECT - *usedobjsp; 2255} 2256 2257boolean_t 2258dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds) 2259{ 2260 dsl_pool_t *dp = ds->ds_dir->dd_pool; 2261 2262 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || 2263 dsl_pool_sync_context(dp)); 2264 if (ds->ds_prev == NULL) 2265 return (B_FALSE); 2266 if (ds->ds_phys->ds_bp.blk_birth > 2267 ds->ds_prev->ds_phys->ds_creation_txg) { 2268 objset_t *os, *os_prev; 2269 /* 2270 * It may be that only the ZIL differs, because it was 2271 * reset in the head. Don't count that as being 2272 * modified. 2273 */ 2274 if (dmu_objset_from_ds(ds, &os) != 0) 2275 return (B_TRUE); 2276 if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0) 2277 return (B_TRUE); 2278 return (bcmp(&os->os_phys->os_meta_dnode, 2279 &os_prev->os_phys->os_meta_dnode, 2280 sizeof (os->os_phys->os_meta_dnode)) != 0); 2281 } 2282 return (B_FALSE); 2283} 2284 2285/* ARGSUSED */ 2286static int 2287dsl_dataset_snapshot_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) 2288{ 2289 dsl_dataset_t *ds = arg1; 2290 char *newsnapname = arg2; 2291 dsl_dir_t *dd = ds->ds_dir; 2292 dsl_dataset_t *hds; 2293 uint64_t val; 2294 int err; 2295 2296 err = dsl_dataset_hold_obj(dd->dd_pool, 2297 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds); 2298 if (err) 2299 return (err); 2300 2301 /* new name better not be in use */ 2302 err = dsl_dataset_snap_lookup(hds, newsnapname, &val); 2303 dsl_dataset_rele(hds, FTAG); 2304 2305 if (err == 0) 2306 err = EEXIST; 2307 else if (err == ENOENT) 2308 err = 0; 2309 2310 /* dataset name + 1 for the "@" + the new snapshot name must fit */ 2311 if (dsl_dir_namelen(ds->ds_dir) + 1 + strlen(newsnapname) >= MAXNAMELEN) 2312 err = ENAMETOOLONG; 2313 2314 return (err); 2315} 2316 2317static void 2318dsl_dataset_snapshot_rename_sync(void *arg1, void *arg2, dmu_tx_t *tx) 2319{ 2320 char oldname[MAXPATHLEN], newname[MAXPATHLEN]; 2321 dsl_dataset_t *ds = arg1; 2322 const char *newsnapname = arg2; 2323 dsl_dir_t *dd = ds->ds_dir; 2324 objset_t *mos = dd->dd_pool->dp_meta_objset; 2325 dsl_dataset_t *hds; 2326 int err; 2327 2328 ASSERT(ds->ds_phys->ds_next_snap_obj != 0); 2329 2330 VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, 2331 dd->dd_phys->dd_head_dataset_obj, FTAG, &hds)); 2332 2333 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2334 err = dsl_dataset_snap_remove(hds, ds->ds_snapname, tx); 2335 ASSERT3U(err, ==, 0); 2336 dsl_dataset_name(ds, oldname); 2337 mutex_enter(&ds->ds_lock); 2338 (void) strcpy(ds->ds_snapname, newsnapname); 2339 mutex_exit(&ds->ds_lock); 2340 err = zap_add(mos, hds->ds_phys->ds_snapnames_zapobj, 2341 ds->ds_snapname, 8, 1, &ds->ds_object, tx); 2342 ASSERT3U(err, ==, 0); 2343 dsl_dataset_name(ds, newname); 2344#ifdef _KERNEL 2345 zvol_rename_minors(oldname, newname); 2346#endif 2347 2348 spa_history_log_internal(LOG_DS_RENAME, dd->dd_pool->dp_spa, tx, 2349 "dataset = %llu", ds->ds_object); 2350 dsl_dataset_rele(hds, FTAG); 2351} 2352 2353struct renamesnaparg { 2354 dsl_sync_task_group_t *dstg; 2355 char failed[MAXPATHLEN]; 2356 char *oldsnap; 2357 char *newsnap; 2358}; 2359 2360static int 2361dsl_snapshot_rename_one(const char *name, void *arg) 2362{ 2363 struct renamesnaparg *ra = arg; 2364 dsl_dataset_t *ds = NULL; 2365 char *snapname; 2366 int err; 2367 2368 snapname = kmem_asprintf("%s@%s", name, ra->oldsnap); 2369 (void) strlcpy(ra->failed, snapname, sizeof (ra->failed)); 2370 2371 /* 2372 * For recursive snapshot renames the parent won't be changing 2373 * so we just pass name for both the to/from argument. 2374 */ 2375 err = zfs_secpolicy_rename_perms(snapname, snapname, CRED()); 2376 if (err != 0) { 2377 strfree(snapname); 2378 return (err == ENOENT ? 0 : err); 2379 } 2380 2381#ifdef _KERNEL 2382 /* 2383 * For all filesystems undergoing rename, we'll need to unmount it. 2384 */ 2385 (void) zfs_unmount_snap(snapname, NULL); 2386#endif 2387 err = dsl_dataset_hold(snapname, ra->dstg, &ds); 2388 strfree(snapname); 2389 if (err != 0) 2390 return (err == ENOENT ? 0 : err); 2391 2392 dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, 2393 dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); 2394 2395 return (0); 2396} 2397 2398static int 2399dsl_recursive_rename(char *oldname, const char *newname) 2400{ 2401 int err; 2402 struct renamesnaparg *ra; 2403 dsl_sync_task_t *dst; 2404 spa_t *spa; 2405 char *cp, *fsname = spa_strdup(oldname); 2406 int len = strlen(oldname) + 1; 2407 2408 /* truncate the snapshot name to get the fsname */ 2409 cp = strchr(fsname, '@'); 2410 *cp = '\0'; 2411 2412 err = spa_open(fsname, &spa, FTAG); 2413 if (err) { 2414 kmem_free(fsname, len); 2415 return (err); 2416 } 2417 ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); 2418 ra->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 2419 2420 ra->oldsnap = strchr(oldname, '@') + 1; 2421 ra->newsnap = strchr(newname, '@') + 1; 2422 *ra->failed = '\0'; 2423 2424 err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra, 2425 DS_FIND_CHILDREN); 2426 kmem_free(fsname, len); 2427 2428 if (err == 0) { 2429 err = dsl_sync_task_group_wait(ra->dstg); 2430 } 2431 2432 for (dst = list_head(&ra->dstg->dstg_tasks); dst; 2433 dst = list_next(&ra->dstg->dstg_tasks, dst)) { 2434 dsl_dataset_t *ds = dst->dst_arg1; 2435 if (dst->dst_err) { 2436 dsl_dir_name(ds->ds_dir, ra->failed); 2437 (void) strlcat(ra->failed, "@", sizeof (ra->failed)); 2438 (void) strlcat(ra->failed, ra->newsnap, 2439 sizeof (ra->failed)); 2440 } 2441 dsl_dataset_rele(ds, ra->dstg); 2442 } 2443 2444 if (err) 2445 (void) strlcpy(oldname, ra->failed, sizeof (ra->failed)); 2446 2447 dsl_sync_task_group_destroy(ra->dstg); 2448 kmem_free(ra, sizeof (struct renamesnaparg)); 2449 spa_close(spa, FTAG); 2450 return (err); 2451} 2452 2453static int 2454dsl_valid_rename(const char *oldname, void *arg) 2455{ 2456 int delta = *(int *)arg; 2457 2458 if (strlen(oldname) + delta >= MAXNAMELEN) 2459 return (ENAMETOOLONG); 2460 2461 return (0); 2462} 2463 2464#pragma weak dmu_objset_rename = dsl_dataset_rename 2465int 2466dsl_dataset_rename(char *oldname, const char *newname, int flags) 2467{ 2468 dsl_dir_t *dd; 2469 dsl_dataset_t *ds; 2470 const char *tail; 2471 int err; 2472 2473 err = dsl_dir_open(oldname, FTAG, &dd, &tail); 2474 if (err) 2475 return (err); 2476 2477 if (tail == NULL) { 2478 int delta = strlen(newname) - strlen(oldname); 2479 2480 /* if we're growing, validate child name lengths */ 2481 if (delta > 0) 2482 err = dmu_objset_find(oldname, dsl_valid_rename, 2483 &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); 2484 2485 if (err == 0) 2486 err = dsl_dir_rename(dd, newname, flags); 2487 dsl_dir_close(dd, FTAG); 2488 return (err); 2489 } 2490 2491 if (tail[0] != '@') { 2492 /* the name ended in a nonexistent component */ 2493 dsl_dir_close(dd, FTAG); 2494 return (ENOENT); 2495 } 2496 2497 dsl_dir_close(dd, FTAG); 2498 2499 /* new name must be snapshot in same filesystem */ 2500 tail = strchr(newname, '@'); 2501 if (tail == NULL) 2502 return (EINVAL); 2503 tail++; 2504 if (strncmp(oldname, newname, tail - newname) != 0) 2505 return (EXDEV); 2506 2507 if (flags & ZFS_RENAME_RECURSIVE) { 2508 err = dsl_recursive_rename(oldname, newname); 2509 } else { 2510 err = dsl_dataset_hold(oldname, FTAG, &ds); 2511 if (err) 2512 return (err); 2513 2514 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 2515 dsl_dataset_snapshot_rename_check, 2516 dsl_dataset_snapshot_rename_sync, ds, (char *)tail, 1); 2517 2518 dsl_dataset_rele(ds, FTAG); 2519 } 2520 2521 return (err); 2522} 2523 2524struct promotenode { 2525 list_node_t link; 2526 dsl_dataset_t *ds; 2527}; 2528 2529struct promotearg { 2530 list_t shared_snaps, origin_snaps, clone_snaps; 2531 dsl_dataset_t *origin_origin; 2532 uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; 2533 char *err_ds; 2534}; 2535 2536static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); 2537static boolean_t snaplist_unstable(list_t *l); 2538 2539static int 2540dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) 2541{ 2542 dsl_dataset_t *hds = arg1; 2543 struct promotearg *pa = arg2; 2544 struct promotenode *snap = list_head(&pa->shared_snaps); 2545 dsl_dataset_t *origin_ds = snap->ds; 2546 int err; 2547 uint64_t unused; 2548 2549 /* Check that it is a real clone */ 2550 if (!dsl_dir_is_clone(hds->ds_dir)) 2551 return (EINVAL); 2552 2553 /* Since this is so expensive, don't do the preliminary check */ 2554 if (!dmu_tx_is_syncing(tx)) 2555 return (0); 2556 2557 if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) 2558 return (EXDEV); 2559 2560 /* compute origin's new unique space */ 2561 snap = list_tail(&pa->clone_snaps); 2562 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); 2563 dsl_deadlist_space_range(&snap->ds->ds_deadlist, 2564 origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, 2565 &pa->unique, &unused, &unused); 2566 2567 /* 2568 * Walk the snapshots that we are moving 2569 * 2570 * Compute space to transfer. Consider the incremental changes 2571 * to used for each snapshot: 2572 * (my used) = (prev's used) + (blocks born) - (blocks killed) 2573 * So each snapshot gave birth to: 2574 * (blocks born) = (my used) - (prev's used) + (blocks killed) 2575 * So a sequence would look like: 2576 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0) 2577 * Which simplifies to: 2578 * uN + kN + kN-1 + ... + k1 + k0 2579 * Note however, if we stop before we reach the ORIGIN we get: 2580 * uN + kN + kN-1 + ... + kM - uM-1 2581 */ 2582 pa->used = origin_ds->ds_phys->ds_used_bytes; 2583 pa->comp = origin_ds->ds_phys->ds_compressed_bytes; 2584 pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; 2585 for (snap = list_head(&pa->shared_snaps); snap; 2586 snap = list_next(&pa->shared_snaps, snap)) { 2587 uint64_t val, dlused, dlcomp, dluncomp; 2588 dsl_dataset_t *ds = snap->ds; 2589 2590 /* Check that the snapshot name does not conflict */ 2591 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2592 err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); 2593 if (err == 0) { 2594 err = EEXIST; 2595 goto out; 2596 } 2597 if (err != ENOENT) 2598 goto out; 2599 2600 /* The very first snapshot does not have a deadlist */ 2601 if (ds->ds_phys->ds_prev_snap_obj == 0) 2602 continue; 2603 2604 dsl_deadlist_space(&ds->ds_deadlist, 2605 &dlused, &dlcomp, &dluncomp); 2606 pa->used += dlused; 2607 pa->comp += dlcomp; 2608 pa->uncomp += dluncomp; 2609 } 2610 2611 /* 2612 * If we are a clone of a clone then we never reached ORIGIN, 2613 * so we need to subtract out the clone origin's used space. 2614 */ 2615 if (pa->origin_origin) { 2616 pa->used -= pa->origin_origin->ds_phys->ds_used_bytes; 2617 pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; 2618 pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; 2619 } 2620 2621 /* Check that there is enough space here */ 2622 err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, 2623 pa->used); 2624 if (err) 2625 return (err); 2626 2627 /* 2628 * Compute the amounts of space that will be used by snapshots 2629 * after the promotion (for both origin and clone). For each, 2630 * it is the amount of space that will be on all of their 2631 * deadlists (that was not born before their new origin). 2632 */ 2633 if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { 2634 uint64_t space; 2635 2636 /* 2637 * Note, typically this will not be a clone of a clone, 2638 * so dd_origin_txg will be < TXG_INITIAL, so 2639 * these snaplist_space() -> dsl_deadlist_space_range() 2640 * calls will be fast because they do not have to 2641 * iterate over all bps. 2642 */ 2643 snap = list_head(&pa->origin_snaps); 2644 err = snaplist_space(&pa->shared_snaps, 2645 snap->ds->ds_dir->dd_origin_txg, &pa->cloneusedsnap); 2646 if (err) 2647 return (err); 2648 2649 err = snaplist_space(&pa->clone_snaps, 2650 snap->ds->ds_dir->dd_origin_txg, &space); 2651 if (err) 2652 return (err); 2653 pa->cloneusedsnap += space; 2654 } 2655 if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { 2656 err = snaplist_space(&pa->origin_snaps, 2657 origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap); 2658 if (err) 2659 return (err); 2660 } 2661 2662 return (0); 2663out: 2664 pa->err_ds = snap->ds->ds_snapname; 2665 return (err); 2666} 2667 2668static void 2669dsl_dataset_promote_sync(void *arg1, void *arg2, dmu_tx_t *tx) 2670{ 2671 dsl_dataset_t *hds = arg1; 2672 struct promotearg *pa = arg2; 2673 struct promotenode *snap = list_head(&pa->shared_snaps); 2674 dsl_dataset_t *origin_ds = snap->ds; 2675 dsl_dataset_t *origin_head; 2676 dsl_dir_t *dd = hds->ds_dir; 2677 dsl_pool_t *dp = hds->ds_dir->dd_pool; 2678 dsl_dir_t *odd = NULL; 2679 uint64_t oldnext_obj; 2680 int64_t delta; 2681 2682 ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); 2683 2684 snap = list_head(&pa->origin_snaps); 2685 origin_head = snap->ds; 2686 2687 /* 2688 * We need to explicitly open odd, since origin_ds's dd will be 2689 * changing. 2690 */ 2691 VERIFY(0 == dsl_dir_open_obj(dp, origin_ds->ds_dir->dd_object, 2692 NULL, FTAG, &odd)); 2693 2694 /* change origin's next snap */ 2695 dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); 2696 oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj; 2697 snap = list_tail(&pa->clone_snaps); 2698 ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); 2699 origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object; 2700 2701 /* change the origin's next clone */ 2702 if (origin_ds->ds_phys->ds_next_clones_obj) { 2703 remove_from_next_clones(origin_ds, snap->ds->ds_object, tx); 2704 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, 2705 origin_ds->ds_phys->ds_next_clones_obj, 2706 oldnext_obj, tx)); 2707 } 2708 2709 /* change origin */ 2710 dmu_buf_will_dirty(dd->dd_dbuf, tx); 2711 ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object); 2712 dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj; 2713 dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg; 2714 dmu_buf_will_dirty(odd->dd_dbuf, tx); 2715 odd->dd_phys->dd_origin_obj = origin_ds->ds_object; 2716 origin_head->ds_dir->dd_origin_txg = 2717 origin_ds->ds_phys->ds_creation_txg; 2718 2719 /* change dd_clone entries */ 2720 if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { 2721 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 2722 odd->dd_phys->dd_clones, hds->ds_object, tx)); 2723 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, 2724 pa->origin_origin->ds_dir->dd_phys->dd_clones, 2725 hds->ds_object, tx)); 2726 2727 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 2728 pa->origin_origin->ds_dir->dd_phys->dd_clones, 2729 origin_head->ds_object, tx)); 2730 if (dd->dd_phys->dd_clones == 0) { 2731 dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset, 2732 DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); 2733 } 2734 VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, 2735 dd->dd_phys->dd_clones, origin_head->ds_object, tx)); 2736 2737 } 2738 2739 /* move snapshots to this dir */ 2740 for (snap = list_head(&pa->shared_snaps); snap; 2741 snap = list_next(&pa->shared_snaps, snap)) { 2742 dsl_dataset_t *ds = snap->ds; 2743 2744 /* unregister props as dsl_dir is changing */ 2745 if (ds->ds_objset) { 2746 dmu_objset_evict(ds->ds_objset); 2747 ds->ds_objset = NULL; 2748 } 2749 /* move snap name entry */ 2750 VERIFY(0 == dsl_dataset_get_snapname(ds)); 2751 VERIFY(0 == dsl_dataset_snap_remove(origin_head, 2752 ds->ds_snapname, tx)); 2753 VERIFY(0 == zap_add(dp->dp_meta_objset, 2754 hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, 2755 8, 1, &ds->ds_object, tx)); 2756 2757 /* change containing dsl_dir */ 2758 dmu_buf_will_dirty(ds->ds_dbuf, tx); 2759 ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object); 2760 ds->ds_phys->ds_dir_obj = dd->dd_object; 2761 ASSERT3P(ds->ds_dir, ==, odd); 2762 dsl_dir_close(ds->ds_dir, ds); 2763 VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object, 2764 NULL, ds, &ds->ds_dir)); 2765 2766 /* move any clone references */ 2767 if (ds->ds_phys->ds_next_clones_obj && 2768 spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { 2769 zap_cursor_t zc; 2770 zap_attribute_t za; 2771 2772 for (zap_cursor_init(&zc, dp->dp_meta_objset, 2773 ds->ds_phys->ds_next_clones_obj); 2774 zap_cursor_retrieve(&zc, &za) == 0; 2775 zap_cursor_advance(&zc)) { 2776 dsl_dataset_t *cnds; 2777 uint64_t o; 2778 2779 if (za.za_first_integer == oldnext_obj) { 2780 /* 2781 * We've already moved the 2782 * origin's reference. 2783 */ 2784 continue; 2785 } 2786 2787 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, 2788 za.za_first_integer, FTAG, &cnds)); 2789 o = cnds->ds_dir->dd_phys->dd_head_dataset_obj; 2790 2791 VERIFY3U(zap_remove_int(dp->dp_meta_objset, 2792 odd->dd_phys->dd_clones, o, tx), ==, 0); 2793 VERIFY3U(zap_add_int(dp->dp_meta_objset, 2794 dd->dd_phys->dd_clones, o, tx), ==, 0); 2795 dsl_dataset_rele(cnds, FTAG); 2796 } 2797 zap_cursor_fini(&zc); 2798 } 2799 2800 ASSERT3U(dsl_prop_numcb(ds), ==, 0); 2801 } 2802 2803 /* 2804 * Change space accounting. 2805 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either 2806 * both be valid, or both be 0 (resulting in delta == 0). This 2807 * is true for each of {clone,origin} independently. 2808 */ 2809 2810 delta = pa->cloneusedsnap - 2811 dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; 2812 ASSERT3S(delta, >=, 0); 2813 ASSERT3U(pa->used, >=, delta); 2814 dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); 2815 dsl_dir_diduse_space(dd, DD_USED_HEAD, 2816 pa->used - delta, pa->comp, pa->uncomp, tx); 2817 2818 delta = pa->originusedsnap - 2819 odd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; 2820 ASSERT3S(delta, <=, 0); 2821 ASSERT3U(pa->used, >=, -delta); 2822 dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); 2823 dsl_dir_diduse_space(odd, DD_USED_HEAD, 2824 -pa->used - delta, -pa->comp, -pa->uncomp, tx); 2825 2826 origin_ds->ds_phys->ds_unique_bytes = pa->unique; 2827 2828 /* log history record */ 2829 spa_history_log_internal(LOG_DS_PROMOTE, dd->dd_pool->dp_spa, tx, 2830 "dataset = %llu", hds->ds_object); 2831 2832 dsl_dir_close(odd, FTAG); 2833} 2834 2835static char *snaplist_tag = "snaplist"; 2836/* 2837 * Make a list of dsl_dataset_t's for the snapshots between first_obj 2838 * (exclusive) and last_obj (inclusive). The list will be in reverse 2839 * order (last_obj will be the list_head()). If first_obj == 0, do all 2840 * snapshots back to this dataset's origin. 2841 */ 2842static int 2843snaplist_make(dsl_pool_t *dp, boolean_t own, 2844 uint64_t first_obj, uint64_t last_obj, list_t *l) 2845{ 2846 uint64_t obj = last_obj; 2847 2848 ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); 2849 2850 list_create(l, sizeof (struct promotenode), 2851 offsetof(struct promotenode, link)); 2852 2853 while (obj != first_obj) { 2854 dsl_dataset_t *ds; 2855 struct promotenode *snap; 2856 int err; 2857 2858 if (own) { 2859 err = dsl_dataset_own_obj(dp, obj, 2860 0, snaplist_tag, &ds); 2861 if (err == 0) 2862 dsl_dataset_make_exclusive(ds, snaplist_tag); 2863 } else { 2864 err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds); 2865 } 2866 if (err == ENOENT) { 2867 /* lost race with snapshot destroy */ 2868 struct promotenode *last = list_tail(l); 2869 ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj); 2870 obj = last->ds->ds_phys->ds_prev_snap_obj; 2871 continue; 2872 } else if (err) { 2873 return (err); 2874 } 2875 2876 if (first_obj == 0) 2877 first_obj = ds->ds_dir->dd_phys->dd_origin_obj; 2878 2879 snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP); 2880 snap->ds = ds; 2881 list_insert_tail(l, snap); 2882 obj = ds->ds_phys->ds_prev_snap_obj; 2883 } 2884 2885 return (0); 2886} 2887 2888static int 2889snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) 2890{ 2891 struct promotenode *snap; 2892 2893 *spacep = 0; 2894 for (snap = list_head(l); snap; snap = list_next(l, snap)) { 2895 uint64_t used, comp, uncomp; 2896 dsl_deadlist_space_range(&snap->ds->ds_deadlist, 2897 mintxg, UINT64_MAX, &used, &comp, &uncomp); 2898 *spacep += used; 2899 } 2900 return (0); 2901} 2902 2903static void 2904snaplist_destroy(list_t *l, boolean_t own) 2905{ 2906 struct promotenode *snap; 2907 2908 if (!l || !list_link_active(&l->list_head)) 2909 return; 2910 2911 while ((snap = list_tail(l)) != NULL) { 2912 list_remove(l, snap); 2913 if (own) 2914 dsl_dataset_disown(snap->ds, snaplist_tag); 2915 else 2916 dsl_dataset_rele(snap->ds, snaplist_tag); 2917 kmem_free(snap, sizeof (struct promotenode)); 2918 } 2919 list_destroy(l); 2920} 2921 2922/* 2923 * Promote a clone. Nomenclature note: 2924 * "clone" or "cds": the original clone which is being promoted 2925 * "origin" or "ods": the snapshot which is originally clone's origin 2926 * "origin head" or "ohds": the dataset which is the head 2927 * (filesystem/volume) for the origin 2928 * "origin origin": the origin of the origin's filesystem (typically 2929 * NULL, indicating that the clone is not a clone of a clone). 2930 */ 2931int 2932dsl_dataset_promote(const char *name, char *conflsnap) 2933{ 2934 dsl_dataset_t *ds; 2935 dsl_dir_t *dd; 2936 dsl_pool_t *dp; 2937 dmu_object_info_t doi; 2938 struct promotearg pa = { 0 }; 2939 struct promotenode *snap; 2940 int err; 2941 2942 err = dsl_dataset_hold(name, FTAG, &ds); 2943 if (err) 2944 return (err); 2945 dd = ds->ds_dir; 2946 dp = dd->dd_pool; 2947 2948 err = dmu_object_info(dp->dp_meta_objset, 2949 ds->ds_phys->ds_snapnames_zapobj, &doi); 2950 if (err) { 2951 dsl_dataset_rele(ds, FTAG); 2952 return (err); 2953 } 2954 2955 if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) { 2956 dsl_dataset_rele(ds, FTAG); 2957 return (EINVAL); 2958 } 2959 2960 /* 2961 * We are going to inherit all the snapshots taken before our 2962 * origin (i.e., our new origin will be our parent's origin). 2963 * Take ownership of them so that we can rename them into our 2964 * namespace. 2965 */ 2966 rw_enter(&dp->dp_config_rwlock, RW_READER); 2967 2968 err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj, 2969 &pa.shared_snaps); 2970 if (err != 0) 2971 goto out; 2972 2973 err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps); 2974 if (err != 0) 2975 goto out; 2976 2977 snap = list_head(&pa.shared_snaps); 2978 ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); 2979 err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj, 2980 snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps); 2981 if (err != 0) 2982 goto out; 2983 2984 if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) { 2985 err = dsl_dataset_hold_obj(dp, 2986 snap->ds->ds_dir->dd_phys->dd_origin_obj, 2987 FTAG, &pa.origin_origin); 2988 if (err != 0) 2989 goto out; 2990 } 2991 2992out: 2993 rw_exit(&dp->dp_config_rwlock); 2994 2995 /* 2996 * Add in 128x the snapnames zapobj size, since we will be moving 2997 * a bunch of snapnames to the promoted ds, and dirtying their 2998 * bonus buffers. 2999 */ 3000 if (err == 0) { 3001 err = dsl_sync_task_do(dp, dsl_dataset_promote_check, 3002 dsl_dataset_promote_sync, ds, &pa, 3003 2 + 2 * doi.doi_physical_blocks_512); 3004 if (err && pa.err_ds && conflsnap) 3005 (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN); 3006 } 3007 3008 snaplist_destroy(&pa.shared_snaps, B_TRUE); 3009 snaplist_destroy(&pa.clone_snaps, B_FALSE); 3010 snaplist_destroy(&pa.origin_snaps, B_FALSE); 3011 if (pa.origin_origin) 3012 dsl_dataset_rele(pa.origin_origin, FTAG); 3013 dsl_dataset_rele(ds, FTAG); 3014 return (err); 3015} 3016 3017struct cloneswaparg { 3018 dsl_dataset_t *cds; /* clone dataset */ 3019 dsl_dataset_t *ohds; /* origin's head dataset */ 3020 boolean_t force; 3021 int64_t unused_refres_delta; /* change in unconsumed refreservation */ 3022}; 3023 3024/* ARGSUSED */ 3025static int 3026dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) 3027{ 3028 struct cloneswaparg *csa = arg1; 3029 3030 /* they should both be heads */ 3031 if (dsl_dataset_is_snapshot(csa->cds) || 3032 dsl_dataset_is_snapshot(csa->ohds)) 3033 return (EINVAL); 3034 3035 /* the branch point should be just before them */ 3036 if (csa->cds->ds_prev != csa->ohds->ds_prev) 3037 return (EINVAL); 3038 3039 /* cds should be the clone (unless they are unrelated) */ 3040 if (csa->cds->ds_prev != NULL && 3041 csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap && 3042 csa->ohds->ds_object != 3043 csa->cds->ds_prev->ds_phys->ds_next_snap_obj) 3044 return (EINVAL); 3045 3046 /* the clone should be a child of the origin */ 3047 if (csa->cds->ds_dir->dd_parent != csa->ohds->ds_dir) 3048 return (EINVAL); 3049 3050 /* ohds shouldn't be modified unless 'force' */ 3051 if (!csa->force && dsl_dataset_modified_since_lastsnap(csa->ohds)) 3052 return (ETXTBSY); 3053 3054 /* adjust amount of any unconsumed refreservation */ 3055 csa->unused_refres_delta = 3056 (int64_t)MIN(csa->ohds->ds_reserved, 3057 csa->ohds->ds_phys->ds_unique_bytes) - 3058 (int64_t)MIN(csa->ohds->ds_reserved, 3059 csa->cds->ds_phys->ds_unique_bytes); 3060 3061 if (csa->unused_refres_delta > 0 && 3062 csa->unused_refres_delta > 3063 dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) 3064 return (ENOSPC); 3065 3066 if (csa->ohds->ds_quota != 0 && 3067 csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota) 3068 return (EDQUOT); 3069 3070 return (0); 3071} 3072 3073/* ARGSUSED */ 3074static void 3075dsl_dataset_clone_swap_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3076{ 3077 struct cloneswaparg *csa = arg1; 3078 dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; 3079 3080 ASSERT(csa->cds->ds_reserved == 0); 3081 ASSERT(csa->ohds->ds_quota == 0 || 3082 csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota); 3083 3084 dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); 3085 dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); 3086 3087 if (csa->cds->ds_objset != NULL) { 3088 dmu_objset_evict(csa->cds->ds_objset); 3089 csa->cds->ds_objset = NULL; 3090 } 3091 3092 if (csa->ohds->ds_objset != NULL) { 3093 dmu_objset_evict(csa->ohds->ds_objset); 3094 csa->ohds->ds_objset = NULL; 3095 } 3096 3097 /* 3098 * Reset origin's unique bytes, if it exists. 3099 */ 3100 if (csa->cds->ds_prev) { 3101 dsl_dataset_t *origin = csa->cds->ds_prev; 3102 uint64_t comp, uncomp; 3103 3104 dmu_buf_will_dirty(origin->ds_dbuf, tx); 3105 dsl_deadlist_space_range(&csa->cds->ds_deadlist, 3106 origin->ds_phys->ds_prev_snap_txg, UINT64_MAX, 3107 &origin->ds_phys->ds_unique_bytes, &comp, &uncomp); 3108 } 3109 3110 /* swap blkptrs */ 3111 { 3112 blkptr_t tmp; 3113 tmp = csa->ohds->ds_phys->ds_bp; 3114 csa->ohds->ds_phys->ds_bp = csa->cds->ds_phys->ds_bp; 3115 csa->cds->ds_phys->ds_bp = tmp; 3116 } 3117 3118 /* set dd_*_bytes */ 3119 { 3120 int64_t dused, dcomp, duncomp; 3121 uint64_t cdl_used, cdl_comp, cdl_uncomp; 3122 uint64_t odl_used, odl_comp, odl_uncomp; 3123 3124 ASSERT3U(csa->cds->ds_dir->dd_phys-> 3125 dd_used_breakdown[DD_USED_SNAP], ==, 0); 3126 3127 dsl_deadlist_space(&csa->cds->ds_deadlist, 3128 &cdl_used, &cdl_comp, &cdl_uncomp); 3129 dsl_deadlist_space(&csa->ohds->ds_deadlist, 3130 &odl_used, &odl_comp, &odl_uncomp); 3131 3132 dused = csa->cds->ds_phys->ds_used_bytes + cdl_used - 3133 (csa->ohds->ds_phys->ds_used_bytes + odl_used); 3134 dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - 3135 (csa->ohds->ds_phys->ds_compressed_bytes + odl_comp); 3136 duncomp = csa->cds->ds_phys->ds_uncompressed_bytes + 3137 cdl_uncomp - 3138 (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp); 3139 3140 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD, 3141 dused, dcomp, duncomp, tx); 3142 dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD, 3143 -dused, -dcomp, -duncomp, tx); 3144 3145 /* 3146 * The difference in the space used by snapshots is the 3147 * difference in snapshot space due to the head's 3148 * deadlist (since that's the only thing that's 3149 * changing that affects the snapused). 3150 */ 3151 dsl_deadlist_space_range(&csa->cds->ds_deadlist, 3152 csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, 3153 &cdl_used, &cdl_comp, &cdl_uncomp); 3154 dsl_deadlist_space_range(&csa->ohds->ds_deadlist, 3155 csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, 3156 &odl_used, &odl_comp, &odl_uncomp); 3157 dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, 3158 DD_USED_HEAD, DD_USED_SNAP, tx); 3159 } 3160 3161 /* swap ds_*_bytes */ 3162 SWITCH64(csa->ohds->ds_phys->ds_used_bytes, 3163 csa->cds->ds_phys->ds_used_bytes); 3164 SWITCH64(csa->ohds->ds_phys->ds_compressed_bytes, 3165 csa->cds->ds_phys->ds_compressed_bytes); 3166 SWITCH64(csa->ohds->ds_phys->ds_uncompressed_bytes, 3167 csa->cds->ds_phys->ds_uncompressed_bytes); 3168 SWITCH64(csa->ohds->ds_phys->ds_unique_bytes, 3169 csa->cds->ds_phys->ds_unique_bytes); 3170 3171 /* apply any parent delta for change in unconsumed refreservation */ 3172 dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV, 3173 csa->unused_refres_delta, 0, 0, tx); 3174 3175 /* 3176 * Swap deadlists. 3177 */ 3178 dsl_deadlist_close(&csa->cds->ds_deadlist); 3179 dsl_deadlist_close(&csa->ohds->ds_deadlist); 3180 SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj, 3181 csa->cds->ds_phys->ds_deadlist_obj); 3182 dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset, 3183 csa->cds->ds_phys->ds_deadlist_obj); 3184 dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset, 3185 csa->ohds->ds_phys->ds_deadlist_obj); 3186 3187 dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx); 3188} 3189 3190/* 3191 * Swap 'clone' with its origin head datasets. Used at the end of "zfs 3192 * recv" into an existing fs to swizzle the file system to the new 3193 * version, and by "zfs rollback". Can also be used to swap two 3194 * independent head datasets if neither has any snapshots. 3195 */ 3196int 3197dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, 3198 boolean_t force) 3199{ 3200 struct cloneswaparg csa; 3201 int error; 3202 3203 ASSERT(clone->ds_owner); 3204 ASSERT(origin_head->ds_owner); 3205retry: 3206 /* 3207 * Need exclusive access for the swap. If we're swapping these 3208 * datasets back after an error, we already hold the locks. 3209 */ 3210 if (!RW_WRITE_HELD(&clone->ds_rwlock)) 3211 rw_enter(&clone->ds_rwlock, RW_WRITER); 3212 if (!RW_WRITE_HELD(&origin_head->ds_rwlock) && 3213 !rw_tryenter(&origin_head->ds_rwlock, RW_WRITER)) { 3214 rw_exit(&clone->ds_rwlock); 3215 rw_enter(&origin_head->ds_rwlock, RW_WRITER); 3216 if (!rw_tryenter(&clone->ds_rwlock, RW_WRITER)) { 3217 rw_exit(&origin_head->ds_rwlock); 3218 goto retry; 3219 } 3220 } 3221 csa.cds = clone; 3222 csa.ohds = origin_head; 3223 csa.force = force; 3224 error = dsl_sync_task_do(clone->ds_dir->dd_pool, 3225 dsl_dataset_clone_swap_check, 3226 dsl_dataset_clone_swap_sync, &csa, NULL, 9); 3227 return (error); 3228} 3229 3230/* 3231 * Given a pool name and a dataset object number in that pool, 3232 * return the name of that dataset. 3233 */ 3234int 3235dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) 3236{ 3237 spa_t *spa; 3238 dsl_pool_t *dp; 3239 dsl_dataset_t *ds; 3240 int error; 3241 3242 if ((error = spa_open(pname, &spa, FTAG)) != 0) 3243 return (error); 3244 dp = spa_get_dsl(spa); 3245 rw_enter(&dp->dp_config_rwlock, RW_READER); 3246 if ((error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds)) == 0) { 3247 dsl_dataset_name(ds, buf); 3248 dsl_dataset_rele(ds, FTAG); 3249 } 3250 rw_exit(&dp->dp_config_rwlock); 3251 spa_close(spa, FTAG); 3252 3253 return (error); 3254} 3255 3256int 3257dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, 3258 uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) 3259{ 3260 int error = 0; 3261 3262 ASSERT3S(asize, >, 0); 3263 3264 /* 3265 * *ref_rsrv is the portion of asize that will come from any 3266 * unconsumed refreservation space. 3267 */ 3268 *ref_rsrv = 0; 3269 3270 mutex_enter(&ds->ds_lock); 3271 /* 3272 * Make a space adjustment for reserved bytes. 3273 */ 3274 if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) { 3275 ASSERT3U(*used, >=, 3276 ds->ds_reserved - ds->ds_phys->ds_unique_bytes); 3277 *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes); 3278 *ref_rsrv = 3279 asize - MIN(asize, parent_delta(ds, asize + inflight)); 3280 } 3281 3282 if (!check_quota || ds->ds_quota == 0) { 3283 mutex_exit(&ds->ds_lock); 3284 return (0); 3285 } 3286 /* 3287 * If they are requesting more space, and our current estimate 3288 * is over quota, they get to try again unless the actual 3289 * on-disk is over quota and there are no pending changes (which 3290 * may free up space for us). 3291 */ 3292 if (ds->ds_phys->ds_used_bytes + inflight >= ds->ds_quota) { 3293 if (inflight > 0 || ds->ds_phys->ds_used_bytes < ds->ds_quota) 3294 error = ERESTART; 3295 else 3296 error = EDQUOT; 3297 } 3298 mutex_exit(&ds->ds_lock); 3299 3300 return (error); 3301} 3302 3303/* ARGSUSED */ 3304static int 3305dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) 3306{ 3307 dsl_dataset_t *ds = arg1; 3308 dsl_prop_setarg_t *psa = arg2; 3309 int err; 3310 3311 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA) 3312 return (ENOTSUP); 3313 3314 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) 3315 return (err); 3316 3317 if (psa->psa_effective_value == 0) 3318 return (0); 3319 3320 if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes || 3321 psa->psa_effective_value < ds->ds_reserved) 3322 return (ENOSPC); 3323 3324 return (0); 3325} 3326 3327extern void dsl_prop_set_sync(void *, void *, dmu_tx_t *); 3328 3329void 3330dsl_dataset_set_quota_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3331{ 3332 dsl_dataset_t *ds = arg1; 3333 dsl_prop_setarg_t *psa = arg2; 3334 uint64_t effective_value = psa->psa_effective_value; 3335 3336 dsl_prop_set_sync(ds, psa, tx); 3337 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); 3338 3339 if (ds->ds_quota != effective_value) { 3340 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3341 ds->ds_quota = effective_value; 3342 3343 spa_history_log_internal(LOG_DS_REFQUOTA, 3344 ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu ", 3345 (longlong_t)ds->ds_quota, ds->ds_object); 3346 } 3347} 3348 3349int 3350dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota) 3351{ 3352 dsl_dataset_t *ds; 3353 dsl_prop_setarg_t psa; 3354 int err; 3355 3356 dsl_prop_setarg_init_uint64(&psa, "refquota", source, "a); 3357 3358 err = dsl_dataset_hold(dsname, FTAG, &ds); 3359 if (err) 3360 return (err); 3361 3362 /* 3363 * If someone removes a file, then tries to set the quota, we 3364 * want to make sure the file freeing takes effect. 3365 */ 3366 txg_wait_open(ds->ds_dir->dd_pool, 0); 3367 3368 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 3369 dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, 3370 ds, &psa, 0); 3371 3372 dsl_dataset_rele(ds, FTAG); 3373 return (err); 3374} 3375 3376static int 3377dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) 3378{ 3379 dsl_dataset_t *ds = arg1; 3380 dsl_prop_setarg_t *psa = arg2; 3381 uint64_t effective_value; 3382 uint64_t unique; 3383 int err; 3384 3385 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < 3386 SPA_VERSION_REFRESERVATION) 3387 return (ENOTSUP); 3388 3389 if (dsl_dataset_is_snapshot(ds)) 3390 return (EINVAL); 3391 3392 if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) 3393 return (err); 3394 3395 effective_value = psa->psa_effective_value; 3396 3397 /* 3398 * If we are doing the preliminary check in open context, the 3399 * space estimates may be inaccurate. 3400 */ 3401 if (!dmu_tx_is_syncing(tx)) 3402 return (0); 3403 3404 mutex_enter(&ds->ds_lock); 3405 if (!DS_UNIQUE_IS_ACCURATE(ds)) 3406 dsl_dataset_recalc_head_uniq(ds); 3407 unique = ds->ds_phys->ds_unique_bytes; 3408 mutex_exit(&ds->ds_lock); 3409 3410 if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) { 3411 uint64_t delta = MAX(unique, effective_value) - 3412 MAX(unique, ds->ds_reserved); 3413 3414 if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) 3415 return (ENOSPC); 3416 if (ds->ds_quota > 0 && 3417 effective_value > ds->ds_quota) 3418 return (ENOSPC); 3419 } 3420 3421 return (0); 3422} 3423 3424static void 3425dsl_dataset_set_reservation_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3426{ 3427 dsl_dataset_t *ds = arg1; 3428 dsl_prop_setarg_t *psa = arg2; 3429 uint64_t effective_value = psa->psa_effective_value; 3430 uint64_t unique; 3431 int64_t delta; 3432 3433 dsl_prop_set_sync(ds, psa, tx); 3434 DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); 3435 3436 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3437 3438 mutex_enter(&ds->ds_dir->dd_lock); 3439 mutex_enter(&ds->ds_lock); 3440 ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); 3441 unique = ds->ds_phys->ds_unique_bytes; 3442 delta = MAX(0, (int64_t)(effective_value - unique)) - 3443 MAX(0, (int64_t)(ds->ds_reserved - unique)); 3444 ds->ds_reserved = effective_value; 3445 mutex_exit(&ds->ds_lock); 3446 3447 dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); 3448 mutex_exit(&ds->ds_dir->dd_lock); 3449 3450 spa_history_log_internal(LOG_DS_REFRESERV, 3451 ds->ds_dir->dd_pool->dp_spa, tx, "%lld dataset = %llu", 3452 (longlong_t)effective_value, ds->ds_object); 3453} 3454 3455int 3456dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, 3457 uint64_t reservation) 3458{ 3459 dsl_dataset_t *ds; 3460 dsl_prop_setarg_t psa; 3461 int err; 3462 3463 dsl_prop_setarg_init_uint64(&psa, "refreservation", source, 3464 &reservation); 3465 3466 err = dsl_dataset_hold(dsname, FTAG, &ds); 3467 if (err) 3468 return (err); 3469 3470 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 3471 dsl_dataset_set_reservation_check, 3472 dsl_dataset_set_reservation_sync, ds, &psa, 0); 3473 3474 dsl_dataset_rele(ds, FTAG); 3475 return (err); 3476} 3477 3478typedef struct zfs_hold_cleanup_arg { 3479 dsl_pool_t *dp; 3480 uint64_t dsobj; 3481 char htag[MAXNAMELEN]; 3482} zfs_hold_cleanup_arg_t; 3483 3484static void 3485dsl_dataset_user_release_onexit(void *arg) 3486{ 3487 zfs_hold_cleanup_arg_t *ca = arg; 3488 3489 (void) dsl_dataset_user_release_tmp(ca->dp, ca->dsobj, ca->htag, 3490 B_TRUE); 3491 kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); 3492} 3493 3494void 3495dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, 3496 minor_t minor) 3497{ 3498 zfs_hold_cleanup_arg_t *ca; 3499 3500 ca = kmem_alloc(sizeof (zfs_hold_cleanup_arg_t), KM_SLEEP); 3501 ca->dp = ds->ds_dir->dd_pool; 3502 ca->dsobj = ds->ds_object; 3503 (void) strlcpy(ca->htag, htag, sizeof (ca->htag)); 3504 VERIFY3U(0, ==, zfs_onexit_add_cb(minor, 3505 dsl_dataset_user_release_onexit, ca, NULL)); 3506} 3507 3508/* 3509 * If you add new checks here, you may need to add 3510 * additional checks to the "temporary" case in 3511 * snapshot_check() in dmu_objset.c. 3512 */ 3513static int 3514dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) 3515{ 3516 dsl_dataset_t *ds = arg1; 3517 struct dsl_ds_holdarg *ha = arg2; 3518 char *htag = ha->htag; 3519 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 3520 int error = 0; 3521 3522 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) 3523 return (ENOTSUP); 3524 3525 if (!dsl_dataset_is_snapshot(ds)) 3526 return (EINVAL); 3527 3528 /* tags must be unique */ 3529 mutex_enter(&ds->ds_lock); 3530 if (ds->ds_phys->ds_userrefs_obj) { 3531 error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag, 3532 8, 1, tx); 3533 if (error == 0) 3534 error = EEXIST; 3535 else if (error == ENOENT) 3536 error = 0; 3537 } 3538 mutex_exit(&ds->ds_lock); 3539 3540 if (error == 0 && ha->temphold && 3541 strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) 3542 error = E2BIG; 3543 3544 return (error); 3545} 3546 3547void 3548dsl_dataset_user_hold_sync(void *arg1, void *arg2, dmu_tx_t *tx) 3549{ 3550 dsl_dataset_t *ds = arg1; 3551 struct dsl_ds_holdarg *ha = arg2; 3552 char *htag = ha->htag; 3553 dsl_pool_t *dp = ds->ds_dir->dd_pool; 3554 objset_t *mos = dp->dp_meta_objset; 3555 uint64_t now = gethrestime_sec(); 3556 uint64_t zapobj; 3557 3558 mutex_enter(&ds->ds_lock); 3559 if (ds->ds_phys->ds_userrefs_obj == 0) { 3560 /* 3561 * This is the first user hold for this dataset. Create 3562 * the userrefs zap object. 3563 */ 3564 dmu_buf_will_dirty(ds->ds_dbuf, tx); 3565 zapobj = ds->ds_phys->ds_userrefs_obj = 3566 zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); 3567 } else { 3568 zapobj = ds->ds_phys->ds_userrefs_obj; 3569 } 3570 ds->ds_userrefs++; 3571 mutex_exit(&ds->ds_lock); 3572 3573 VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx)); 3574 3575 if (ha->temphold) { 3576 VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object, 3577 htag, &now, tx)); 3578 } 3579 3580 spa_history_log_internal(LOG_DS_USER_HOLD, 3581 dp->dp_spa, tx, "<%s> temp = %d dataset = %llu", htag, 3582 (int)ha->temphold, ds->ds_object); 3583} 3584 3585static int 3586dsl_dataset_user_hold_one(const char *dsname, void *arg) 3587{ 3588 struct dsl_ds_holdarg *ha = arg; 3589 dsl_dataset_t *ds; 3590 int error; 3591 char *name; 3592 3593 /* alloc a buffer to hold dsname@snapname plus terminating NULL */ 3594 name = kmem_asprintf("%s@%s", dsname, ha->snapname); 3595 error = dsl_dataset_hold(name, ha->dstg, &ds); 3596 strfree(name); 3597 if (error == 0) { 3598 ha->gotone = B_TRUE; 3599 dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check, 3600 dsl_dataset_user_hold_sync, ds, ha, 0); 3601 } else if (error == ENOENT && ha->recursive) { 3602 error = 0; 3603 } else { 3604 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3605 } 3606 return (error); 3607} 3608 3609int 3610dsl_dataset_user_hold_for_send(dsl_dataset_t *ds, char *htag, 3611 boolean_t temphold) 3612{ 3613 struct dsl_ds_holdarg *ha; 3614 int error; 3615 3616 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); 3617 ha->htag = htag; 3618 ha->temphold = temphold; 3619 error = dsl_sync_task_do(ds->ds_dir->dd_pool, 3620 dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync, 3621 ds, ha, 0); 3622 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3623 3624 return (error); 3625} 3626 3627int 3628dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, 3629 boolean_t recursive, boolean_t temphold, int cleanup_fd) 3630{ 3631 struct dsl_ds_holdarg *ha; 3632 dsl_sync_task_t *dst; 3633 spa_t *spa; 3634 int error; 3635 minor_t minor = 0; 3636 3637 if (cleanup_fd != -1) { 3638 /* Currently we only support cleanup-on-exit of tempholds. */ 3639 if (!temphold) 3640 return (EINVAL); 3641 error = zfs_onexit_fd_hold(cleanup_fd, &minor); 3642 if (error) 3643 return (error); 3644 } 3645 3646 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); 3647 3648 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3649 3650 error = spa_open(dsname, &spa, FTAG); 3651 if (error) { 3652 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3653 if (cleanup_fd != -1) 3654 zfs_onexit_fd_rele(cleanup_fd); 3655 return (error); 3656 } 3657 3658 ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 3659 ha->htag = htag; 3660 ha->snapname = snapname; 3661 ha->recursive = recursive; 3662 ha->temphold = temphold; 3663 3664 if (recursive) { 3665 error = dmu_objset_find(dsname, dsl_dataset_user_hold_one, 3666 ha, DS_FIND_CHILDREN); 3667 } else { 3668 error = dsl_dataset_user_hold_one(dsname, ha); 3669 } 3670 if (error == 0) 3671 error = dsl_sync_task_group_wait(ha->dstg); 3672 3673 for (dst = list_head(&ha->dstg->dstg_tasks); dst; 3674 dst = list_next(&ha->dstg->dstg_tasks, dst)) { 3675 dsl_dataset_t *ds = dst->dst_arg1; 3676 3677 if (dst->dst_err) { 3678 dsl_dataset_name(ds, ha->failed); 3679 *strchr(ha->failed, '@') = '\0'; 3680 } else if (error == 0 && minor != 0 && temphold) { 3681 /* 3682 * If this hold is to be released upon process exit, 3683 * register that action now. 3684 */ 3685 dsl_register_onexit_hold_cleanup(ds, htag, minor); 3686 } 3687 dsl_dataset_rele(ds, ha->dstg); 3688 } 3689 3690 if (error == 0 && recursive && !ha->gotone) 3691 error = ENOENT; 3692 3693 if (error) 3694 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); 3695 3696 dsl_sync_task_group_destroy(ha->dstg); 3697 3698 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3699 spa_close(spa, FTAG); 3700 if (cleanup_fd != -1) 3701 zfs_onexit_fd_rele(cleanup_fd); 3702 return (error); 3703} 3704 3705struct dsl_ds_releasearg { 3706 dsl_dataset_t *ds; 3707 const char *htag; 3708 boolean_t own; /* do we own or just hold ds? */ 3709}; 3710 3711static int 3712dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag, 3713 boolean_t *might_destroy) 3714{ 3715 objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; 3716 uint64_t zapobj; 3717 uint64_t tmp; 3718 int error; 3719 3720 *might_destroy = B_FALSE; 3721 3722 mutex_enter(&ds->ds_lock); 3723 zapobj = ds->ds_phys->ds_userrefs_obj; 3724 if (zapobj == 0) { 3725 /* The tag can't possibly exist */ 3726 mutex_exit(&ds->ds_lock); 3727 return (ESRCH); 3728 } 3729 3730 /* Make sure the tag exists */ 3731 error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp); 3732 if (error) { 3733 mutex_exit(&ds->ds_lock); 3734 if (error == ENOENT) 3735 error = ESRCH; 3736 return (error); 3737 } 3738 3739 if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 && 3740 DS_IS_DEFER_DESTROY(ds)) 3741 *might_destroy = B_TRUE; 3742 3743 mutex_exit(&ds->ds_lock); 3744 return (0); 3745} 3746 3747static int 3748dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) 3749{ 3750 struct dsl_ds_releasearg *ra = arg1; 3751 dsl_dataset_t *ds = ra->ds; 3752 boolean_t might_destroy; 3753 int error; 3754 3755 if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) 3756 return (ENOTSUP); 3757 3758 error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy); 3759 if (error) 3760 return (error); 3761 3762 if (might_destroy) { 3763 struct dsl_ds_destroyarg dsda = {0}; 3764 3765 if (dmu_tx_is_syncing(tx)) { 3766 /* 3767 * If we're not prepared to remove the snapshot, 3768 * we can't allow the release to happen right now. 3769 */ 3770 if (!ra->own) 3771 return (EBUSY); 3772 } 3773 dsda.ds = ds; 3774 dsda.releasing = B_TRUE; 3775 return (dsl_dataset_destroy_check(&dsda, tag, tx)); 3776 } 3777 3778 return (0); 3779} 3780 3781static void 3782dsl_dataset_user_release_sync(void *arg1, void *tag, dmu_tx_t *tx) 3783{ 3784 struct dsl_ds_releasearg *ra = arg1; 3785 dsl_dataset_t *ds = ra->ds; 3786 dsl_pool_t *dp = ds->ds_dir->dd_pool; 3787 objset_t *mos = dp->dp_meta_objset; 3788 uint64_t zapobj; 3789 uint64_t dsobj = ds->ds_object; 3790 uint64_t refs; 3791 int error; 3792 3793 mutex_enter(&ds->ds_lock); 3794 ds->ds_userrefs--; 3795 refs = ds->ds_userrefs; 3796 mutex_exit(&ds->ds_lock); 3797 error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx); 3798 VERIFY(error == 0 || error == ENOENT); 3799 zapobj = ds->ds_phys->ds_userrefs_obj; 3800 VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx)); 3801 if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 && 3802 DS_IS_DEFER_DESTROY(ds)) { 3803 struct dsl_ds_destroyarg dsda = {0}; 3804 3805 ASSERT(ra->own); 3806 dsda.ds = ds; 3807 dsda.releasing = B_TRUE; 3808 /* We already did the destroy_check */ 3809 dsl_dataset_destroy_sync(&dsda, tag, tx); 3810 } 3811 3812 spa_history_log_internal(LOG_DS_USER_RELEASE, 3813 dp->dp_spa, tx, "<%s> %lld dataset = %llu", 3814 ra->htag, (longlong_t)refs, dsobj); 3815} 3816 3817static int 3818dsl_dataset_user_release_one(const char *dsname, void *arg) 3819{ 3820 struct dsl_ds_holdarg *ha = arg; 3821 struct dsl_ds_releasearg *ra; 3822 dsl_dataset_t *ds; 3823 int error; 3824 void *dtag = ha->dstg; 3825 char *name; 3826 boolean_t own = B_FALSE; 3827 boolean_t might_destroy; 3828 3829 /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */ 3830 name = kmem_asprintf("%s@%s", dsname, ha->snapname); 3831 error = dsl_dataset_hold(name, dtag, &ds); 3832 strfree(name); 3833 if (error == ENOENT && ha->recursive) 3834 return (0); 3835 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3836 if (error) 3837 return (error); 3838 3839 ha->gotone = B_TRUE; 3840 3841 ASSERT(dsl_dataset_is_snapshot(ds)); 3842 3843 error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy); 3844 if (error) { 3845 dsl_dataset_rele(ds, dtag); 3846 return (error); 3847 } 3848 3849 if (might_destroy) { 3850#ifdef _KERNEL 3851 name = kmem_asprintf("%s@%s", dsname, ha->snapname); 3852 error = zfs_unmount_snap(name, NULL); 3853 strfree(name); 3854 if (error) { 3855 dsl_dataset_rele(ds, dtag); 3856 return (error); 3857 } 3858#endif 3859 if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) { 3860 dsl_dataset_rele(ds, dtag); 3861 return (EBUSY); 3862 } else { 3863 own = B_TRUE; 3864 dsl_dataset_make_exclusive(ds, dtag); 3865 } 3866 } 3867 3868 ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP); 3869 ra->ds = ds; 3870 ra->htag = ha->htag; 3871 ra->own = own; 3872 dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check, 3873 dsl_dataset_user_release_sync, ra, dtag, 0); 3874 3875 return (0); 3876} 3877 3878int 3879dsl_dataset_user_release(char *dsname, char *snapname, char *htag, 3880 boolean_t recursive) 3881{ 3882 struct dsl_ds_holdarg *ha; 3883 dsl_sync_task_t *dst; 3884 spa_t *spa; 3885 int error; 3886 3887top: 3888 ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); 3889 3890 (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); 3891 3892 error = spa_open(dsname, &spa, FTAG); 3893 if (error) { 3894 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3895 return (error); 3896 } 3897 3898 ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); 3899 ha->htag = htag; 3900 ha->snapname = snapname; 3901 ha->recursive = recursive; 3902 if (recursive) { 3903 error = dmu_objset_find(dsname, dsl_dataset_user_release_one, 3904 ha, DS_FIND_CHILDREN); 3905 } else { 3906 error = dsl_dataset_user_release_one(dsname, ha); 3907 } 3908 if (error == 0) 3909 error = dsl_sync_task_group_wait(ha->dstg); 3910 3911 for (dst = list_head(&ha->dstg->dstg_tasks); dst; 3912 dst = list_next(&ha->dstg->dstg_tasks, dst)) { 3913 struct dsl_ds_releasearg *ra = dst->dst_arg1; 3914 dsl_dataset_t *ds = ra->ds; 3915 3916 if (dst->dst_err) 3917 dsl_dataset_name(ds, ha->failed); 3918 3919 if (ra->own) 3920 dsl_dataset_disown(ds, ha->dstg); 3921 else 3922 dsl_dataset_rele(ds, ha->dstg); 3923 3924 kmem_free(ra, sizeof (struct dsl_ds_releasearg)); 3925 } 3926 3927 if (error == 0 && recursive && !ha->gotone) 3928 error = ENOENT; 3929 3930 if (error && error != EBUSY) 3931 (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); 3932 3933 dsl_sync_task_group_destroy(ha->dstg); 3934 kmem_free(ha, sizeof (struct dsl_ds_holdarg)); 3935 spa_close(spa, FTAG); 3936 3937 /* 3938 * We can get EBUSY if we were racing with deferred destroy and 3939 * dsl_dataset_user_release_check() hadn't done the necessary 3940 * open context setup. We can also get EBUSY if we're racing 3941 * with destroy and that thread is the ds_owner. Either way 3942 * the busy condition should be transient, and we should retry 3943 * the release operation. 3944 */ 3945 if (error == EBUSY) 3946 goto top; 3947 3948 return (error); 3949} 3950 3951/* 3952 * Called at spa_load time (with retry == B_FALSE) to release a stale 3953 * temporary user hold. Also called by the onexit code (with retry == B_TRUE). 3954 */ 3955int 3956dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag, 3957 boolean_t retry) 3958{ 3959 dsl_dataset_t *ds; 3960 char *snap; 3961 char *name; 3962 int namelen; 3963 int error; 3964 3965 do { 3966 rw_enter(&dp->dp_config_rwlock, RW_READER); 3967 error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); 3968 rw_exit(&dp->dp_config_rwlock); 3969 if (error) 3970 return (error); 3971 namelen = dsl_dataset_namelen(ds)+1; 3972 name = kmem_alloc(namelen, KM_SLEEP); 3973 dsl_dataset_name(ds, name); 3974 dsl_dataset_rele(ds, FTAG); 3975 3976 snap = strchr(name, '@'); 3977 *snap = '\0'; 3978 ++snap; 3979 error = dsl_dataset_user_release(name, snap, htag, B_FALSE); 3980 kmem_free(name, namelen); 3981 3982 /* 3983 * The object can't have been destroyed because we have a hold, 3984 * but it might have been renamed, resulting in ENOENT. Retry 3985 * if we've been requested to do so. 3986 * 3987 * It would be nice if we could use the dsobj all the way 3988 * through and avoid ENOENT entirely. But we might need to 3989 * unmount the snapshot, and there's currently no way to lookup 3990 * a vfsp using a ZFS object id. 3991 */ 3992 } while ((error == ENOENT) && retry); 3993 3994 return (error); 3995} 3996 3997int 3998dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp) 3999{ 4000 dsl_dataset_t *ds; 4001 int err; 4002 4003 err = dsl_dataset_hold(dsname, FTAG, &ds); 4004 if (err) 4005 return (err); 4006 4007 VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP)); 4008 if (ds->ds_phys->ds_userrefs_obj != 0) { 4009 zap_attribute_t *za; 4010 zap_cursor_t zc; 4011 4012 za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); 4013 for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, 4014 ds->ds_phys->ds_userrefs_obj); 4015 zap_cursor_retrieve(&zc, za) == 0; 4016 zap_cursor_advance(&zc)) { 4017 VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name, 4018 za->za_first_integer)); 4019 } 4020 zap_cursor_fini(&zc); 4021 kmem_free(za, sizeof (zap_attribute_t)); 4022 } 4023 dsl_dataset_rele(ds, FTAG); 4024 return (0); 4025} 4026 4027/* 4028 * Note, this fuction is used as the callback for dmu_objset_find(). We 4029 * always return 0 so that we will continue to find and process 4030 * inconsistent datasets, even if we encounter an error trying to 4031 * process one of them. 4032 */ 4033/* ARGSUSED */ 4034int 4035dsl_destroy_inconsistent(const char *dsname, void *arg) 4036{ 4037 dsl_dataset_t *ds; 4038 4039 if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) { 4040 if (DS_IS_INCONSISTENT(ds)) 4041 (void) dsl_dataset_destroy(ds, FTAG, B_FALSE); 4042 else 4043 dsl_dataset_disown(ds, FTAG); 4044 } 4045 return (0); 4046} 4047