dsl_dataset.c revision 256281
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2013 by Delphix. All rights reserved.
24 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
25 * Portions Copyright (c) 2011 Martin Matuska <mm@FreeBSD.org>
26 */
27
28#include <sys/dmu_objset.h>
29#include <sys/dsl_dataset.h>
30#include <sys/dsl_dir.h>
31#include <sys/dsl_prop.h>
32#include <sys/dsl_synctask.h>
33#include <sys/dmu_traverse.h>
34#include <sys/dmu_impl.h>
35#include <sys/dmu_tx.h>
36#include <sys/arc.h>
37#include <sys/zio.h>
38#include <sys/zap.h>
39#include <sys/zfeature.h>
40#include <sys/unique.h>
41#include <sys/zfs_context.h>
42#include <sys/zfs_ioctl.h>
43#include <sys/spa.h>
44#include <sys/zfs_znode.h>
45#include <sys/zfs_onexit.h>
46#include <sys/zvol.h>
47#include <sys/dsl_scan.h>
48#include <sys/dsl_deadlist.h>
49#include <sys/dsl_destroy.h>
50#include <sys/dsl_userhold.h>
51
/*
 * Exchange the values of two uint64_t lvalues.
 *
 * The expansion is wrapped in do { } while (0) so that "SWITCH64(a, b);"
 * is exactly one statement and can be used as the body of an unbraced
 * if/else.  Each argument is evaluated twice, so arguments must not have
 * side effects.
 */
#define	SWITCH64(x, y) \
	do { \
		uint64_t switch64_tmp = (x); \
		(x) = (y); \
		(y) = switch64_tmp; \
	} while (0)

#define	DS_REF_MAX	(1ULL << 62)

#define	DSL_DEADLIST_BLOCKSIZE	SPA_MAXBLOCKSIZE
62
63/*
64 * Figure out how much of this delta should be propogated to the dsl_dir
65 * layer.  If there's a refreservation, that space has already been
66 * partially accounted for in our ancestors.
67 */
68static int64_t
69parent_delta(dsl_dataset_t *ds, int64_t delta)
70{
71	uint64_t old_bytes, new_bytes;
72
73	if (ds->ds_reserved == 0)
74		return (delta);
75
76	old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
77	new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
78
79	ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
80	return (new_bytes - old_bytes);
81}
82
83void
84dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
85{
86	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
87	int compressed = BP_GET_PSIZE(bp);
88	int uncompressed = BP_GET_UCSIZE(bp);
89	int64_t delta;
90
91	dprintf_bp(bp, "ds=%p", ds);
92
93	ASSERT(dmu_tx_is_syncing(tx));
94	/* It could have been compressed away to nothing */
95	if (BP_IS_HOLE(bp))
96		return;
97	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
98	ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
99	if (ds == NULL) {
100		dsl_pool_mos_diduse_space(tx->tx_pool,
101		    used, compressed, uncompressed);
102		return;
103	}
104
105	dmu_buf_will_dirty(ds->ds_dbuf, tx);
106	mutex_enter(&ds->ds_lock);
107	delta = parent_delta(ds, used);
108	ds->ds_phys->ds_referenced_bytes += used;
109	ds->ds_phys->ds_compressed_bytes += compressed;
110	ds->ds_phys->ds_uncompressed_bytes += uncompressed;
111	ds->ds_phys->ds_unique_bytes += used;
112	mutex_exit(&ds->ds_lock);
113	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
114	    compressed, uncompressed, tx);
115	dsl_dir_transfer_space(ds->ds_dir, used - delta,
116	    DD_USED_REFRSRV, DD_USED_HEAD, tx);
117}
118
119int
120dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
121    boolean_t async)
122{
123	if (BP_IS_HOLE(bp))
124		return (0);
125
126	ASSERT(dmu_tx_is_syncing(tx));
127	ASSERT(bp->blk_birth <= tx->tx_txg);
128
129	int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
130	int compressed = BP_GET_PSIZE(bp);
131	int uncompressed = BP_GET_UCSIZE(bp);
132
133	ASSERT(used > 0);
134	if (ds == NULL) {
135		dsl_free(tx->tx_pool, tx->tx_txg, bp);
136		dsl_pool_mos_diduse_space(tx->tx_pool,
137		    -used, -compressed, -uncompressed);
138		return (used);
139	}
140	ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
141
142	ASSERT(!dsl_dataset_is_snapshot(ds));
143	dmu_buf_will_dirty(ds->ds_dbuf, tx);
144
145	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
146		int64_t delta;
147
148		dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
149		dsl_free(tx->tx_pool, tx->tx_txg, bp);
150
151		mutex_enter(&ds->ds_lock);
152		ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
153		    !DS_UNIQUE_IS_ACCURATE(ds));
154		delta = parent_delta(ds, -used);
155		ds->ds_phys->ds_unique_bytes -= used;
156		mutex_exit(&ds->ds_lock);
157		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
158		    delta, -compressed, -uncompressed, tx);
159		dsl_dir_transfer_space(ds->ds_dir, -used - delta,
160		    DD_USED_REFRSRV, DD_USED_HEAD, tx);
161	} else {
162		dprintf_bp(bp, "putting on dead list: %s", "");
163		if (async) {
164			/*
165			 * We are here as part of zio's write done callback,
166			 * which means we're a zio interrupt thread.  We can't
167			 * call dsl_deadlist_insert() now because it may block
168			 * waiting for I/O.  Instead, put bp on the deferred
169			 * queue and let dsl_pool_sync() finish the job.
170			 */
171			bplist_append(&ds->ds_pending_deadlist, bp);
172		} else {
173			dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
174		}
175		ASSERT3U(ds->ds_prev->ds_object, ==,
176		    ds->ds_phys->ds_prev_snap_obj);
177		ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
178		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
179		if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
180		    ds->ds_object && bp->blk_birth >
181		    ds->ds_prev->ds_phys->ds_prev_snap_txg) {
182			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
183			mutex_enter(&ds->ds_prev->ds_lock);
184			ds->ds_prev->ds_phys->ds_unique_bytes += used;
185			mutex_exit(&ds->ds_prev->ds_lock);
186		}
187		if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
188			dsl_dir_transfer_space(ds->ds_dir, used,
189			    DD_USED_HEAD, DD_USED_SNAP, tx);
190		}
191	}
192	mutex_enter(&ds->ds_lock);
193	ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used);
194	ds->ds_phys->ds_referenced_bytes -= used;
195	ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
196	ds->ds_phys->ds_compressed_bytes -= compressed;
197	ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
198	ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
199	mutex_exit(&ds->ds_lock);
200
201	return (used);
202}
203
204uint64_t
205dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
206{
207	uint64_t trysnap = 0;
208
209	if (ds == NULL)
210		return (0);
211	/*
212	 * The snapshot creation could fail, but that would cause an
213	 * incorrect FALSE return, which would only result in an
214	 * overestimation of the amount of space that an operation would
215	 * consume, which is OK.
216	 *
217	 * There's also a small window where we could miss a pending
218	 * snapshot, because we could set the sync task in the quiescing
219	 * phase.  So this should only be used as a guess.
220	 */
221	if (ds->ds_trysnap_txg >
222	    spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
223		trysnap = ds->ds_trysnap_txg;
224	return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
225}
226
227boolean_t
228dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
229    uint64_t blk_birth)
230{
231	if (blk_birth <= dsl_dataset_prev_snap_txg(ds))
232		return (B_FALSE);
233
234	ddt_prefetch(dsl_dataset_get_spa(ds), bp);
235
236	return (B_TRUE);
237}
238
239/* ARGSUSED */
240static void
241dsl_dataset_evict(dmu_buf_t *db, void *dsv)
242{
243	dsl_dataset_t *ds = dsv;
244
245	ASSERT(ds->ds_owner == NULL);
246
247	unique_remove(ds->ds_fsid_guid);
248
249	if (ds->ds_objset != NULL)
250		dmu_objset_evict(ds->ds_objset);
251
252	if (ds->ds_prev) {
253		dsl_dataset_rele(ds->ds_prev, ds);
254		ds->ds_prev = NULL;
255	}
256
257	bplist_destroy(&ds->ds_pending_deadlist);
258	if (ds->ds_phys->ds_deadlist_obj != 0)
259		dsl_deadlist_close(&ds->ds_deadlist);
260	if (ds->ds_dir)
261		dsl_dir_rele(ds->ds_dir, ds);
262
263	ASSERT(!list_link_active(&ds->ds_synced_link));
264
265	if (mutex_owned(&ds->ds_lock))
266		mutex_exit(&ds->ds_lock);
267	mutex_destroy(&ds->ds_lock);
268	if (mutex_owned(&ds->ds_opening_lock))
269		mutex_exit(&ds->ds_opening_lock);
270	mutex_destroy(&ds->ds_opening_lock);
271	refcount_destroy(&ds->ds_longholds);
272
273	kmem_free(ds, sizeof (dsl_dataset_t));
274}
275
276int
277dsl_dataset_get_snapname(dsl_dataset_t *ds)
278{
279	dsl_dataset_phys_t *headphys;
280	int err;
281	dmu_buf_t *headdbuf;
282	dsl_pool_t *dp = ds->ds_dir->dd_pool;
283	objset_t *mos = dp->dp_meta_objset;
284
285	if (ds->ds_snapname[0])
286		return (0);
287	if (ds->ds_phys->ds_next_snap_obj == 0)
288		return (0);
289
290	err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
291	    FTAG, &headdbuf);
292	if (err != 0)
293		return (err);
294	headphys = headdbuf->db_data;
295	err = zap_value_search(dp->dp_meta_objset,
296	    headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
297	dmu_buf_rele(headdbuf, FTAG);
298	return (err);
299}
300
301int
302dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
303{
304	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
305	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
306	matchtype_t mt;
307	int err;
308
309	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
310		mt = MT_FIRST;
311	else
312		mt = MT_EXACT;
313
314	err = zap_lookup_norm(mos, snapobj, name, 8, 1,
315	    value, mt, NULL, 0, NULL);
316	if (err == ENOTSUP && mt == MT_FIRST)
317		err = zap_lookup(mos, snapobj, name, 8, 1, value);
318	return (err);
319}
320
321int
322dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx)
323{
324	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
325	uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
326	matchtype_t mt;
327	int err;
328
329	dsl_dir_snap_cmtime_update(ds->ds_dir);
330
331	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
332		mt = MT_FIRST;
333	else
334		mt = MT_EXACT;
335
336	err = zap_remove_norm(mos, snapobj, name, mt, tx);
337	if (err == ENOTSUP && mt == MT_FIRST)
338		err = zap_remove(mos, snapobj, name, tx);
339	return (err);
340}
341
342int
343dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
344    dsl_dataset_t **dsp)
345{
346	objset_t *mos = dp->dp_meta_objset;
347	dmu_buf_t *dbuf;
348	dsl_dataset_t *ds;
349	int err;
350	dmu_object_info_t doi;
351
352	ASSERT(dsl_pool_config_held(dp));
353
354	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
355	if (err != 0)
356		return (err);
357
358	/* Make sure dsobj has the correct object type. */
359	dmu_object_info_from_db(dbuf, &doi);
360	if (doi.doi_type != DMU_OT_DSL_DATASET) {
361		dmu_buf_rele(dbuf, tag);
362		return (SET_ERROR(EINVAL));
363	}
364
365	ds = dmu_buf_get_user(dbuf);
366	if (ds == NULL) {
367		dsl_dataset_t *winner = NULL;
368
369		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
370		ds->ds_dbuf = dbuf;
371		ds->ds_object = dsobj;
372		ds->ds_phys = dbuf->db_data;
373
374		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
375		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
376		mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
377		refcount_create(&ds->ds_longholds);
378
379		bplist_create(&ds->ds_pending_deadlist);
380		dsl_deadlist_open(&ds->ds_deadlist,
381		    mos, ds->ds_phys->ds_deadlist_obj);
382
383		list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
384		    offsetof(dmu_sendarg_t, dsa_link));
385
386		if (err == 0) {
387			err = dsl_dir_hold_obj(dp,
388			    ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
389		}
390		if (err != 0) {
391			mutex_destroy(&ds->ds_lock);
392			mutex_destroy(&ds->ds_opening_lock);
393			refcount_destroy(&ds->ds_longholds);
394			bplist_destroy(&ds->ds_pending_deadlist);
395			dsl_deadlist_close(&ds->ds_deadlist);
396			kmem_free(ds, sizeof (dsl_dataset_t));
397			dmu_buf_rele(dbuf, tag);
398			return (err);
399		}
400
401		if (!dsl_dataset_is_snapshot(ds)) {
402			ds->ds_snapname[0] = '\0';
403			if (ds->ds_phys->ds_prev_snap_obj != 0) {
404				err = dsl_dataset_hold_obj(dp,
405				    ds->ds_phys->ds_prev_snap_obj,
406				    ds, &ds->ds_prev);
407			}
408		} else {
409			if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
410				err = dsl_dataset_get_snapname(ds);
411			if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
412				err = zap_count(
413				    ds->ds_dir->dd_pool->dp_meta_objset,
414				    ds->ds_phys->ds_userrefs_obj,
415				    &ds->ds_userrefs);
416			}
417		}
418
419		if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
420			err = dsl_prop_get_int_ds(ds,
421			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
422			    &ds->ds_reserved);
423			if (err == 0) {
424				err = dsl_prop_get_int_ds(ds,
425				    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
426				    &ds->ds_quota);
427			}
428		} else {
429			ds->ds_reserved = ds->ds_quota = 0;
430		}
431
432		if (err != 0 || (winner = dmu_buf_set_user_ie(dbuf, ds,
433		    &ds->ds_phys, dsl_dataset_evict)) != NULL) {
434			bplist_destroy(&ds->ds_pending_deadlist);
435			dsl_deadlist_close(&ds->ds_deadlist);
436			if (ds->ds_prev)
437				dsl_dataset_rele(ds->ds_prev, ds);
438			dsl_dir_rele(ds->ds_dir, ds);
439			mutex_destroy(&ds->ds_lock);
440			mutex_destroy(&ds->ds_opening_lock);
441			refcount_destroy(&ds->ds_longholds);
442			kmem_free(ds, sizeof (dsl_dataset_t));
443			if (err != 0) {
444				dmu_buf_rele(dbuf, tag);
445				return (err);
446			}
447			ds = winner;
448		} else {
449			ds->ds_fsid_guid =
450			    unique_insert(ds->ds_phys->ds_fsid_guid);
451		}
452	}
453	ASSERT3P(ds->ds_dbuf, ==, dbuf);
454	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
455	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
456	    spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
457	    dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
458	*dsp = ds;
459	return (0);
460}
461
462int
463dsl_dataset_hold(dsl_pool_t *dp, const char *name,
464    void *tag, dsl_dataset_t **dsp)
465{
466	dsl_dir_t *dd;
467	const char *snapname;
468	uint64_t obj;
469	int err = 0;
470
471	err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
472	if (err != 0)
473		return (err);
474
475	ASSERT(dsl_pool_config_held(dp));
476	obj = dd->dd_phys->dd_head_dataset_obj;
477	if (obj != 0)
478		err = dsl_dataset_hold_obj(dp, obj, tag, dsp);
479	else
480		err = SET_ERROR(ENOENT);
481
482	/* we may be looking for a snapshot */
483	if (err == 0 && snapname != NULL) {
484		dsl_dataset_t *ds;
485
486		if (*snapname++ != '@') {
487			dsl_dataset_rele(*dsp, tag);
488			dsl_dir_rele(dd, FTAG);
489			return (SET_ERROR(ENOENT));
490		}
491
492		dprintf("looking for snapshot '%s'\n", snapname);
493		err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
494		if (err == 0)
495			err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
496		dsl_dataset_rele(*dsp, tag);
497
498		if (err == 0) {
499			mutex_enter(&ds->ds_lock);
500			if (ds->ds_snapname[0] == 0)
501				(void) strlcpy(ds->ds_snapname, snapname,
502				    sizeof (ds->ds_snapname));
503			mutex_exit(&ds->ds_lock);
504			*dsp = ds;
505		}
506	}
507
508	dsl_dir_rele(dd, FTAG);
509	return (err);
510}
511
512int
513dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj,
514    void *tag, dsl_dataset_t **dsp)
515{
516	int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
517	if (err != 0)
518		return (err);
519	if (!dsl_dataset_tryown(*dsp, tag)) {
520		dsl_dataset_rele(*dsp, tag);
521		*dsp = NULL;
522		return (SET_ERROR(EBUSY));
523	}
524	return (0);
525}
526
527int
528dsl_dataset_own(dsl_pool_t *dp, const char *name,
529    void *tag, dsl_dataset_t **dsp)
530{
531	int err = dsl_dataset_hold(dp, name, tag, dsp);
532	if (err != 0)
533		return (err);
534	if (!dsl_dataset_tryown(*dsp, tag)) {
535		dsl_dataset_rele(*dsp, tag);
536		return (SET_ERROR(EBUSY));
537	}
538	return (0);
539}
540
541/*
542 * See the comment above dsl_pool_hold() for details.  In summary, a long
543 * hold is used to prevent destruction of a dataset while the pool hold
544 * is dropped, allowing other concurrent operations (e.g. spa_sync()).
545 *
546 * The dataset and pool must be held when this function is called.  After it
547 * is called, the pool hold may be released while the dataset is still held
548 * and accessed.
549 */
550void
551dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag)
552{
553	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
554	(void) refcount_add(&ds->ds_longholds, tag);
555}
556
557void
558dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag)
559{
560	(void) refcount_remove(&ds->ds_longholds, tag);
561}
562
563/* Return B_TRUE if there are any long holds on this dataset. */
564boolean_t
565dsl_dataset_long_held(dsl_dataset_t *ds)
566{
567	return (!refcount_is_zero(&ds->ds_longholds));
568}
569
570void
571dsl_dataset_name(dsl_dataset_t *ds, char *name)
572{
573	if (ds == NULL) {
574		(void) strcpy(name, "mos");
575	} else {
576		dsl_dir_name(ds->ds_dir, name);
577		VERIFY0(dsl_dataset_get_snapname(ds));
578		if (ds->ds_snapname[0]) {
579			(void) strcat(name, "@");
580			/*
581			 * We use a "recursive" mutex so that we
582			 * can call dprintf_ds() with ds_lock held.
583			 */
584			if (!MUTEX_HELD(&ds->ds_lock)) {
585				mutex_enter(&ds->ds_lock);
586				(void) strcat(name, ds->ds_snapname);
587				mutex_exit(&ds->ds_lock);
588			} else {
589				(void) strcat(name, ds->ds_snapname);
590			}
591		}
592	}
593}
594
595void
596dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
597{
598	dmu_buf_rele(ds->ds_dbuf, tag);
599}
600
601void
602dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
603{
604	ASSERT(ds->ds_owner == tag && ds->ds_dbuf != NULL);
605
606	mutex_enter(&ds->ds_lock);
607	ds->ds_owner = NULL;
608	mutex_exit(&ds->ds_lock);
609	dsl_dataset_long_rele(ds, tag);
610	if (ds->ds_dbuf != NULL)
611		dsl_dataset_rele(ds, tag);
612	else
613		dsl_dataset_evict(NULL, ds);
614}
615
616boolean_t
617dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
618{
619	boolean_t gotit = FALSE;
620
621	mutex_enter(&ds->ds_lock);
622	if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
623		ds->ds_owner = tag;
624		dsl_dataset_long_hold(ds, tag);
625		gotit = TRUE;
626	}
627	mutex_exit(&ds->ds_lock);
628	return (gotit);
629}
630
631uint64_t
632dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
633    uint64_t flags, dmu_tx_t *tx)
634{
635	dsl_pool_t *dp = dd->dd_pool;
636	dmu_buf_t *dbuf;
637	dsl_dataset_phys_t *dsphys;
638	uint64_t dsobj;
639	objset_t *mos = dp->dp_meta_objset;
640
641	if (origin == NULL)
642		origin = dp->dp_origin_snap;
643
644	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
645	ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
646	ASSERT(dmu_tx_is_syncing(tx));
647	ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
648
649	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
650	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
651	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
652	dmu_buf_will_dirty(dbuf, tx);
653	dsphys = dbuf->db_data;
654	bzero(dsphys, sizeof (dsl_dataset_phys_t));
655	dsphys->ds_dir_obj = dd->dd_object;
656	dsphys->ds_flags = flags;
657	dsphys->ds_fsid_guid = unique_create();
658	do {
659		(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
660		    sizeof (dsphys->ds_guid));
661	} while (dsphys->ds_guid == 0);
662	dsphys->ds_snapnames_zapobj =
663	    zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
664	    DMU_OT_NONE, 0, tx);
665	dsphys->ds_creation_time = gethrestime_sec();
666	dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
667
668	if (origin == NULL) {
669		dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
670	} else {
671		dsl_dataset_t *ohds; /* head of the origin snapshot */
672
673		dsphys->ds_prev_snap_obj = origin->ds_object;
674		dsphys->ds_prev_snap_txg =
675		    origin->ds_phys->ds_creation_txg;
676		dsphys->ds_referenced_bytes =
677		    origin->ds_phys->ds_referenced_bytes;
678		dsphys->ds_compressed_bytes =
679		    origin->ds_phys->ds_compressed_bytes;
680		dsphys->ds_uncompressed_bytes =
681		    origin->ds_phys->ds_uncompressed_bytes;
682		dsphys->ds_bp = origin->ds_phys->ds_bp;
683		dsphys->ds_flags |= origin->ds_phys->ds_flags;
684
685		dmu_buf_will_dirty(origin->ds_dbuf, tx);
686		origin->ds_phys->ds_num_children++;
687
688		VERIFY0(dsl_dataset_hold_obj(dp,
689		    origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
690		dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
691		    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
692		dsl_dataset_rele(ohds, FTAG);
693
694		if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
695			if (origin->ds_phys->ds_next_clones_obj == 0) {
696				origin->ds_phys->ds_next_clones_obj =
697				    zap_create(mos,
698				    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
699			}
700			VERIFY0(zap_add_int(mos,
701			    origin->ds_phys->ds_next_clones_obj, dsobj, tx));
702		}
703
704		dmu_buf_will_dirty(dd->dd_dbuf, tx);
705		dd->dd_phys->dd_origin_obj = origin->ds_object;
706		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
707			if (origin->ds_dir->dd_phys->dd_clones == 0) {
708				dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
709				origin->ds_dir->dd_phys->dd_clones =
710				    zap_create(mos,
711				    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
712			}
713			VERIFY0(zap_add_int(mos,
714			    origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
715		}
716	}
717
718	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
719		dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
720
721	dmu_buf_rele(dbuf, FTAG);
722
723	dmu_buf_will_dirty(dd->dd_dbuf, tx);
724	dd->dd_phys->dd_head_dataset_obj = dsobj;
725
726	return (dsobj);
727}
728
729static void
730dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
731{
732	objset_t *os;
733
734	VERIFY0(dmu_objset_from_ds(ds, &os));
735	bzero(&os->os_zil_header, sizeof (os->os_zil_header));
736	dsl_dataset_dirty(ds, tx);
737}
738
739uint64_t
740dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
741    dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx)
742{
743	dsl_pool_t *dp = pdd->dd_pool;
744	uint64_t dsobj, ddobj;
745	dsl_dir_t *dd;
746
747	ASSERT(dmu_tx_is_syncing(tx));
748	ASSERT(lastname[0] != '@');
749
750	ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
751	VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
752
753	dsobj = dsl_dataset_create_sync_dd(dd, origin,
754	    flags & ~DS_CREATE_FLAG_NODIRTY, tx);
755
756	dsl_deleg_set_create_perms(dd, tx, cr);
757
758	dsl_dir_rele(dd, FTAG);
759
760	/*
761	 * If we are creating a clone, make sure we zero out any stale
762	 * data from the origin snapshots zil header.
763	 */
764	if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) {
765		dsl_dataset_t *ds;
766
767		VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
768		dsl_dataset_zero_zil(ds, tx);
769		dsl_dataset_rele(ds, FTAG);
770	}
771
772	return (dsobj);
773}
774
775#ifdef __FreeBSD__
776/* FreeBSD ioctl compat begin */
777struct destroyarg {
778	nvlist_t *nvl;
779	const char *snapname;
780};
781
782static int
783dsl_check_snap_cb(const char *name, void *arg)
784{
785	struct destroyarg *da = arg;
786	dsl_dataset_t *ds;
787	char *dsname;
788
789	dsname = kmem_asprintf("%s@%s", name, da->snapname);
790	fnvlist_add_boolean(da->nvl, dsname);
791	kmem_free(dsname, strlen(dsname) + 1);
792
793	return (0);
794}
795
796int
797dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname,
798    nvlist_t *snaps)
799{
800	struct destroyarg *da;
801	int err;
802
803	da = kmem_zalloc(sizeof (struct destroyarg), KM_SLEEP);
804	da->nvl = snaps;
805	da->snapname = snapname;
806	err = dmu_objset_find(fsname, dsl_check_snap_cb, da,
807	    DS_FIND_CHILDREN);
808	kmem_free(da, sizeof (struct destroyarg));
809
810	return (err);
811}
812/* FreeBSD ioctl compat end */
813#endif /* __FreeBSD__ */
814
815/*
816 * The unique space in the head dataset can be calculated by subtracting
817 * the space used in the most recent snapshot, that is still being used
818 * in this file system, from the space currently in use.  To figure out
819 * the space in the most recent snapshot still in use, we need to take
820 * the total space used in the snapshot and subtract out the space that
821 * has been freed up since the snapshot was taken.
822 */
823void
824dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
825{
826	uint64_t mrs_used;
827	uint64_t dlused, dlcomp, dluncomp;
828
829	ASSERT(!dsl_dataset_is_snapshot(ds));
830
831	if (ds->ds_phys->ds_prev_snap_obj != 0)
832		mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
833	else
834		mrs_used = 0;
835
836	dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
837
838	ASSERT3U(dlused, <=, mrs_used);
839	ds->ds_phys->ds_unique_bytes =
840	    ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);
841
842	if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
843	    SPA_VERSION_UNIQUE_ACCURATE)
844		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
845}
846
847void
848dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
849    dmu_tx_t *tx)
850{
851	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
852	uint64_t count;
853	int err;
854
855	ASSERT(ds->ds_phys->ds_num_children >= 2);
856	err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
857	/*
858	 * The err should not be ENOENT, but a bug in a previous version
859	 * of the code could cause upgrade_clones_cb() to not set
860	 * ds_next_snap_obj when it should, leading to a missing entry.
861	 * If we knew that the pool was created after
862	 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
863	 * ENOENT.  However, at least we can check that we don't have
864	 * too many entries in the next_clones_obj even after failing to
865	 * remove this one.
866	 */
867	if (err != ENOENT)
868		VERIFY0(err);
869	ASSERT0(zap_count(mos, ds->ds_phys->ds_next_clones_obj,
870	    &count));
871	ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
872}
873
874
875blkptr_t *
876dsl_dataset_get_blkptr(dsl_dataset_t *ds)
877{
878	return (&ds->ds_phys->ds_bp);
879}
880
881void
882dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
883{
884	ASSERT(dmu_tx_is_syncing(tx));
885	/* If it's the meta-objset, set dp_meta_rootbp */
886	if (ds == NULL) {
887		tx->tx_pool->dp_meta_rootbp = *bp;
888	} else {
889		dmu_buf_will_dirty(ds->ds_dbuf, tx);
890		ds->ds_phys->ds_bp = *bp;
891	}
892}
893
894spa_t *
895dsl_dataset_get_spa(dsl_dataset_t *ds)
896{
897	return (ds->ds_dir->dd_pool->dp_spa);
898}
899
900void
901dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
902{
903	dsl_pool_t *dp;
904
905	if (ds == NULL) /* this is the meta-objset */
906		return;
907
908	ASSERT(ds->ds_objset != NULL);
909
910	if (ds->ds_phys->ds_next_snap_obj != 0)
911		panic("dirtying snapshot!");
912
913	dp = ds->ds_dir->dd_pool;
914
915	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
916		/* up the hold count until we can be written out */
917		dmu_buf_add_ref(ds->ds_dbuf, ds);
918	}
919}
920
921boolean_t
922dsl_dataset_is_dirty(dsl_dataset_t *ds)
923{
924	for (int t = 0; t < TXG_SIZE; t++) {
925		if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
926		    ds, t))
927			return (B_TRUE);
928	}
929	return (B_FALSE);
930}
931
932static int
933dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
934{
935	uint64_t asize;
936
937	if (!dmu_tx_is_syncing(tx))
938		return (0);
939
940	/*
941	 * If there's an fs-only reservation, any blocks that might become
942	 * owned by the snapshot dataset must be accommodated by space
943	 * outside of the reservation.
944	 */
945	ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
946	asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
947	if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
948		return (SET_ERROR(ENOSPC));
949
950	/*
951	 * Propagate any reserved space for this snapshot to other
952	 * snapshot checks in this sync group.
953	 */
954	if (asize > 0)
955		dsl_dir_willuse_space(ds->ds_dir, asize, tx);
956
957	return (0);
958}
959
960typedef struct dsl_dataset_snapshot_arg {
961	nvlist_t *ddsa_snaps;
962	nvlist_t *ddsa_props;
963	nvlist_t *ddsa_errors;
964} dsl_dataset_snapshot_arg_t;
965
966int
967dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
968    dmu_tx_t *tx, boolean_t recv)
969{
970	int error;
971	uint64_t value;
972
973	ds->ds_trysnap_txg = tx->tx_txg;
974
975	if (!dmu_tx_is_syncing(tx))
976		return (0);
977
978	/*
979	 * We don't allow multiple snapshots of the same txg.  If there
980	 * is already one, try again.
981	 */
982	if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
983		return (SET_ERROR(EAGAIN));
984
985	/*
986	 * Check for conflicting snapshot name.
987	 */
988	error = dsl_dataset_snap_lookup(ds, snapname, &value);
989	if (error == 0)
990		return (SET_ERROR(EEXIST));
991	if (error != ENOENT)
992		return (error);
993
994	/*
995	 * We don't allow taking snapshots of inconsistent datasets, such as
996	 * those into which we are currently receiving.  However, if we are
997	 * creating this snapshot as part of a receive, this check will be
998	 * executed atomically with respect to the completion of the receive
999	 * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this
1000	 * case we ignore this, knowing it will be fixed up for us shortly in
1001	 * dmu_recv_end_sync().
1002	 */
1003	if (!recv && DS_IS_INCONSISTENT(ds))
1004		return (SET_ERROR(EBUSY));
1005
1006	error = dsl_dataset_snapshot_reserve_space(ds, tx);
1007	if (error != 0)
1008		return (error);
1009
1010	return (0);
1011}
1012
1013static int
1014dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
1015{
1016	dsl_dataset_snapshot_arg_t *ddsa = arg;
1017	dsl_pool_t *dp = dmu_tx_pool(tx);
1018	nvpair_t *pair;
1019	int rv = 0;
1020
1021	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
1022	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
1023		int error = 0;
1024		dsl_dataset_t *ds;
1025		char *name, *atp;
1026		char dsname[MAXNAMELEN];
1027
1028		name = nvpair_name(pair);
1029		if (strlen(name) >= MAXNAMELEN)
1030			error = SET_ERROR(ENAMETOOLONG);
1031		if (error == 0) {
1032			atp = strchr(name, '@');
1033			if (atp == NULL)
1034				error = SET_ERROR(EINVAL);
1035			if (error == 0)
1036				(void) strlcpy(dsname, name, atp - name + 1);
1037		}
1038		if (error == 0)
1039			error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
1040		if (error == 0) {
1041			error = dsl_dataset_snapshot_check_impl(ds,
1042			    atp + 1, tx, B_FALSE);
1043			dsl_dataset_rele(ds, FTAG);
1044		}
1045
1046		if (error != 0) {
1047			if (ddsa->ddsa_errors != NULL) {
1048				fnvlist_add_int32(ddsa->ddsa_errors,
1049				    name, error);
1050			}
1051			rv = error;
1052		}
1053	}
1054	return (rv);
1055}
1056
1057void
1058dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
1059    dmu_tx_t *tx)
1060{
1061	static zil_header_t zero_zil;
1062
1063	dsl_pool_t *dp = ds->ds_dir->dd_pool;
1064	dmu_buf_t *dbuf;
1065	dsl_dataset_phys_t *dsphys;
1066	uint64_t dsobj, crtxg;
1067	objset_t *mos = dp->dp_meta_objset;
1068	objset_t *os;
1069
1070	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
1071
1072	/*
1073	 * If we are on an old pool, the zil must not be active, in which
1074	 * case it will be zeroed.  Usually zil_suspend() accomplishes this.
1075	 */
1076	ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
1077	    dmu_objset_from_ds(ds, &os) != 0 ||
1078	    bcmp(&os->os_phys->os_zil_header, &zero_zil,
1079	    sizeof (zero_zil)) == 0);
1080
1081
1082	/*
1083	 * The origin's ds_creation_txg has to be < TXG_INITIAL
1084	 */
1085	if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
1086		crtxg = 1;
1087	else
1088		crtxg = tx->tx_txg;
1089
1090	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
1091	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
1092	VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
1093	dmu_buf_will_dirty(dbuf, tx);
1094	dsphys = dbuf->db_data;
1095	bzero(dsphys, sizeof (dsl_dataset_phys_t));
1096	dsphys->ds_dir_obj = ds->ds_dir->dd_object;
1097	dsphys->ds_fsid_guid = unique_create();
1098	do {
1099		(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
1100		    sizeof (dsphys->ds_guid));
1101	} while (dsphys->ds_guid == 0);
1102	dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
1103	dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
1104	dsphys->ds_next_snap_obj = ds->ds_object;
1105	dsphys->ds_num_children = 1;
1106	dsphys->ds_creation_time = gethrestime_sec();
1107	dsphys->ds_creation_txg = crtxg;
1108	dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
1109	dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes;
1110	dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
1111	dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
1112	dsphys->ds_flags = ds->ds_phys->ds_flags;
1113	dsphys->ds_bp = ds->ds_phys->ds_bp;
1114	dmu_buf_rele(dbuf, FTAG);
1115
1116	ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
1117	if (ds->ds_prev) {
1118		uint64_t next_clones_obj =
1119		    ds->ds_prev->ds_phys->ds_next_clones_obj;
1120		ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
1121		    ds->ds_object ||
1122		    ds->ds_prev->ds_phys->ds_num_children > 1);
1123		if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
1124			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
1125			ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
1126			    ds->ds_prev->ds_phys->ds_creation_txg);
1127			ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
1128		} else if (next_clones_obj != 0) {
1129			dsl_dataset_remove_from_next_clones(ds->ds_prev,
1130			    dsphys->ds_next_snap_obj, tx);
1131			VERIFY0(zap_add_int(mos,
1132			    next_clones_obj, dsobj, tx));
1133		}
1134	}
1135
1136	/*
1137	 * If we have a reference-reservation on this dataset, we will
1138	 * need to increase the amount of refreservation being charged
1139	 * since our unique space is going to zero.
1140	 */
1141	if (ds->ds_reserved) {
1142		int64_t delta;
1143		ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
1144		delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
1145		dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
1146		    delta, 0, 0, tx);
1147	}
1148
1149	dmu_buf_will_dirty(ds->ds_dbuf, tx);
1150	ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
1151	    UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
1152	dsl_deadlist_close(&ds->ds_deadlist);
1153	dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
1154	dsl_deadlist_add_key(&ds->ds_deadlist,
1155	    ds->ds_phys->ds_prev_snap_txg, tx);
1156
1157	ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
1158	ds->ds_phys->ds_prev_snap_obj = dsobj;
1159	ds->ds_phys->ds_prev_snap_txg = crtxg;
1160	ds->ds_phys->ds_unique_bytes = 0;
1161	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
1162		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
1163
1164	VERIFY0(zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
1165	    snapname, 8, 1, &dsobj, tx));
1166
1167	if (ds->ds_prev)
1168		dsl_dataset_rele(ds->ds_prev, ds);
1169	VERIFY0(dsl_dataset_hold_obj(dp,
1170	    ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
1171
1172	dsl_scan_ds_snapshotted(ds, tx);
1173
1174	dsl_dir_snap_cmtime_update(ds->ds_dir);
1175
1176	spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, "");
1177}
1178
1179static void
1180dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
1181{
1182	dsl_dataset_snapshot_arg_t *ddsa = arg;
1183	dsl_pool_t *dp = dmu_tx_pool(tx);
1184	nvpair_t *pair;
1185
1186	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
1187	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
1188		dsl_dataset_t *ds;
1189		char *name, *atp;
1190		char dsname[MAXNAMELEN];
1191
1192		name = nvpair_name(pair);
1193		atp = strchr(name, '@');
1194		(void) strlcpy(dsname, name, atp - name + 1);
1195		VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds));
1196
1197		dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx);
1198		if (ddsa->ddsa_props != NULL) {
1199			dsl_props_set_sync_impl(ds->ds_prev,
1200			    ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
1201		}
1202		dsl_dataset_rele(ds, FTAG);
1203	}
1204}
1205
1206/*
1207 * The snapshots must all be in the same pool.
1208 * All-or-nothing: if there are any failures, nothing will be modified.
1209 */
1210int
1211dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
1212{
1213	dsl_dataset_snapshot_arg_t ddsa;
1214	nvpair_t *pair;
1215	boolean_t needsuspend;
1216	int error;
1217	spa_t *spa;
1218	char *firstname;
1219	nvlist_t *suspended = NULL;
1220
1221	pair = nvlist_next_nvpair(snaps, NULL);
1222	if (pair == NULL)
1223		return (0);
1224	firstname = nvpair_name(pair);
1225
1226	error = spa_open(firstname, &spa, FTAG);
1227	if (error != 0)
1228		return (error);
1229	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
1230	spa_close(spa, FTAG);
1231
1232	if (needsuspend) {
1233		suspended = fnvlist_alloc();
1234		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
1235		    pair = nvlist_next_nvpair(snaps, pair)) {
1236			char fsname[MAXNAMELEN];
1237			char *snapname = nvpair_name(pair);
1238			char *atp;
1239			void *cookie;
1240
1241			atp = strchr(snapname, '@');
1242			if (atp == NULL) {
1243				error = SET_ERROR(EINVAL);
1244				break;
1245			}
1246			(void) strlcpy(fsname, snapname, atp - snapname + 1);
1247
1248			error = zil_suspend(fsname, &cookie);
1249			if (error != 0)
1250				break;
1251			fnvlist_add_uint64(suspended, fsname,
1252			    (uintptr_t)cookie);
1253		}
1254	}
1255
1256	ddsa.ddsa_snaps = snaps;
1257	ddsa.ddsa_props = props;
1258	ddsa.ddsa_errors = errors;
1259
1260	if (error == 0) {
1261		error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
1262		    dsl_dataset_snapshot_sync, &ddsa,
1263		    fnvlist_num_pairs(snaps) * 3);
1264	}
1265
1266	if (suspended != NULL) {
1267		for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL;
1268		    pair = nvlist_next_nvpair(suspended, pair)) {
1269			zil_resume((void *)(uintptr_t)
1270			    fnvpair_value_uint64(pair));
1271		}
1272		fnvlist_free(suspended);
1273	}
1274
1275#ifdef __FreeBSD__
1276#ifdef _KERNEL
1277	if (error == 0) {
1278		for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
1279		    pair = nvlist_next_nvpair(snaps, pair)) {
1280			char *snapname = nvpair_name(pair);
1281			zvol_create_minors(snapname);
1282		}
1283	}
1284#endif
1285#endif
1286	return (error);
1287}
1288
1289typedef struct dsl_dataset_snapshot_tmp_arg {
1290	const char *ddsta_fsname;
1291	const char *ddsta_snapname;
1292	minor_t ddsta_cleanup_minor;
1293	const char *ddsta_htag;
1294} dsl_dataset_snapshot_tmp_arg_t;
1295
1296static int
1297dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
1298{
1299	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
1300	dsl_pool_t *dp = dmu_tx_pool(tx);
1301	dsl_dataset_t *ds;
1302	int error;
1303
1304	error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds);
1305	if (error != 0)
1306		return (error);
1307
1308	error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
1309	    tx, B_FALSE);
1310	if (error != 0) {
1311		dsl_dataset_rele(ds, FTAG);
1312		return (error);
1313	}
1314
1315	if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) {
1316		dsl_dataset_rele(ds, FTAG);
1317		return (SET_ERROR(ENOTSUP));
1318	}
1319	error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag,
1320	    B_TRUE, tx);
1321	if (error != 0) {
1322		dsl_dataset_rele(ds, FTAG);
1323		return (error);
1324	}
1325
1326	dsl_dataset_rele(ds, FTAG);
1327	return (0);
1328}
1329
1330static void
1331dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx)
1332{
1333	dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
1334	dsl_pool_t *dp = dmu_tx_pool(tx);
1335	dsl_dataset_t *ds;
1336
1337	VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds));
1338
1339	dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx);
1340	dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag,
1341	    ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx);
1342	dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx);
1343
1344	dsl_dataset_rele(ds, FTAG);
1345}
1346
1347int
1348dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
1349    minor_t cleanup_minor, const char *htag)
1350{
1351	dsl_dataset_snapshot_tmp_arg_t ddsta;
1352	int error;
1353	spa_t *spa;
1354	boolean_t needsuspend;
1355	void *cookie;
1356
1357	ddsta.ddsta_fsname = fsname;
1358	ddsta.ddsta_snapname = snapname;
1359	ddsta.ddsta_cleanup_minor = cleanup_minor;
1360	ddsta.ddsta_htag = htag;
1361
1362	error = spa_open(fsname, &spa, FTAG);
1363	if (error != 0)
1364		return (error);
1365	needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
1366	spa_close(spa, FTAG);
1367
1368	if (needsuspend) {
1369		error = zil_suspend(fsname, &cookie);
1370		if (error != 0)
1371			return (error);
1372	}
1373
1374	error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
1375	    dsl_dataset_snapshot_tmp_sync, &ddsta, 3);
1376
1377	if (needsuspend)
1378		zil_resume(cookie);
1379	return (error);
1380}
1381
1382
1383void
1384dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
1385{
1386	ASSERT(dmu_tx_is_syncing(tx));
1387	ASSERT(ds->ds_objset != NULL);
1388	ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
1389
1390	/*
1391	 * in case we had to change ds_fsid_guid when we opened it,
1392	 * sync it out now.
1393	 */
1394	dmu_buf_will_dirty(ds->ds_dbuf, tx);
1395	ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
1396
1397	dmu_objset_sync(ds->ds_objset, zio, tx);
1398}
1399
1400static void
1401get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
1402{
1403	uint64_t count = 0;
1404	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
1405	zap_cursor_t zc;
1406	zap_attribute_t za;
1407	nvlist_t *propval = fnvlist_alloc();
1408	nvlist_t *val = fnvlist_alloc();
1409
1410	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
1411
1412	/*
1413	 * There may be missing entries in ds_next_clones_obj
1414	 * due to a bug in a previous version of the code.
1415	 * Only trust it if it has the right number of entries.
1416	 */
1417	if (ds->ds_phys->ds_next_clones_obj != 0) {
1418		ASSERT0(zap_count(mos, ds->ds_phys->ds_next_clones_obj,
1419		    &count));
1420	}
1421	if (count != ds->ds_phys->ds_num_children - 1)
1422		goto fail;
1423	for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj);
1424	    zap_cursor_retrieve(&zc, &za) == 0;
1425	    zap_cursor_advance(&zc)) {
1426		dsl_dataset_t *clone;
1427		char buf[ZFS_MAXNAMELEN];
1428		VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
1429		    za.za_first_integer, FTAG, &clone));
1430		dsl_dir_name(clone->ds_dir, buf);
1431		fnvlist_add_boolean(val, buf);
1432		dsl_dataset_rele(clone, FTAG);
1433	}
1434	zap_cursor_fini(&zc);
1435	fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
1436	fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), propval);
1437fail:
1438	nvlist_free(val);
1439	nvlist_free(propval);
1440}
1441
1442void
1443dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
1444{
1445	dsl_pool_t *dp = ds->ds_dir->dd_pool;
1446	uint64_t refd, avail, uobjs, aobjs, ratio;
1447
1448	ASSERT(dsl_pool_config_held(dp));
1449
1450	ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
1451	    (ds->ds_phys->ds_uncompressed_bytes * 100 /
1452	    ds->ds_phys->ds_compressed_bytes);
1453
1454	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
1455	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
1456	    ds->ds_phys->ds_uncompressed_bytes);
1457
1458	if (dsl_dataset_is_snapshot(ds)) {
1459		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
1460		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
1461		    ds->ds_phys->ds_unique_bytes);
1462		get_clones_stat(ds, nv);
1463	} else {
1464		dsl_dir_stats(ds->ds_dir, nv);
1465	}
1466
1467	dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs);
1468	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail);
1469	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
1470
1471	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
1472	    ds->ds_phys->ds_creation_time);
1473	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
1474	    ds->ds_phys->ds_creation_txg);
1475	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
1476	    ds->ds_quota);
1477	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
1478	    ds->ds_reserved);
1479	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
1480	    ds->ds_phys->ds_guid);
1481	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
1482	    ds->ds_phys->ds_unique_bytes);
1483	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
1484	    ds->ds_object);
1485	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
1486	    ds->ds_userrefs);
1487	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
1488	    DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
1489
1490	if (ds->ds_phys->ds_prev_snap_obj != 0) {
1491		uint64_t written, comp, uncomp;
1492		dsl_pool_t *dp = ds->ds_dir->dd_pool;
1493		dsl_dataset_t *prev;
1494
1495		int err = dsl_dataset_hold_obj(dp,
1496		    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
1497		if (err == 0) {
1498			err = dsl_dataset_space_written(prev, ds, &written,
1499			    &comp, &uncomp);
1500			dsl_dataset_rele(prev, FTAG);
1501			if (err == 0) {
1502				dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
1503				    written);
1504			}
1505		}
1506	}
1507}
1508
1509void
1510dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
1511{
1512	dsl_pool_t *dp = ds->ds_dir->dd_pool;
1513	ASSERT(dsl_pool_config_held(dp));
1514
1515	stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
1516	stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
1517	stat->dds_guid = ds->ds_phys->ds_guid;
1518	stat->dds_origin[0] = '\0';
1519	if (dsl_dataset_is_snapshot(ds)) {
1520		stat->dds_is_snapshot = B_TRUE;
1521		stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
1522	} else {
1523		stat->dds_is_snapshot = B_FALSE;
1524		stat->dds_num_clones = 0;
1525
1526		if (dsl_dir_is_clone(ds->ds_dir)) {
1527			dsl_dataset_t *ods;
1528
1529			VERIFY0(dsl_dataset_hold_obj(dp,
1530			    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
1531			dsl_dataset_name(ods, stat->dds_origin);
1532			dsl_dataset_rele(ods, FTAG);
1533		}
1534	}
1535}
1536
1537uint64_t
1538dsl_dataset_fsid_guid(dsl_dataset_t *ds)
1539{
1540	return (ds->ds_fsid_guid);
1541}
1542
1543void
1544dsl_dataset_space(dsl_dataset_t *ds,
1545    uint64_t *refdbytesp, uint64_t *availbytesp,
1546    uint64_t *usedobjsp, uint64_t *availobjsp)
1547{
1548	*refdbytesp = ds->ds_phys->ds_referenced_bytes;
1549	*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
1550	if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
1551		*availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
1552	if (ds->ds_quota != 0) {
1553		/*
1554		 * Adjust available bytes according to refquota
1555		 */
1556		if (*refdbytesp < ds->ds_quota)
1557			*availbytesp = MIN(*availbytesp,
1558			    ds->ds_quota - *refdbytesp);
1559		else
1560			*availbytesp = 0;
1561	}
1562	*usedobjsp = ds->ds_phys->ds_bp.blk_fill;
1563	*availobjsp = DN_MAX_OBJECT - *usedobjsp;
1564}
1565
1566boolean_t
1567dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
1568{
1569	dsl_pool_t *dp = ds->ds_dir->dd_pool;
1570
1571	ASSERT(dsl_pool_config_held(dp));
1572	if (snap == NULL)
1573		return (B_FALSE);
1574	if (ds->ds_phys->ds_bp.blk_birth >
1575	    snap->ds_phys->ds_creation_txg) {
1576		objset_t *os, *os_snap;
1577		/*
1578		 * It may be that only the ZIL differs, because it was
1579		 * reset in the head.  Don't count that as being
1580		 * modified.
1581		 */
1582		if (dmu_objset_from_ds(ds, &os) != 0)
1583			return (B_TRUE);
1584		if (dmu_objset_from_ds(snap, &os_snap) != 0)
1585			return (B_TRUE);
1586		return (bcmp(&os->os_phys->os_meta_dnode,
1587		    &os_snap->os_phys->os_meta_dnode,
1588		    sizeof (os->os_phys->os_meta_dnode)) != 0);
1589	}
1590	return (B_FALSE);
1591}
1592
1593typedef struct dsl_dataset_rename_snapshot_arg {
1594	const char *ddrsa_fsname;
1595	const char *ddrsa_oldsnapname;
1596	const char *ddrsa_newsnapname;
1597	boolean_t ddrsa_recursive;
1598	dmu_tx_t *ddrsa_tx;
1599} dsl_dataset_rename_snapshot_arg_t;
1600
1601/* ARGSUSED */
1602static int
1603dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
1604    dsl_dataset_t *hds, void *arg)
1605{
1606	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
1607	int error;
1608	uint64_t val;
1609
1610	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
1611	if (error != 0) {
1612		/* ignore nonexistent snapshots */
1613		return (error == ENOENT ? 0 : error);
1614	}
1615
1616	/* new name should not exist */
1617	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val);
1618	if (error == 0)
1619		error = SET_ERROR(EEXIST);
1620	else if (error == ENOENT)
1621		error = 0;
1622
1623	/* dataset name + 1 for the "@" + the new snapshot name must fit */
1624	if (dsl_dir_namelen(hds->ds_dir) + 1 +
1625	    strlen(ddrsa->ddrsa_newsnapname) >= MAXNAMELEN)
1626		error = SET_ERROR(ENAMETOOLONG);
1627
1628	return (error);
1629}
1630
1631static int
1632dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
1633{
1634	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
1635	dsl_pool_t *dp = dmu_tx_pool(tx);
1636	dsl_dataset_t *hds;
1637	int error;
1638
1639	error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);
1640	if (error != 0)
1641		return (error);
1642
1643	if (ddrsa->ddrsa_recursive) {
1644		error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
1645		    dsl_dataset_rename_snapshot_check_impl, ddrsa,
1646		    DS_FIND_CHILDREN);
1647	} else {
1648		error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);
1649	}
1650	dsl_dataset_rele(hds, FTAG);
1651	return (error);
1652}
1653
1654static int
1655dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
1656    dsl_dataset_t *hds, void *arg)
1657{
1658#ifdef __FreeBSD__
1659#ifdef _KERNEL
1660	char *oldname, *newname;
1661#endif
1662#endif
1663	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
1664	dsl_dataset_t *ds;
1665	uint64_t val;
1666	dmu_tx_t *tx = ddrsa->ddrsa_tx;
1667	int error;
1668
1669	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
1670	ASSERT(error == 0 || error == ENOENT);
1671	if (error == ENOENT) {
1672		/* ignore nonexistent snapshots */
1673		return (0);
1674	}
1675
1676	VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));
1677
1678	/* log before we change the name */
1679	spa_history_log_internal_ds(ds, "rename", tx,
1680	    "-> @%s", ddrsa->ddrsa_newsnapname);
1681
1682	VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx));
1683	mutex_enter(&ds->ds_lock);
1684	(void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
1685	mutex_exit(&ds->ds_lock);
1686	VERIFY0(zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj,
1687	    ds->ds_snapname, 8, 1, &ds->ds_object, tx));
1688
1689#ifdef __FreeBSD__
1690#ifdef _KERNEL
1691	oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1692	newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1693	snprintf(oldname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
1694	    ddrsa->ddrsa_oldsnapname);
1695	snprintf(newname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname,
1696	    ddrsa->ddrsa_newsnapname);
1697	zfsvfs_update_fromname(oldname, newname);
1698	zvol_rename_minors(oldname, newname);
1699	kmem_free(newname, MAXPATHLEN);
1700	kmem_free(oldname, MAXPATHLEN);
1701#endif
1702#endif
1703	dsl_dataset_rele(ds, FTAG);
1704
1705	return (0);
1706}
1707
1708static void
1709dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
1710{
1711	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
1712	dsl_pool_t *dp = dmu_tx_pool(tx);
1713	dsl_dataset_t *hds;
1714
1715	VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
1716	ddrsa->ddrsa_tx = tx;
1717	if (ddrsa->ddrsa_recursive) {
1718		VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
1719		    dsl_dataset_rename_snapshot_sync_impl, ddrsa,
1720		    DS_FIND_CHILDREN));
1721	} else {
1722		VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));
1723	}
1724	dsl_dataset_rele(hds, FTAG);
1725}
1726
1727int
1728dsl_dataset_rename_snapshot(const char *fsname,
1729    const char *oldsnapname, const char *newsnapname, boolean_t recursive)
1730{
1731	dsl_dataset_rename_snapshot_arg_t ddrsa;
1732
1733	ddrsa.ddrsa_fsname = fsname;
1734	ddrsa.ddrsa_oldsnapname = oldsnapname;
1735	ddrsa.ddrsa_newsnapname = newsnapname;
1736	ddrsa.ddrsa_recursive = recursive;
1737
1738	return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
1739	    dsl_dataset_rename_snapshot_sync, &ddrsa, 1));
1740}
1741
1742/*
1743 * If we're doing an ownership handoff, we need to make sure that there is
1744 * only one long hold on the dataset.  We're not allowed to change anything here
1745 * so we don't permanently release the long hold or regular hold here.  We want
1746 * to do this only when syncing to avoid the dataset unexpectedly going away
1747 * when we release the long hold.
1748 */
1749static int
1750dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
1751{
1752	boolean_t held;
1753
1754	if (!dmu_tx_is_syncing(tx))
1755		return (0);
1756
1757	if (owner != NULL) {
1758		VERIFY3P(ds->ds_owner, ==, owner);
1759		dsl_dataset_long_rele(ds, owner);
1760	}
1761
1762	held = dsl_dataset_long_held(ds);
1763
1764	if (owner != NULL)
1765		dsl_dataset_long_hold(ds, owner);
1766
1767	if (held)
1768		return (SET_ERROR(EBUSY));
1769
1770	return (0);
1771}
1772
1773typedef struct dsl_dataset_rollback_arg {
1774	const char *ddra_fsname;
1775	void *ddra_owner;
1776	nvlist_t *ddra_result;
1777} dsl_dataset_rollback_arg_t;
1778
1779static int
1780dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
1781{
1782	dsl_dataset_rollback_arg_t *ddra = arg;
1783	dsl_pool_t *dp = dmu_tx_pool(tx);
1784	dsl_dataset_t *ds;
1785	int64_t unused_refres_delta;
1786	int error;
1787
1788	error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
1789	if (error != 0)
1790		return (error);
1791
1792	/* must not be a snapshot */
1793	if (dsl_dataset_is_snapshot(ds)) {
1794		dsl_dataset_rele(ds, FTAG);
1795		return (SET_ERROR(EINVAL));
1796	}
1797
1798	/* must have a most recent snapshot */
1799	if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) {
1800		dsl_dataset_rele(ds, FTAG);
1801		return (SET_ERROR(EINVAL));
1802	}
1803
1804	error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
1805	if (error != 0) {
1806		dsl_dataset_rele(ds, FTAG);
1807		return (error);
1808	}
1809
1810	/*
1811	 * Check if the snap we are rolling back to uses more than
1812	 * the refquota.
1813	 */
1814	if (ds->ds_quota != 0 &&
1815	    ds->ds_prev->ds_phys->ds_referenced_bytes > ds->ds_quota) {
1816		dsl_dataset_rele(ds, FTAG);
1817		return (SET_ERROR(EDQUOT));
1818	}
1819
1820	/*
1821	 * When we do the clone swap, we will temporarily use more space
1822	 * due to the refreservation (the head will no longer have any
1823	 * unique space, so the entire amount of the refreservation will need
1824	 * to be free).  We will immediately destroy the clone, freeing
1825	 * this space, but the freeing happens over many txg's.
1826	 */
1827	unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
1828	    ds->ds_phys->ds_unique_bytes);
1829
1830	if (unused_refres_delta > 0 &&
1831	    unused_refres_delta >
1832	    dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {
1833		dsl_dataset_rele(ds, FTAG);
1834		return (SET_ERROR(ENOSPC));
1835	}
1836
1837	dsl_dataset_rele(ds, FTAG);
1838	return (0);
1839}
1840
1841static void
1842dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
1843{
1844	dsl_dataset_rollback_arg_t *ddra = arg;
1845	dsl_pool_t *dp = dmu_tx_pool(tx);
1846	dsl_dataset_t *ds, *clone;
1847	uint64_t cloneobj;
1848	char namebuf[ZFS_MAXNAMELEN];
1849
1850	VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));
1851
1852	dsl_dataset_name(ds->ds_prev, namebuf);
1853	fnvlist_add_string(ddra->ddra_result, "target", namebuf);
1854
1855	cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
1856	    ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);
1857
1858	VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));
1859
1860	dsl_dataset_clone_swap_sync_impl(clone, ds, tx);
1861	dsl_dataset_zero_zil(ds, tx);
1862
1863	dsl_destroy_head_sync_impl(clone, tx);
1864
1865	dsl_dataset_rele(clone, FTAG);
1866	dsl_dataset_rele(ds, FTAG);
1867}
1868
1869/*
1870 * Rolls back the given filesystem or volume to the most recent snapshot.
1871 * The name of the most recent snapshot will be returned under key "target"
1872 * in the result nvlist.
1873 *
1874 * If owner != NULL:
1875 * - The existing dataset MUST be owned by the specified owner at entry
1876 * - Upon return, dataset will still be held by the same owner, whether we
1877 *   succeed or not.
1878 *
1879 * This mode is required any time the existing filesystem is mounted.  See
1880 * notes above zfs_suspend_fs() for further details.
1881 */
1882int
1883dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result)
1884{
1885	dsl_dataset_rollback_arg_t ddra;
1886
1887	ddra.ddra_fsname = fsname;
1888	ddra.ddra_owner = owner;
1889	ddra.ddra_result = result;
1890
1891	return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
1892	    dsl_dataset_rollback_sync, &ddra, 1));
1893}
1894
1895struct promotenode {
1896	list_node_t link;
1897	dsl_dataset_t *ds;
1898};
1899
1900typedef struct dsl_dataset_promote_arg {
1901	const char *ddpa_clonename;
1902	dsl_dataset_t *ddpa_clone;
1903	list_t shared_snaps, origin_snaps, clone_snaps;
1904	dsl_dataset_t *origin_origin; /* origin of the origin */
1905	uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
1906	char *err_ds;
1907} dsl_dataset_promote_arg_t;
1908
1909static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
1910static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
1911    void *tag);
1912static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);
1913
1914static int
1915dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
1916{
1917	dsl_dataset_promote_arg_t *ddpa = arg;
1918	dsl_pool_t *dp = dmu_tx_pool(tx);
1919	dsl_dataset_t *hds;
1920	struct promotenode *snap;
1921	dsl_dataset_t *origin_ds;
1922	int err;
1923	uint64_t unused;
1924
1925	err = promote_hold(ddpa, dp, FTAG);
1926	if (err != 0)
1927		return (err);
1928
1929	hds = ddpa->ddpa_clone;
1930
1931	if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) {
1932		promote_rele(ddpa, FTAG);
1933		return (SET_ERROR(EXDEV));
1934	}
1935
1936	/*
1937	 * Compute and check the amount of space to transfer.  Since this is
1938	 * so expensive, don't do the preliminary check.
1939	 */
1940	if (!dmu_tx_is_syncing(tx)) {
1941		promote_rele(ddpa, FTAG);
1942		return (0);
1943	}
1944
1945	snap = list_head(&ddpa->shared_snaps);
1946	origin_ds = snap->ds;
1947
1948	/* compute origin's new unique space */
1949	snap = list_tail(&ddpa->clone_snaps);
1950	ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
1951	dsl_deadlist_space_range(&snap->ds->ds_deadlist,
1952	    origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
1953	    &ddpa->unique, &unused, &unused);
1954
1955	/*
1956	 * Walk the snapshots that we are moving
1957	 *
1958	 * Compute space to transfer.  Consider the incremental changes
1959	 * to used by each snapshot:
1960	 * (my used) = (prev's used) + (blocks born) - (blocks killed)
1961	 * So each snapshot gave birth to:
1962	 * (blocks born) = (my used) - (prev's used) + (blocks killed)
1963	 * So a sequence would look like:
1964	 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
1965	 * Which simplifies to:
1966	 * uN + kN + kN-1 + ... + k1 + k0
1967	 * Note however, if we stop before we reach the ORIGIN we get:
1968	 * uN + kN + kN-1 + ... + kM - uM-1
1969	 */
1970	ddpa->used = origin_ds->ds_phys->ds_referenced_bytes;
1971	ddpa->comp = origin_ds->ds_phys->ds_compressed_bytes;
1972	ddpa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
1973	for (snap = list_head(&ddpa->shared_snaps); snap;
1974	    snap = list_next(&ddpa->shared_snaps, snap)) {
1975		uint64_t val, dlused, dlcomp, dluncomp;
1976		dsl_dataset_t *ds = snap->ds;
1977
1978		/*
1979		 * If there are long holds, we won't be able to evict
1980		 * the objset.
1981		 */
1982		if (dsl_dataset_long_held(ds)) {
1983			err = SET_ERROR(EBUSY);
1984			goto out;
1985		}
1986
1987		/* Check that the snapshot name does not conflict */
1988		VERIFY0(dsl_dataset_get_snapname(ds));
1989		err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
1990		if (err == 0) {
1991			(void) strcpy(ddpa->err_ds, snap->ds->ds_snapname);
1992			err = SET_ERROR(EEXIST);
1993			goto out;
1994		}
1995		if (err != ENOENT)
1996			goto out;
1997
1998		/* The very first snapshot does not have a deadlist */
1999		if (ds->ds_phys->ds_prev_snap_obj == 0)
2000			continue;
2001
2002		dsl_deadlist_space(&ds->ds_deadlist,
2003		    &dlused, &dlcomp, &dluncomp);
2004		ddpa->used += dlused;
2005		ddpa->comp += dlcomp;
2006		ddpa->uncomp += dluncomp;
2007	}
2008
2009	/*
2010	 * If we are a clone of a clone then we never reached ORIGIN,
2011	 * so we need to subtract out the clone origin's used space.
2012	 */
2013	if (ddpa->origin_origin) {
2014		ddpa->used -= ddpa->origin_origin->ds_phys->ds_referenced_bytes;
2015		ddpa->comp -= ddpa->origin_origin->ds_phys->ds_compressed_bytes;
2016		ddpa->uncomp -=
2017		    ddpa->origin_origin->ds_phys->ds_uncompressed_bytes;
2018	}
2019
2020	/* Check that there is enough space here */
2021	err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
2022	    ddpa->used);
2023	if (err != 0)
2024		goto out;
2025
2026	/*
2027	 * Compute the amounts of space that will be used by snapshots
2028	 * after the promotion (for both origin and clone).  For each,
2029	 * it is the amount of space that will be on all of their
2030	 * deadlists (that was not born before their new origin).
2031	 */
2032	if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
2033		uint64_t space;
2034
2035		/*
2036		 * Note, typically this will not be a clone of a clone,
2037		 * so dd_origin_txg will be < TXG_INITIAL, so
2038		 * these snaplist_space() -> dsl_deadlist_space_range()
2039		 * calls will be fast because they do not have to
2040		 * iterate over all bps.
2041		 */
2042		snap = list_head(&ddpa->origin_snaps);
2043		err = snaplist_space(&ddpa->shared_snaps,
2044		    snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
2045		if (err != 0)
2046			goto out;
2047
2048		err = snaplist_space(&ddpa->clone_snaps,
2049		    snap->ds->ds_dir->dd_origin_txg, &space);
2050		if (err != 0)
2051			goto out;
2052		ddpa->cloneusedsnap += space;
2053	}
2054	if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
2055		err = snaplist_space(&ddpa->origin_snaps,
2056		    origin_ds->ds_phys->ds_creation_txg, &ddpa->originusedsnap);
2057		if (err != 0)
2058			goto out;
2059	}
2060
2061out:
2062	promote_rele(ddpa, FTAG);
2063	return (err);
2064}
2065
/*
 * Sync-task callback that carries out the promotion of the clone named in
 * the dsl_dataset_promote_arg_t passed as 'arg'.
 *
 * The holds are re-taken with promote_hold() and dropped again with
 * promote_rele() before returning.  In between, the function:
 *   - points the origin snapshot's ds_next_snap_obj at the oldest dataset
 *     of the clone's own chain,
 *   - exchanges the origin relationship of the two dsl_dirs (and the
 *     matching dd_clones entries on pools that have them),
 *   - moves every snapshot on shared_snaps from the origin's dsl_dir into
 *     the clone's dsl_dir,
 *   - adjusts DD_USED_SNAP / DD_USED_HEAD accounting on both dsl_dirs using
 *     the values stored in ddpa (cloneusedsnap, originusedsnap, used, comp,
 *     uncomp, unique),
 *   - writes a "promote" history record.
 *
 * Every step that can fail is wrapped in VERIFY0(), so nothing is returned.
 */
2066static void
2067dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
2068{
2069	dsl_dataset_promote_arg_t *ddpa = arg;
2070	dsl_pool_t *dp = dmu_tx_pool(tx);
2071	dsl_dataset_t *hds;
2072	struct promotenode *snap;
2073	dsl_dataset_t *origin_ds;
2074	dsl_dataset_t *origin_head;
2075	dsl_dir_t *dd;
2076	dsl_dir_t *odd = NULL;
2077	uint64_t oldnext_obj;
2078	int64_t delta;
2079
2080	VERIFY0(promote_hold(ddpa, dp, FTAG));
	/* hds: the head dataset of the clone being promoted. */
2081	hds = ddpa->ddpa_clone;
2082
2083	ASSERT0(hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE);
2084
	/*
	 * The head of shared_snaps is the clone's current origin snapshot
	 * (asserted against dd_origin_obj further down); dd is the clone's
	 * own dsl_dir.
	 */
2085	snap = list_head(&ddpa->shared_snaps);
2086	origin_ds = snap->ds;
2087	dd = hds->ds_dir;
2088
	/*
	 * promote_hold() starts origin_snaps at the dd_head_dataset_obj of
	 * the origin's dsl_dir, so its list head is that head dataset.
	 */
2089	snap = list_head(&ddpa->origin_snaps);
2090	origin_head = snap->ds;
2091
2092	/*
2093	 * We need to explicitly open odd, since origin_ds's dd will be
2094	 * changing.
2095	 */
2096	VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
2097	    NULL, FTAG, &odd));
2098
2099	/* change origin's next snap */
2100	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
2101	oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
	/*
	 * clone_snaps is built newest-first, so its tail is the oldest
	 * dataset of the clone's chain; its previous snapshot must be the
	 * origin.
	 */
2102	snap = list_tail(&ddpa->clone_snaps);
2103	ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
2104	origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object;
2105
2106	/* change the origin's next clone */
2107	if (origin_ds->ds_phys->ds_next_clones_obj) {
		/*
		 * The clone-side dataset is now the origin's next snapshot,
		 * and the former next snapshot (oldnext_obj) is recorded as
		 * a next-clone instead.
		 */
2108		dsl_dataset_remove_from_next_clones(origin_ds,
2109		    snap->ds->ds_object, tx);
2110		VERIFY0(zap_add_int(dp->dp_meta_objset,
2111		    origin_ds->ds_phys->ds_next_clones_obj,
2112		    oldnext_obj, tx));
2113	}
2114
2115	/* change origin */
	/*
	 * The clone's dir inherits the origin dir's old origin, and the
	 * origin dir becomes a clone of origin_ds.  The in-core
	 * dd_origin_txg values are updated to match.
	 */
2116	dmu_buf_will_dirty(dd->dd_dbuf, tx);
2117	ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
2118	dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
2119	dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
2120	dmu_buf_will_dirty(odd->dd_dbuf, tx);
2121	odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
2122	origin_head->ds_dir->dd_origin_txg =
2123	    origin_ds->ds_phys->ds_creation_txg;
2124
2125	/* change dd_clone entries */
2126	if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
		/* hds moves from odd's clone set to origin_origin's. */
2127		VERIFY0(zap_remove_int(dp->dp_meta_objset,
2128		    odd->dd_phys->dd_clones, hds->ds_object, tx));
2129		VERIFY0(zap_add_int(dp->dp_meta_objset,
2130		    ddpa->origin_origin->ds_dir->dd_phys->dd_clones,
2131		    hds->ds_object, tx));
2132
		/*
		 * origin_head moves from origin_origin's clone set to dd's,
		 * creating dd's clones ZAP first if it does not exist yet.
		 */
2133		VERIFY0(zap_remove_int(dp->dp_meta_objset,
2134		    ddpa->origin_origin->ds_dir->dd_phys->dd_clones,
2135		    origin_head->ds_object, tx));
2136		if (dd->dd_phys->dd_clones == 0) {
2137			dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset,
2138			    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
2139		}
2140		VERIFY0(zap_add_int(dp->dp_meta_objset,
2141		    dd->dd_phys->dd_clones, origin_head->ds_object, tx));
2142	}
2143
2144	/* move snapshots to this dir */
2145	for (snap = list_head(&ddpa->shared_snaps); snap;
2146	    snap = list_next(&ddpa->shared_snaps, snap)) {
2147		dsl_dataset_t *ds = snap->ds;
2148
2149		/*
2150		 * Property callbacks are registered to a particular
2151		 * dsl_dir.  Since ours is changing, evict the objset
2152		 * so that they will be unregistered from the old dsl_dir.
2153		 */
2154		if (ds->ds_objset) {
2155			dmu_objset_evict(ds->ds_objset);
2156			ds->ds_objset = NULL;
2157		}
2158
2159		/* move snap name entry */
2160		VERIFY0(dsl_dataset_get_snapname(ds));
2161		VERIFY0(dsl_dataset_snap_remove(origin_head,
2162		    ds->ds_snapname, tx));
2163		VERIFY0(zap_add(dp->dp_meta_objset,
2164		    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
2165		    8, 1, &ds->ds_object, tx));
2166
2167		/* change containing dsl_dir */
		/*
		 * Both the on-disk ds_dir_obj and the in-core ds_dir hold
		 * (tagged with ds itself) are switched from odd to dd.
		 */
2168		dmu_buf_will_dirty(ds->ds_dbuf, tx);
2169		ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
2170		ds->ds_phys->ds_dir_obj = dd->dd_object;
2171		ASSERT3P(ds->ds_dir, ==, odd);
2172		dsl_dir_rele(ds->ds_dir, ds);
2173		VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
2174		    NULL, ds, &ds->ds_dir));
2175
2176		/* move any clone references */
2177		if (ds->ds_phys->ds_next_clones_obj &&
2178		    spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
2179			zap_cursor_t zc;
2180			zap_attribute_t za;
2181
2182			for (zap_cursor_init(&zc, dp->dp_meta_objset,
2183			    ds->ds_phys->ds_next_clones_obj);
2184			    zap_cursor_retrieve(&zc, &za) == 0;
2185			    zap_cursor_advance(&zc)) {
2186				dsl_dataset_t *cnds;
2187				uint64_t o;
2188
2189				if (za.za_first_integer == oldnext_obj) {
2190					/*
2191					 * We've already moved the
2192					 * origin's reference.
2193					 */
2194					continue;
2195				}
2196
				/*
				 * Look up the head dataset of the dir that
				 * contains this next-clone and move that
				 * head's entry from odd's clone set to dd's.
				 */
2197				VERIFY0(dsl_dataset_hold_obj(dp,
2198				    za.za_first_integer, FTAG, &cnds));
2199				o = cnds->ds_dir->dd_phys->dd_head_dataset_obj;
2200
2201				VERIFY0(zap_remove_int(dp->dp_meta_objset,
2202				    odd->dd_phys->dd_clones, o, tx));
2203				VERIFY0(zap_add_int(dp->dp_meta_objset,
2204				    dd->dd_phys->dd_clones, o, tx));
2205				dsl_dataset_rele(cnds, FTAG);
2206			}
2207			zap_cursor_fini(&zc);
2208		}
2209
2210		ASSERT(!dsl_prop_hascb(ds));
2211	}
2212
2213	/*
2214	 * Change space accounting.
2215	 * Note, ddpa->*usedsnap and dd_used_breakdown[SNAP] will either
2216	 * both be valid, or both be 0 (resulting in delta == 0).  This
2217	 * is true for each of {clone,origin} independently.
2218	 */
2219
	/*
	 * Clone's dir: snapshot usage changes by delta (asserted >= 0), and
	 * the remainder of ddpa->used is charged to DD_USED_HEAD.
	 */
2220	delta = ddpa->cloneusedsnap -
2221	    dd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
2222	ASSERT3S(delta, >=, 0);
2223	ASSERT3U(ddpa->used, >=, delta);
2224	dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
2225	dsl_dir_diduse_space(dd, DD_USED_HEAD,
2226	    ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);
2227
	/*
	 * Origin's dir: the mirror image.  delta is asserted <= 0 here and
	 * the same used/comp/uncomp totals are subtracted.
	 */
2228	delta = ddpa->originusedsnap -
2229	    odd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
2230	ASSERT3S(delta, <=, 0);
2231	ASSERT3U(ddpa->used, >=, -delta);
2232	dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
2233	dsl_dir_diduse_space(odd, DD_USED_HEAD,
2234	    -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);
2235
	/* origin_ds's dbuf was already dirtied near the top of the function. */
2236	origin_ds->ds_phys->ds_unique_bytes = ddpa->unique;
2237
2238	/* log history record */
2239	spa_history_log_internal_ds(hds, "promote", tx, "");
2240
2241	dsl_dir_rele(odd, FTAG);
2242	promote_rele(ddpa, FTAG);
2243}
2244
2245/*
2246 * Make a list of dsl_dataset_t's for the snapshots between first_obj
2247 * (exclusive) and last_obj (inclusive).  The list will be in reverse
2248 * order (last_obj will be the list_head()).  If first_obj == 0, do all
2249 * snapshots back to this dataset's origin.
2250 */
2251static int
2252snaplist_make(dsl_pool_t *dp,
2253    uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag)
2254{
2255	uint64_t obj = last_obj;
2256
2257	list_create(l, sizeof (struct promotenode),
2258	    offsetof(struct promotenode, link));
2259
2260	while (obj != first_obj) {
2261		dsl_dataset_t *ds;
2262		struct promotenode *snap;
2263		int err;
2264
2265		err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
2266		ASSERT(err != ENOENT);
2267		if (err != 0)
2268			return (err);
2269
2270		if (first_obj == 0)
2271			first_obj = ds->ds_dir->dd_phys->dd_origin_obj;
2272
2273		snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
2274		snap->ds = ds;
2275		list_insert_tail(l, snap);
2276		obj = ds->ds_phys->ds_prev_snap_obj;
2277	}
2278
2279	return (0);
2280}
2281
2282static int
2283snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
2284{
2285	struct promotenode *snap;
2286
2287	*spacep = 0;
2288	for (snap = list_head(l); snap; snap = list_next(l, snap)) {
2289		uint64_t used, comp, uncomp;
2290		dsl_deadlist_space_range(&snap->ds->ds_deadlist,
2291		    mintxg, UINT64_MAX, &used, &comp, &uncomp);
2292		*spacep += used;
2293	}
2294	return (0);
2295}
2296
2297static void
2298snaplist_destroy(list_t *l, void *tag)
2299{
2300	struct promotenode *snap;
2301
2302	if (l == NULL || !list_link_active(&l->list_head))
2303		return;
2304
2305	while ((snap = list_tail(l)) != NULL) {
2306		list_remove(l, snap);
2307		dsl_dataset_rele(snap->ds, tag);
2308		kmem_free(snap, sizeof (*snap));
2309	}
2310	list_destroy(l);
2311}
2312
2313static int
2314promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
2315{
2316	int error;
2317	dsl_dir_t *dd;
2318	struct promotenode *snap;
2319
2320	error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,
2321	    &ddpa->ddpa_clone);
2322	if (error != 0)
2323		return (error);
2324	dd = ddpa->ddpa_clone->ds_dir;
2325
2326	if (dsl_dataset_is_snapshot(ddpa->ddpa_clone) ||
2327	    !dsl_dir_is_clone(dd)) {
2328		dsl_dataset_rele(ddpa->ddpa_clone, tag);
2329		return (SET_ERROR(EINVAL));
2330	}
2331
2332	error = snaplist_make(dp, 0, dd->dd_phys->dd_origin_obj,
2333	    &ddpa->shared_snaps, tag);
2334	if (error != 0)
2335		goto out;
2336
2337	error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,
2338	    &ddpa->clone_snaps, tag);
2339	if (error != 0)
2340		goto out;
2341
2342	snap = list_head(&ddpa->shared_snaps);
2343	ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj);
2344	error = snaplist_make(dp, dd->dd_phys->dd_origin_obj,
2345	    snap->ds->ds_dir->dd_phys->dd_head_dataset_obj,
2346	    &ddpa->origin_snaps, tag);
2347	if (error != 0)
2348		goto out;
2349
2350	if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
2351		error = dsl_dataset_hold_obj(dp,
2352		    snap->ds->ds_dir->dd_phys->dd_origin_obj,
2353		    tag, &ddpa->origin_origin);
2354		if (error != 0)
2355			goto out;
2356	}
2357out:
2358	if (error != 0)
2359		promote_rele(ddpa, tag);
2360	return (error);
2361}
2362
2363static void
2364promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag)
2365{
2366	snaplist_destroy(&ddpa->shared_snaps, tag);
2367	snaplist_destroy(&ddpa->clone_snaps, tag);
2368	snaplist_destroy(&ddpa->origin_snaps, tag);
2369	if (ddpa->origin_origin != NULL)
2370		dsl_dataset_rele(ddpa->origin_origin, tag);
2371	dsl_dataset_rele(ddpa->ddpa_clone, tag);
2372}
2373
2374/*
2375 * Promote a clone.
2376 *
2377 * If it fails due to a conflicting snapshot name, "conflsnap" will be filled
2378 * in with the name.  (It must be at least MAXNAMELEN bytes long.)
2379 */
2380int
2381dsl_dataset_promote(const char *name, char *conflsnap)
2382{
2383	dsl_dataset_promote_arg_t ddpa = { 0 };
2384	uint64_t numsnaps;
2385	int error;
2386	objset_t *os;
2387
2388	/*
2389	 * We will modify space proportional to the number of
2390	 * snapshots.  Compute numsnaps.
2391	 */
2392	error = dmu_objset_hold(name, FTAG, &os);
2393	if (error != 0)
2394		return (error);
2395	error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
2396	    dmu_objset_ds(os)->ds_phys->ds_snapnames_zapobj, &numsnaps);
2397	dmu_objset_rele(os, FTAG);
2398	if (error != 0)
2399		return (error);
2400
2401	ddpa.ddpa_clonename = name;
2402	ddpa.err_ds = conflsnap;
2403
2404	return (dsl_sync_task(name, dsl_dataset_promote_check,
2405	    dsl_dataset_promote_sync, &ddpa, 2 + numsnaps));
2406}
2407
2408int
2409dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
2410    dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
2411{
2412	int64_t unused_refres_delta;
2413
2414	/* they should both be heads */
2415	if (dsl_dataset_is_snapshot(clone) ||
2416	    dsl_dataset_is_snapshot(origin_head))
2417		return (SET_ERROR(EINVAL));
2418
2419	/* if we are not forcing, the branch point should be just before them */
2420	if (!force && clone->ds_prev != origin_head->ds_prev)
2421		return (SET_ERROR(EINVAL));
2422
2423	/* clone should be the clone (unless they are unrelated) */
2424	if (clone->ds_prev != NULL &&
2425	    clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
2426	    origin_head->ds_dir != clone->ds_prev->ds_dir)
2427		return (SET_ERROR(EINVAL));
2428
2429	/* the clone should be a child of the origin */
2430	if (clone->ds_dir->dd_parent != origin_head->ds_dir)
2431		return (SET_ERROR(EINVAL));
2432
2433	/* origin_head shouldn't be modified unless 'force' */
2434	if (!force &&
2435	    dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
2436		return (SET_ERROR(ETXTBSY));
2437
2438	/* origin_head should have no long holds (e.g. is not mounted) */
2439	if (dsl_dataset_handoff_check(origin_head, owner, tx))
2440		return (SET_ERROR(EBUSY));
2441
2442	/* check amount of any unconsumed refreservation */
2443	unused_refres_delta =
2444	    (int64_t)MIN(origin_head->ds_reserved,
2445	    origin_head->ds_phys->ds_unique_bytes) -
2446	    (int64_t)MIN(origin_head->ds_reserved,
2447	    clone->ds_phys->ds_unique_bytes);
2448
2449	if (unused_refres_delta > 0 &&
2450	    unused_refres_delta >
2451	    dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
2452		return (SET_ERROR(ENOSPC));
2453
2454	/* clone can't be over the head's refquota */
2455	if (origin_head->ds_quota != 0 &&
2456	    clone->ds_phys->ds_referenced_bytes > origin_head->ds_quota)
2457		return (SET_ERROR(EDQUOT));
2458
2459	return (0);
2460}
2461
/*
 * Sync-context implementation of swapping the contents of 'clone' and
 * 'origin_head'.
 *
 * After evicting both objsets, the function swaps the two datasets' root
 * block pointers (ds_bp), their ds_*_bytes statistics and their deadlist
 * objects, and adjusts dsl_dir space accounting (DD_USED_HEAD,
 * DD_USED_SNAP, DD_USED_REFRSRV) to match.  If the clone has a previous
 * snapshot, that snapshot's ds_unique_bytes is recomputed from the clone's
 * deadlist.  The scan code is notified through dsl_scan_ds_clone_swapped()
 * and a "clone swap" history record is logged.
 *
 * Asserted preconditions: the clone has no refreservation, the clone's
 * unique bytes do not exceed origin_head's refquota (when one is set), and
 * both datasets have the same ds_prev.
 */
2462void
2463dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
2464    dsl_dataset_t *origin_head, dmu_tx_t *tx)
2465{
2466	dsl_pool_t *dp = dmu_tx_pool(tx);
2467	int64_t unused_refres_delta;
2468
2469	ASSERT(clone->ds_reserved == 0);
2470	ASSERT(origin_head->ds_quota == 0 ||
2471	    clone->ds_phys->ds_unique_bytes <= origin_head->ds_quota);
2472	ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);
2473
2474	dmu_buf_will_dirty(clone->ds_dbuf, tx);
2475	dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
2476
	/* Drop any cached objsets for the two datasets before swapping. */
2477	if (clone->ds_objset != NULL) {
2478		dmu_objset_evict(clone->ds_objset);
2479		clone->ds_objset = NULL;
2480	}
2481
2482	if (origin_head->ds_objset != NULL) {
2483		dmu_objset_evict(origin_head->ds_objset);
2484		origin_head->ds_objset = NULL;
2485	}
2486
	/*
	 * Computed here, while ds_unique_bytes still holds the pre-swap
	 * values; it is applied to DD_USED_REFRSRV after the swap below.
	 */
2487	unused_refres_delta =
2488	    (int64_t)MIN(origin_head->ds_reserved,
2489	    origin_head->ds_phys->ds_unique_bytes) -
2490	    (int64_t)MIN(origin_head->ds_reserved,
2491	    clone->ds_phys->ds_unique_bytes);
2492
2493	/*
2494	 * Reset origin's unique bytes, if it exists.
2495	 */
2496	if (clone->ds_prev) {
2497		dsl_dataset_t *origin = clone->ds_prev;
2498		uint64_t comp, uncomp;
2499
		/*
		 * The "used" result is written straight into the origin's
		 * ds_unique_bytes; comp and uncomp are discarded.
		 */
2500		dmu_buf_will_dirty(origin->ds_dbuf, tx);
2501		dsl_deadlist_space_range(&clone->ds_deadlist,
2502		    origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
2503		    &origin->ds_phys->ds_unique_bytes, &comp, &uncomp);
2504	}
2505
2506	/* swap blkptrs */
2507	{
2508		blkptr_t tmp;
2509		tmp = origin_head->ds_phys->ds_bp;
2510		origin_head->ds_phys->ds_bp = clone->ds_phys->ds_bp;
2511		clone->ds_phys->ds_bp = tmp;
2512	}
2513
2514	/* set dd_*_bytes */
2515	{
2516		int64_t dused, dcomp, duncomp;
2517		uint64_t cdl_used, cdl_comp, cdl_uncomp;
2518		uint64_t odl_used, odl_comp, odl_uncomp;
2519
2520		ASSERT3U(clone->ds_dir->dd_phys->
2521		    dd_used_breakdown[DD_USED_SNAP], ==, 0);
2522
2523		dsl_deadlist_space(&clone->ds_deadlist,
2524		    &cdl_used, &cdl_comp, &cdl_uncomp);
2525		dsl_deadlist_space(&origin_head->ds_deadlist,
2526		    &odl_used, &odl_comp, &odl_uncomp);
2527
		/*
		 * Each delta is (clone's referenced + deadlist space) minus
		 * (origin_head's referenced + deadlist space).  It is added
		 * to origin_head's dir and subtracted from the clone's dir.
		 */
2528		dused = clone->ds_phys->ds_referenced_bytes + cdl_used -
2529		    (origin_head->ds_phys->ds_referenced_bytes + odl_used);
2530		dcomp = clone->ds_phys->ds_compressed_bytes + cdl_comp -
2531		    (origin_head->ds_phys->ds_compressed_bytes + odl_comp);
2532		duncomp = clone->ds_phys->ds_uncompressed_bytes +
2533		    cdl_uncomp -
2534		    (origin_head->ds_phys->ds_uncompressed_bytes + odl_uncomp);
2535
2536		dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
2537		    dused, dcomp, duncomp, tx);
2538		dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
2539		    -dused, -dcomp, -duncomp, tx);
2540
2541		/*
2542		 * The difference in the space used by snapshots is the
2543		 * difference in snapshot space due to the head's
2544		 * deadlist (since that's the only thing that's
2545		 * changing that affects the snapused).
2546		 */
		/* The cdl_ and odl_ variables are reused for the ranged totals. */
2547		dsl_deadlist_space_range(&clone->ds_deadlist,
2548		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
2549		    &cdl_used, &cdl_comp, &cdl_uncomp);
2550		dsl_deadlist_space_range(&origin_head->ds_deadlist,
2551		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
2552		    &odl_used, &odl_comp, &odl_uncomp);
2553		dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
2554		    DD_USED_HEAD, DD_USED_SNAP, tx);
2555	}
2556
2557	/* swap ds_*_bytes */
2558	SWITCH64(origin_head->ds_phys->ds_referenced_bytes,
2559	    clone->ds_phys->ds_referenced_bytes);
2560	SWITCH64(origin_head->ds_phys->ds_compressed_bytes,
2561	    clone->ds_phys->ds_compressed_bytes);
2562	SWITCH64(origin_head->ds_phys->ds_uncompressed_bytes,
2563	    clone->ds_phys->ds_uncompressed_bytes);
2564	SWITCH64(origin_head->ds_phys->ds_unique_bytes,
2565	    clone->ds_phys->ds_unique_bytes);
2566
2567	/* apply any parent delta for change in unconsumed refreservation */
2568	dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
2569	    unused_refres_delta, 0, 0, tx);
2570
2571	/*
2572	 * Swap deadlists.
2573	 */
	/*
	 * Both in-core deadlists are closed, the on-disk object numbers are
	 * exchanged, and each deadlist is reopened on its new object.
	 */
2574	dsl_deadlist_close(&clone->ds_deadlist);
2575	dsl_deadlist_close(&origin_head->ds_deadlist);
2576	SWITCH64(origin_head->ds_phys->ds_deadlist_obj,
2577	    clone->ds_phys->ds_deadlist_obj);
2578	dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
2579	    clone->ds_phys->ds_deadlist_obj);
2580	dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
2581	    origin_head->ds_phys->ds_deadlist_obj);
2582
2583	dsl_scan_ds_clone_swapped(origin_head, clone, tx);
2584
2585	spa_history_log_internal_ds(clone, "clone swap", tx,
2586	    "parent=%s", origin_head->ds_dir->dd_myname);
2587}
2588
2589/*
2590 * Given a pool name and a dataset object number in that pool,
2591 * return the name of that dataset.
2592 */
2593int
2594dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
2595{
2596	dsl_pool_t *dp;
2597	dsl_dataset_t *ds;
2598	int error;
2599
2600	error = dsl_pool_hold(pname, FTAG, &dp);
2601	if (error != 0)
2602		return (error);
2603
2604	error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
2605	if (error == 0) {
2606		dsl_dataset_name(ds, buf);
2607		dsl_dataset_rele(ds, FTAG);
2608	}
2609	dsl_pool_rele(dp, FTAG);
2610
2611	return (error);
2612}
2613
2614int
2615dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
2616    uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
2617{
2618	int error = 0;
2619
2620	ASSERT3S(asize, >, 0);
2621
2622	/*
2623	 * *ref_rsrv is the portion of asize that will come from any
2624	 * unconsumed refreservation space.
2625	 */
2626	*ref_rsrv = 0;
2627
2628	mutex_enter(&ds->ds_lock);
2629	/*
2630	 * Make a space adjustment for reserved bytes.
2631	 */
2632	if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
2633		ASSERT3U(*used, >=,
2634		    ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
2635		*used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
2636		*ref_rsrv =
2637		    asize - MIN(asize, parent_delta(ds, asize + inflight));
2638	}
2639
2640	if (!check_quota || ds->ds_quota == 0) {
2641		mutex_exit(&ds->ds_lock);
2642		return (0);
2643	}
2644	/*
2645	 * If they are requesting more space, and our current estimate
2646	 * is over quota, they get to try again unless the actual
2647	 * on-disk is over quota and there are no pending changes (which
2648	 * may free up space for us).
2649	 */
2650	if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) {
2651		if (inflight > 0 ||
2652		    ds->ds_phys->ds_referenced_bytes < ds->ds_quota)
2653			error = SET_ERROR(ERESTART);
2654		else
2655			error = SET_ERROR(EDQUOT);
2656	}
2657	mutex_exit(&ds->ds_lock);
2658
2659	return (error);
2660}
2661
/*
 * Argument bundle passed to the refquota and refreservation sync tasks
 * (their check and sync callbacks both receive a pointer to it).
 */
2662typedef struct dsl_dataset_set_qr_arg {
	/* name of the dataset; the callbacks pass it to dsl_dataset_hold() */
2663	const char *ddsqra_name;
	/* property source forwarded to dsl_prop_predict() and the set call */
2664	zprop_source_t ddsqra_source;
	/* requested property value */
2665	uint64_t ddsqra_value;
2666} dsl_dataset_set_qr_arg_t;
2667
2668
2669/* ARGSUSED */
2670static int
2671dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
2672{
2673	dsl_dataset_set_qr_arg_t *ddsqra = arg;
2674	dsl_pool_t *dp = dmu_tx_pool(tx);
2675	dsl_dataset_t *ds;
2676	int error;
2677	uint64_t newval;
2678
2679	if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA)
2680		return (SET_ERROR(ENOTSUP));
2681
2682	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
2683	if (error != 0)
2684		return (error);
2685
2686	if (dsl_dataset_is_snapshot(ds)) {
2687		dsl_dataset_rele(ds, FTAG);
2688		return (SET_ERROR(EINVAL));
2689	}
2690
2691	error = dsl_prop_predict(ds->ds_dir,
2692	    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
2693	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
2694	if (error != 0) {
2695		dsl_dataset_rele(ds, FTAG);
2696		return (error);
2697	}
2698
2699	if (newval == 0) {
2700		dsl_dataset_rele(ds, FTAG);
2701		return (0);
2702	}
2703
2704	if (newval < ds->ds_phys->ds_referenced_bytes ||
2705	    newval < ds->ds_reserved) {
2706		dsl_dataset_rele(ds, FTAG);
2707		return (SET_ERROR(ENOSPC));
2708	}
2709
2710	dsl_dataset_rele(ds, FTAG);
2711	return (0);
2712}
2713
2714static void
2715dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx)
2716{
2717	dsl_dataset_set_qr_arg_t *ddsqra = arg;
2718	dsl_pool_t *dp = dmu_tx_pool(tx);
2719	dsl_dataset_t *ds;
2720	uint64_t newval;
2721
2722	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
2723
2724	dsl_prop_set_sync_impl(ds,
2725	    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
2726	    ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
2727	    &ddsqra->ddsqra_value, tx);
2728
2729	VERIFY0(dsl_prop_get_int_ds(ds,
2730	    zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval));
2731
2732	if (ds->ds_quota != newval) {
2733		dmu_buf_will_dirty(ds->ds_dbuf, tx);
2734		ds->ds_quota = newval;
2735	}
2736	dsl_dataset_rele(ds, FTAG);
2737}
2738
2739int
2740dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
2741    uint64_t refquota)
2742{
2743	dsl_dataset_set_qr_arg_t ddsqra;
2744
2745	ddsqra.ddsqra_name = dsname;
2746	ddsqra.ddsqra_source = source;
2747	ddsqra.ddsqra_value = refquota;
2748
2749	return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
2750	    dsl_dataset_set_refquota_sync, &ddsqra, 0));
2751}
2752
2753static int
2754dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
2755{
2756	dsl_dataset_set_qr_arg_t *ddsqra = arg;
2757	dsl_pool_t *dp = dmu_tx_pool(tx);
2758	dsl_dataset_t *ds;
2759	int error;
2760	uint64_t newval, unique;
2761
2762	if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION)
2763		return (SET_ERROR(ENOTSUP));
2764
2765	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
2766	if (error != 0)
2767		return (error);
2768
2769	if (dsl_dataset_is_snapshot(ds)) {
2770		dsl_dataset_rele(ds, FTAG);
2771		return (SET_ERROR(EINVAL));
2772	}
2773
2774	error = dsl_prop_predict(ds->ds_dir,
2775	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
2776	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
2777	if (error != 0) {
2778		dsl_dataset_rele(ds, FTAG);
2779		return (error);
2780	}
2781
2782	/*
2783	 * If we are doing the preliminary check in open context, the
2784	 * space estimates may be inaccurate.
2785	 */
2786	if (!dmu_tx_is_syncing(tx)) {
2787		dsl_dataset_rele(ds, FTAG);
2788		return (0);
2789	}
2790
2791	mutex_enter(&ds->ds_lock);
2792	if (!DS_UNIQUE_IS_ACCURATE(ds))
2793		dsl_dataset_recalc_head_uniq(ds);
2794	unique = ds->ds_phys->ds_unique_bytes;
2795	mutex_exit(&ds->ds_lock);
2796
2797	if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
2798		uint64_t delta = MAX(unique, newval) -
2799		    MAX(unique, ds->ds_reserved);
2800
2801		if (delta >
2802		    dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||
2803		    (ds->ds_quota > 0 && newval > ds->ds_quota)) {
2804			dsl_dataset_rele(ds, FTAG);
2805			return (SET_ERROR(ENOSPC));
2806		}
2807	}
2808
2809	dsl_dataset_rele(ds, FTAG);
2810	return (0);
2811}
2812
2813void
2814dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
2815    zprop_source_t source, uint64_t value, dmu_tx_t *tx)
2816{
2817	uint64_t newval;
2818	uint64_t unique;
2819	int64_t delta;
2820
2821	dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
2822	    source, sizeof (value), 1, &value, tx);
2823
2824	VERIFY0(dsl_prop_get_int_ds(ds,
2825	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));
2826
2827	dmu_buf_will_dirty(ds->ds_dbuf, tx);
2828	mutex_enter(&ds->ds_dir->dd_lock);
2829	mutex_enter(&ds->ds_lock);
2830	ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
2831	unique = ds->ds_phys->ds_unique_bytes;
2832	delta = MAX(0, (int64_t)(newval - unique)) -
2833	    MAX(0, (int64_t)(ds->ds_reserved - unique));
2834	ds->ds_reserved = newval;
2835	mutex_exit(&ds->ds_lock);
2836
2837	dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
2838	mutex_exit(&ds->ds_dir->dd_lock);
2839}
2840
2841static void
2842dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)
2843{
2844	dsl_dataset_set_qr_arg_t *ddsqra = arg;
2845	dsl_pool_t *dp = dmu_tx_pool(tx);
2846	dsl_dataset_t *ds;
2847
2848	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
2849	dsl_dataset_set_refreservation_sync_impl(ds,
2850	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx);
2851	dsl_dataset_rele(ds, FTAG);
2852}
2853
2854int
2855dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
2856    uint64_t refreservation)
2857{
2858	dsl_dataset_set_qr_arg_t ddsqra;
2859
2860	ddsqra.ddsqra_name = dsname;
2861	ddsqra.ddsqra_source = source;
2862	ddsqra.ddsqra_value = refreservation;
2863
2864	return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
2865	    dsl_dataset_set_refreservation_sync, &ddsqra, 0));
2866}
2867
2868/*
2869 * Return (in *usedp) the amount of space written in new that is not
2870 * present in oldsnap.  New may be a snapshot or the head.  Old must be
2871 * a snapshot before new, in new's filesystem (or its origin).  If not then
2872 * fail and return EINVAL.
2873 *
2874 * The written space is calculated by considering two components:  First, we
2875 * ignore any freed space, and calculate the written as new's used space
2876 * minus old's used space.  Next, we add in the amount of space that was freed
2877 * between the two snapshots, thus reducing new's used space relative to old's.
2878 * Specifically, this is the space that was born before old->ds_creation_txg,
2879 * and freed before new (ie. on new's deadlist or a previous deadlist).
2880 *
2881 * space freed                         [---------------------]
2882 * snapshots                       ---O-------O--------O-------O------
2883 *                                         oldsnap            new
2884 */
2885int
2886dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
2887    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
2888{
2889	int err = 0;
2890	uint64_t snapobj;
2891	dsl_pool_t *dp = new->ds_dir->dd_pool;
2892
2893	ASSERT(dsl_pool_config_held(dp));
2894
2895	*usedp = 0;
2896	*usedp += new->ds_phys->ds_referenced_bytes;
2897	*usedp -= oldsnap->ds_phys->ds_referenced_bytes;
2898
2899	*compp = 0;
2900	*compp += new->ds_phys->ds_compressed_bytes;
2901	*compp -= oldsnap->ds_phys->ds_compressed_bytes;
2902
2903	*uncompp = 0;
2904	*uncompp += new->ds_phys->ds_uncompressed_bytes;
2905	*uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes;
2906
2907	snapobj = new->ds_object;
2908	while (snapobj != oldsnap->ds_object) {
2909		dsl_dataset_t *snap;
2910		uint64_t used, comp, uncomp;
2911
2912		if (snapobj == new->ds_object) {
2913			snap = new;
2914		} else {
2915			err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
2916			if (err != 0)
2917				break;
2918		}
2919
2920		if (snap->ds_phys->ds_prev_snap_txg ==
2921		    oldsnap->ds_phys->ds_creation_txg) {
2922			/*
2923			 * The blocks in the deadlist can not be born after
2924			 * ds_prev_snap_txg, so get the whole deadlist space,
2925			 * which is more efficient (especially for old-format
2926			 * deadlists).  Unfortunately the deadlist code
2927			 * doesn't have enough information to make this
2928			 * optimization itself.
2929			 */
2930			dsl_deadlist_space(&snap->ds_deadlist,
2931			    &used, &comp, &uncomp);
2932		} else {
2933			dsl_deadlist_space_range(&snap->ds_deadlist,
2934			    0, oldsnap->ds_phys->ds_creation_txg,
2935			    &used, &comp, &uncomp);
2936		}
2937		*usedp += used;
2938		*compp += comp;
2939		*uncompp += uncomp;
2940
2941		/*
2942		 * If we get to the beginning of the chain of snapshots
2943		 * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
2944		 * was not a snapshot of/before new.
2945		 */
2946		snapobj = snap->ds_phys->ds_prev_snap_obj;
2947		if (snap != new)
2948			dsl_dataset_rele(snap, FTAG);
2949		if (snapobj == 0) {
2950			err = SET_ERROR(EINVAL);
2951			break;
2952		}
2953
2954	}
2955	return (err);
2956}
2957
2958/*
2959 * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
2960 * lastsnap, and all snapshots in between are deleted.
2961 *
2962 * blocks that would be freed            [---------------------------]
2963 * snapshots                       ---O-------O--------O-------O--------O
2964 *                                        firstsnap        lastsnap
2965 *
2966 * This is the set of blocks that were born after the snap before firstsnap,
2967 * (birth > firstsnap->prev_snap_txg) and died before the snap after the
2968 * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
2969 * We calculate this by iterating over the relevant deadlists (from the snap
2970 * after lastsnap, backward to the snap after firstsnap), summing up the
2971 * space on the deadlist that was born after the snap before firstsnap.
2972 */
2973int
2974dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
2975    dsl_dataset_t *lastsnap,
2976    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
2977{
2978	int err = 0;
2979	uint64_t snapobj;
2980	dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
2981
2982	ASSERT(dsl_dataset_is_snapshot(firstsnap));
2983	ASSERT(dsl_dataset_is_snapshot(lastsnap));
2984
2985	/*
2986	 * Check that the snapshots are in the same dsl_dir, and firstsnap
2987	 * is before lastsnap.
2988	 */
2989	if (firstsnap->ds_dir != lastsnap->ds_dir ||
2990	    firstsnap->ds_phys->ds_creation_txg >
2991	    lastsnap->ds_phys->ds_creation_txg)
2992		return (SET_ERROR(EINVAL));
2993
2994	*usedp = *compp = *uncompp = 0;
2995
2996	snapobj = lastsnap->ds_phys->ds_next_snap_obj;
2997	while (snapobj != firstsnap->ds_object) {
2998		dsl_dataset_t *ds;
2999		uint64_t used, comp, uncomp;
3000
3001		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
3002		if (err != 0)
3003			break;
3004
3005		dsl_deadlist_space_range(&ds->ds_deadlist,
3006		    firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX,
3007		    &used, &comp, &uncomp);
3008		*usedp += used;
3009		*compp += comp;
3010		*uncompp += uncomp;
3011
3012		snapobj = ds->ds_phys->ds_prev_snap_obj;
3013		ASSERT3U(snapobj, !=, 0);
3014		dsl_dataset_rele(ds, FTAG);
3015	}
3016	return (err);
3017}
3018
3019/*
3020 * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
3021 * For example, they could both be snapshots of the same filesystem, and
3022 * 'earlier' is before 'later'.  Or 'earlier' could be the origin of
3023 * 'later's filesystem.  Or 'earlier' could be an older snapshot in the origin's
3024 * filesystem.  Or 'earlier' could be the origin's origin.
3025 */
3026boolean_t
3027dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier)
3028{
3029	dsl_pool_t *dp = later->ds_dir->dd_pool;
3030	int error;
3031	boolean_t ret;
3032
3033	ASSERT(dsl_pool_config_held(dp));
3034
3035	if (earlier->ds_phys->ds_creation_txg >=
3036	    later->ds_phys->ds_creation_txg)
3037		return (B_FALSE);
3038
3039	if (later->ds_dir == earlier->ds_dir)
3040		return (B_TRUE);
3041	if (!dsl_dir_is_clone(later->ds_dir))
3042		return (B_FALSE);
3043
3044	if (later->ds_dir->dd_phys->dd_origin_obj == earlier->ds_object)
3045		return (B_TRUE);
3046	dsl_dataset_t *origin;
3047	error = dsl_dataset_hold_obj(dp,
3048	    later->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin);
3049	if (error != 0)
3050		return (B_FALSE);
3051	ret = dsl_dataset_is_before(origin, earlier);
3052	dsl_dataset_rele(origin, FTAG);
3053	return (ret);
3054}
3055