/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright (c) 2013 Steven Hartland. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dsl_userhold.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_synctask.h>
#include <sys/dmu_tx.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_scan.h>
#include <sys/dmu_objset.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_deleg.h>

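/*
 * Arguments shared by dsl_destroy_snapshot_check() and
 * dsl_destroy_snapshot_sync().  dsda_snaps is the caller-supplied set of
 * snapshot names; the names that pass the check are accumulated in
 * dsda_successful_snaps and are the only ones the sync task destroys;
 * per-snapshot failures are recorded in dsda_errlist.
 */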
typedef struct dmu_snapshots_destroy_arg {
	nvlist_t *dsda_snaps;
	nvlist_t *dsda_successful_snaps;
	boolean_t dsda_defer;
	nvlist_t *dsda_errlist;
} dmu_snapshots_destroy_arg_t;

int
dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
{
	if (!dsl_dataset_is_snapshot(ds))
		return (SET_ERROR(EINVAL));

	if (dsl_dataset_long_held(ds))
		return (SET_ERROR(EBUSY));

	/*
	 * Only allow deferred destroy on pools that support it.
	 * NOTE: deferred destroy is only supported on snapshots.
	 */
	if (defer) {
		if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
		    SPA_VERSION_USERREFS)
			return (SET_ERROR(ENOTSUP));
		return (0);
	}

	/*
	 * If this snapshot has an elevated user reference count,
	 * we can't destroy it yet.
	 */
	if (ds->ds_userrefs > 0)
		return (SET_ERROR(EBUSY));

	/*
	 * Can't delete a branch point.
	 */
	if (ds->ds_phys->ds_num_children > 1)
		return (SET_ERROR(EEXIST));

	return (0);
}

static int
dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx)
{
	dmu_snapshots_destroy_arg_t *dsda = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	nvpair_t *pair;
	int error = 0;

	if (!dmu_tx_is_syncing(tx))
		return (0);

	for (pair = nvlist_next_nvpair(dsda->dsda_snaps, NULL);
	    pair != NULL; pair = nvlist_next_nvpair(dsda->dsda_snaps, pair)) {
		dsl_dataset_t *ds;

		error = dsl_dataset_hold(dp, nvpair_name(pair),
		    FTAG, &ds);

		/*
		 * If the snapshot does not exist, silently ignore it
		 * (it's "already destroyed").
		 */
		if (error == ENOENT)
			continue;

		if (error == 0) {
			error = dsl_destroy_snapshot_check_impl(ds,
			    dsda->dsda_defer);
			dsl_dataset_rele(ds, FTAG);
		}

		if (error == 0) {
			fnvlist_add_boolean(dsda->dsda_successful_snaps,
			    nvpair_name(pair));
		} else {
			fnvlist_add_int32(dsda->dsda_errlist,
			    nvpair_name(pair), error);
		}
	}

	pair = nvlist_next_nvpair(dsda->dsda_errlist, NULL);
	if (pair != NULL)
		return (fnvpair_value_int32(pair));

	return (0);
}

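/*
 * State shared with process_old_cb() while migrating an old-format
 * deadlist: entries born no later than ds's previous snapshot remain
 * dead and are inserted into ds's deadlist (crediting ds_prev's unique
 * bytes where appropriate); younger entries are actually freed through
 * the root zio pio, with the freed space tallied in used/comp/uncomp.
 */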
struct process_old_arg {
	dsl_dataset_t *ds;
	dsl_dataset_t *ds_prev;
	boolean_t after_branch_point;
	zio_t *pio;
	uint64_t used, comp, uncomp;
};

static int
process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	struct process_old_arg *poa = arg;
	dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;

	if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
		dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
		if (poa->ds_prev && !poa->after_branch_point &&
		    bp->blk_birth >
		    poa->ds_prev->ds_phys->ds_prev_snap_txg) {
			poa->ds_prev->ds_phys->ds_unique_bytes +=
			    bp_get_dsize_sync(dp->dp_spa, bp);
		}
	} else {
		poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
		poa->comp += BP_GET_PSIZE(bp);
		poa->uncomp += BP_GET_UCSIZE(bp);
		dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
	}
	return (0);
}

static void
process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
    dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
{
	struct process_old_arg poa = { 0 };
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	uint64_t deadlist_obj;

	ASSERT(ds->ds_deadlist.dl_oldfmt);
	ASSERT(ds_next->ds_deadlist.dl_oldfmt);

	poa.ds = ds;
	poa.ds_prev = ds_prev;
	poa.after_branch_point = after_branch_point;
	poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
	    process_old_cb, &poa, tx));
	VERIFY0(zio_wait(poa.pio));
	ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);

	/* change snapused */
	dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
	    -poa.used, -poa.comp, -poa.uncomp, tx);

	/* swap next's deadlist to our deadlist */
	dsl_deadlist_close(&ds->ds_deadlist);
	dsl_deadlist_close(&ds_next->ds_deadlist);
	deadlist_obj = ds->ds_phys->ds_deadlist_obj;
	ds->ds_phys->ds_deadlist_obj = ds_next->ds_phys->ds_deadlist_obj;
	ds_next->ds_phys->ds_deadlist_obj = deadlist_obj;
	dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
	dsl_deadlist_open(&ds_next->ds_deadlist, mos,
	    ds_next->ds_phys->ds_deadlist_obj);
}

static void
dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;

	/*
	 * If it is the old version, dd_clones doesn't exist so we can't
	 * find the clones, but dsl_deadlist_remove_key() is a no-op so it
	 * doesn't matter.
	 */
	if (ds->ds_dir->dd_phys->dd_clones == 0)
		return;

	for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		dsl_dataset_t *clone;

		VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
		    za.za_first_integer, FTAG, &clone));
		if (clone->ds_dir->dd_origin_txg > mintxg) {
			dsl_deadlist_remove_key(&clone->ds_deadlist,
			    mintxg, tx);
			dsl_dataset_remove_clones_key(clone, mintxg, tx);
		}
		dsl_dataset_rele(clone, FTAG);
	}
	zap_cursor_fini(&zc);
}

void
dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
{
	int err;
	int after_branch_point = FALSE;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	dsl_dataset_t *ds_prev = NULL;
	uint64_t obj;

	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
	ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
	ASSERT(refcount_is_zero(&ds->ds_longholds));

	if (defer &&
	    (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)) {
		ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
		spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
		return;
	}

	ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);

	/* We need to log before removing it from the namespace. */
	spa_history_log_internal_ds(ds, "destroy", tx, "");

	dsl_scan_ds_destroyed(ds, tx);

	obj = ds->ds_object;

	if (ds->ds_phys->ds_prev_snap_obj != 0) {
		ASSERT3P(ds->ds_prev, ==, NULL);
		VERIFY0(dsl_dataset_hold_obj(dp,
		    ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
		after_branch_point =
		    (ds_prev->ds_phys->ds_next_snap_obj != obj);

		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
		if (after_branch_point &&
		    ds_prev->ds_phys->ds_next_clones_obj != 0) {
			dsl_dataset_remove_from_next_clones(ds_prev, obj, tx);
			if (ds->ds_phys->ds_next_snap_obj != 0) {
				VERIFY0(zap_add_int(mos,
				    ds_prev->ds_phys->ds_next_clones_obj,
				    ds->ds_phys->ds_next_snap_obj, tx));
			}
		}
		if (!after_branch_point) {
			ds_prev->ds_phys->ds_next_snap_obj =
			    ds->ds_phys->ds_next_snap_obj;
		}
	}

	dsl_dataset_t *ds_next;
	uint64_t old_unique;
	uint64_t used = 0, comp = 0, uncomp = 0;

	VERIFY0(dsl_dataset_hold_obj(dp,
	    ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
	ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);

	old_unique = ds_next->ds_phys->ds_unique_bytes;

	dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
	ds_next->ds_phys->ds_prev_snap_obj =
	    ds->ds_phys->ds_prev_snap_obj;
	ds_next->ds_phys->ds_prev_snap_txg =
	    ds->ds_phys->ds_prev_snap_txg;
	ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
	    ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);

	if (ds_next->ds_deadlist.dl_oldfmt) {
		process_old_deadlist(ds, ds_prev, ds_next,
		    after_branch_point, tx);
	} else {
		/* Adjust prev's unique space. */
		if (ds_prev && !after_branch_point) {
			dsl_deadlist_space_range(&ds_next->ds_deadlist,
			    ds_prev->ds_phys->ds_prev_snap_txg,
			    ds->ds_phys->ds_prev_snap_txg,
			    &used, &comp, &uncomp);
			ds_prev->ds_phys->ds_unique_bytes += used;
		}

		/* Adjust snapused. */
		dsl_deadlist_space_range(&ds_next->ds_deadlist,
		    ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
		    &used, &comp, &uncomp);
		dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
		    -used, -comp, -uncomp, tx);

		/* Move blocks to be freed to pool's free list. */
		dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
		    &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
		    tx);
		dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
		    DD_USED_HEAD, used, comp, uncomp, tx);

		/* Merge our deadlist into next's and free it. */
		dsl_deadlist_merge(&ds_next->ds_deadlist,
		    ds->ds_phys->ds_deadlist_obj, tx);
	}
	dsl_deadlist_close(&ds->ds_deadlist);
	dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_deadlist_obj = 0;

	/* Collapse range in clone heads */
	dsl_dataset_remove_clones_key(ds,
	    ds->ds_phys->ds_creation_txg, tx);

	if (dsl_dataset_is_snapshot(ds_next)) {
		dsl_dataset_t *ds_nextnext;

		/*
		 * Update next's unique to include blocks which
		 * were previously shared by only this snapshot
		 * and it.  Those blocks will be born after the
		 * prev snap and before this snap, and will have
		 * died after the next snap and before the one
		 * after that (i.e. be on the snap after next's
		 * deadlist).
		 */
		VERIFY0(dsl_dataset_hold_obj(dp,
		    ds_next->ds_phys->ds_next_snap_obj, FTAG, &ds_nextnext));
		dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
		    ds->ds_phys->ds_prev_snap_txg,
		    ds->ds_phys->ds_creation_txg,
		    &used, &comp, &uncomp);
		ds_next->ds_phys->ds_unique_bytes += used;
		dsl_dataset_rele(ds_nextnext, FTAG);
		ASSERT3P(ds_next->ds_prev, ==, NULL);

		/* Collapse range in this head. */
		dsl_dataset_t *hds;
		VERIFY0(dsl_dataset_hold_obj(dp,
		    ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &hds));
		dsl_deadlist_remove_key(&hds->ds_deadlist,
		    ds->ds_phys->ds_creation_txg, tx);
		dsl_dataset_rele(hds, FTAG);

	} else {
		ASSERT3P(ds_next->ds_prev, ==, ds);
		dsl_dataset_rele(ds_next->ds_prev, ds_next);
		ds_next->ds_prev = NULL;
		if (ds_prev) {
			VERIFY0(dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj,
			    ds_next, &ds_next->ds_prev));
		}

		dsl_dataset_recalc_head_uniq(ds_next);

		/*
		 * Reduce the amount of our unconsumed refreservation
		 * being charged to our parent by the amount of
		 * new unique data we have gained.
		 */
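		/*
		 * Hypothetical example: with old_unique = 10M,
		 * ds_reserved = 30M, and new_unique = 25M, mrsdelta =
		 * MIN(25M - 10M, 30M - 10M) = 15M of refreservation
		 * charge is returned to the parent.
		 */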
		if (old_unique < ds_next->ds_reserved) {
			int64_t mrsdelta;
			uint64_t new_unique =
			    ds_next->ds_phys->ds_unique_bytes;

			ASSERT(old_unique <= new_unique);
			mrsdelta = MIN(new_unique - old_unique,
			    ds_next->ds_reserved - old_unique);
			dsl_dir_diduse_space(ds->ds_dir,
			    DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
		}
	}
	dsl_dataset_rele(ds_next, FTAG);

	/*
	 * This must be done after the traverse_dataset(), because it will
	 * re-open the objset.
	 */
	if (ds->ds_objset) {
		dmu_objset_evict(ds->ds_objset);
		ds->ds_objset = NULL;
	}

	/* remove from snapshot namespace */
	dsl_dataset_t *ds_head;
	ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
	VERIFY0(dsl_dataset_hold_obj(dp,
	    ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
	VERIFY0(dsl_dataset_get_snapname(ds));
#ifdef ZFS_DEBUG
	{
		uint64_t val;

		err = dsl_dataset_snap_lookup(ds_head,
		    ds->ds_snapname, &val);
		ASSERT0(err);
		ASSERT3U(val, ==, obj);
	}
#endif
	VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx));
	dsl_dataset_rele(ds_head, FTAG);

	if (ds_prev != NULL)
		dsl_dataset_rele(ds_prev, FTAG);

	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);

	if (ds->ds_phys->ds_next_clones_obj != 0) {
		uint64_t count;
		VERIFY0(zap_count(mos,
		    ds->ds_phys->ds_next_clones_obj, &count));
		ASSERT0(count);
		VERIFY0(dmu_object_free(mos,
		    ds->ds_phys->ds_next_clones_obj, tx));
	}
	if (ds->ds_phys->ds_props_obj != 0)
		VERIFY0(zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
	if (ds->ds_phys->ds_userrefs_obj != 0)
		VERIFY0(zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
	dsl_dir_rele(ds->ds_dir, ds);
	ds->ds_dir = NULL;
	VERIFY0(dmu_object_free(mos, obj, tx));
}

static void
dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx)
{
	dmu_snapshots_destroy_arg_t *dsda = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	nvpair_t *pair;

	for (pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, NULL);
	    pair != NULL;
	    pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, pair)) {
		dsl_dataset_t *ds;

		VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds));

		dsl_destroy_snapshot_sync_impl(ds, dsda->dsda_defer, tx);
		dsl_dataset_rele(ds, FTAG);
	}
}

/*
 * The semantics of this function are described in the comment above
 * lzc_destroy_snaps().  To summarize:
 *
 * The snapshots must all be in the same pool.
 *
 * Snapshots that don't exist will be silently ignored (considered to be
 * "already deleted").
 *
 * On success, all snaps will be destroyed and this will return 0.
 * On failure, no snaps will be destroyed, the errlist will be filled in,
 * and this will return an errno.
 */
int
dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer,
    nvlist_t *errlist)
{
	dmu_snapshots_destroy_arg_t dsda;
	int error;
	nvpair_t *pair;

	pair = nvlist_next_nvpair(snaps, NULL);
	if (pair == NULL)
		return (0);

	dsda.dsda_snaps = snaps;
	dsda.dsda_successful_snaps = fnvlist_alloc();
	dsda.dsda_defer = defer;
	dsda.dsda_errlist = errlist;

	error = dsl_sync_task(nvpair_name(pair),
	    dsl_destroy_snapshot_check, dsl_destroy_snapshot_sync,
	    &dsda, 0);
	fnvlist_free(dsda.dsda_successful_snaps);

	return (error);
}
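
/*
 * Illustrative sketch (hypothetical names, not part of this file): a
 * caller destroying several snapshots at once would build the nvlist
 * itself, much as dsl_destroy_snapshot() below does for a single name:
 *
 *	nvlist_t *snaps = fnvlist_alloc();
 *	nvlist_t *errlist = fnvlist_alloc();
 *	fnvlist_add_boolean(snaps, "pool/fs@a");
 *	fnvlist_add_boolean(snaps, "pool/fs@b");
 *	error = dsl_destroy_snapshots_nvl(snaps, B_FALSE, errlist);
 *	fnvlist_free(errlist);
 *	fnvlist_free(snaps);
 *
 * On failure, errlist maps each snapshot that could not be destroyed
 * to its errno, and no snapshots are destroyed.
 */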

int
dsl_destroy_snapshot(const char *name, boolean_t defer)
{
	int error;
	nvlist_t *nvl = fnvlist_alloc();
	nvlist_t *errlist = fnvlist_alloc();

	fnvlist_add_boolean(nvl, name);
	error = dsl_destroy_snapshots_nvl(nvl, defer, errlist);
	fnvlist_free(errlist);
	fnvlist_free(nvl);
	return (error);
}

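/*
 * Bundles the dataset and open transaction for kill_blkptr(), which
 * disposes of every block pointer that traverse_dataset() visits in
 * old_synchronous_dataset_destroy() below.
 */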
struct killarg {
	dsl_dataset_t *ds;
	dmu_tx_t *tx;
};

/* ARGSUSED */
static int
kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	struct killarg *ka = arg;
	dmu_tx_t *tx = ka->tx;

	if (bp == NULL)
		return (0);

	if (zb->zb_level == ZB_ZIL_LEVEL) {
		ASSERT(zilog != NULL);
		/*
		 * It's a block in the intent log.  It has no
		 * accounting, so just free it.
		 */
		dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
	} else {
		ASSERT(zilog == NULL);
		ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
		(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
	}

	return (0);
}

static void
old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	struct killarg ka;

	/*
	 * Free everything that we point to (that's born after
	 * the previous snapshot, if we are a clone)
	 *
	 * NB: this should be very quick, because we already
	 * freed all the objects in open context.
	 */
	ka.ds = ds;
	ka.tx = tx;
	VERIFY0(traverse_dataset(ds,
	    ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST,
	    kill_blkptr, &ka));
	ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
}

typedef struct dsl_destroy_head_arg {
	const char *ddha_name;
} dsl_destroy_head_arg_t;

int
dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
{
	int error;
	uint64_t count;
	objset_t *mos;

	if (dsl_dataset_is_snapshot(ds))
		return (SET_ERROR(EINVAL));

	if (refcount_count(&ds->ds_longholds) != expected_holds)
		return (SET_ERROR(EBUSY));

	mos = ds->ds_dir->dd_pool->dp_meta_objset;

	/*
	 * Can't delete a head dataset if there are snapshots of it.
	 * (Except if the only snapshots are from the branch we cloned
	 * from.)
	 */
	if (ds->ds_prev != NULL &&
	    ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
		return (SET_ERROR(EBUSY));

	/*
	 * Can't delete if there are children of this fs.
	 */
	error = zap_count(mos,
	    ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
	if (error != 0)
		return (error);
	if (count != 0)
		return (SET_ERROR(EEXIST));

	if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) &&
	    ds->ds_prev->ds_phys->ds_num_children == 2 &&
	    ds->ds_prev->ds_userrefs == 0) {
		/* We need to remove the origin snapshot as well. */
		if (!refcount_is_zero(&ds->ds_prev->ds_longholds))
			return (SET_ERROR(EBUSY));
	}
	return (0);
}

static int
dsl_destroy_head_check(void *arg, dmu_tx_t *tx)
{
	dsl_destroy_head_arg_t *ddha = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	int error;

	error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds);
	if (error != 0)
		return (error);

	error = dsl_destroy_head_check_impl(ds, 0);
	dsl_dataset_rele(ds, FTAG);
	return (error);
}

static void
dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
{
	dsl_dir_t *dd;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	objset_t *mos = dp->dp_meta_objset;
	dd_used_t t;

	ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock));

	VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));

	ASSERT0(dd->dd_phys->dd_head_dataset_obj);

	/*
	 * Remove our reservation. The impl() routine avoids setting the
	 * actual property, which would require the (already destroyed) ds.
	 */
	dsl_dir_set_reservation_sync_impl(dd, 0, tx);

	ASSERT0(dd->dd_phys->dd_used_bytes);
	ASSERT0(dd->dd_phys->dd_reserved);
	for (t = 0; t < DD_USED_NUM; t++)
		ASSERT0(dd->dd_phys->dd_used_breakdown[t]);

	VERIFY0(zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx));
	VERIFY0(zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx));
	VERIFY0(dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx));
	VERIFY0(zap_remove(mos,
	    dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx));

	dsl_dir_rele(dd, FTAG);
	VERIFY0(dmu_object_free(mos, ddobj, tx));
}

void
dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dmu_tx_pool(tx);
	objset_t *mos = dp->dp_meta_objset;
	uint64_t obj, ddobj, prevobj = 0;
	boolean_t rmorigin;

	ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
	ASSERT(ds->ds_prev == NULL ||
	    ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
	ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));

	/* We need to log before removing it from the namespace. */
	spa_history_log_internal_ds(ds, "destroy", tx, "");

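	/*
	 * This mirrors the deferred-origin test in
	 * dsl_destroy_head_check_impl(); when it holds, the origin
	 * snapshot is destroyed at the end of this function.
	 */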
	rmorigin = (dsl_dir_is_clone(ds->ds_dir) &&
	    DS_IS_DEFER_DESTROY(ds->ds_prev) &&
	    ds->ds_prev->ds_phys->ds_num_children == 2 &&
	    ds->ds_prev->ds_userrefs == 0);

	/* Remove our reservation */
	if (ds->ds_reserved != 0) {
		dsl_dataset_set_refreservation_sync_impl(ds,
		    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
		    0, tx);
		ASSERT0(ds->ds_reserved);
	}

	dsl_scan_ds_destroyed(ds, tx);

	obj = ds->ds_object;

	if (ds->ds_phys->ds_prev_snap_obj != 0) {
		/* This is a clone */
		ASSERT(ds->ds_prev != NULL);
		ASSERT3U(ds->ds_prev->ds_phys->ds_next_snap_obj, !=, obj);
		ASSERT0(ds->ds_phys->ds_next_snap_obj);

		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
		if (ds->ds_prev->ds_phys->ds_next_clones_obj != 0) {
			dsl_dataset_remove_from_next_clones(ds->ds_prev,
			    obj, tx);
		}

		ASSERT3U(ds->ds_prev->ds_phys->ds_num_children, >, 1);
		ds->ds_prev->ds_phys->ds_num_children--;
	}

	zfeature_info_t *async_destroy =
	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];
	objset_t *os;

	/*
	 * Destroy the deadlist.  Unless it's a clone, the
	 * deadlist should be empty.  (If it's a clone, it's
	 * safe to ignore the deadlist contents.)
	 */
	dsl_deadlist_close(&ds->ds_deadlist);
	dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_deadlist_obj = 0;

	VERIFY0(dmu_objset_from_ds(ds, &os));

	if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) {
		old_synchronous_dataset_destroy(ds, tx);
	} else {
		/*
		 * Move the bptree into the pool's list of trees to
		 * clean up and update space accounting information.
		 */
		uint64_t used, comp, uncomp;

		zil_destroy_sync(dmu_objset_zil(os), tx);

		if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
			dsl_scan_t *scn = dp->dp_scan;

			spa_feature_incr(dp->dp_spa, async_destroy, tx);
			dp->dp_bptree_obj = bptree_alloc(mos, tx);
			VERIFY0(zap_add(mos,
			    DMU_POOL_DIRECTORY_OBJECT,
			    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
			    &dp->dp_bptree_obj, tx));
			ASSERT(!scn->scn_async_destroying);
			scn->scn_async_destroying = B_TRUE;
		}

		used = ds->ds_dir->dd_phys->dd_used_bytes;
		comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
		uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;

		ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
		    ds->ds_phys->ds_unique_bytes == used);

		bptree_add(mos, dp->dp_bptree_obj,
		    &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
		    used, comp, uncomp, tx);
		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
		    -used, -comp, -uncomp, tx);
		dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
		    used, comp, uncomp, tx);
	}

	if (ds->ds_prev != NULL) {
		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
			VERIFY0(zap_remove_int(mos,
			    ds->ds_prev->ds_dir->dd_phys->dd_clones,
			    ds->ds_object, tx));
		}
		prevobj = ds->ds_prev->ds_object;
		dsl_dataset_rele(ds->ds_prev, ds);
		ds->ds_prev = NULL;
	}

	/*
	 * This must be done after the traverse_dataset(), because it will
	 * re-open the objset.
	 */
	if (ds->ds_objset) {
		dmu_objset_evict(ds->ds_objset);
		ds->ds_objset = NULL;
	}

	/* Erase the link in the dir */
	dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
	ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
	ddobj = ds->ds_dir->dd_object;
	ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
	VERIFY0(zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx));

	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);

	ASSERT0(ds->ds_phys->ds_next_clones_obj);
	ASSERT0(ds->ds_phys->ds_props_obj);
	ASSERT0(ds->ds_phys->ds_userrefs_obj);
	dsl_dir_rele(ds->ds_dir, ds);
	ds->ds_dir = NULL;
	VERIFY0(dmu_object_free(mos, obj, tx));

	dsl_dir_destroy_sync(ddobj, tx);

	if (rmorigin) {
		dsl_dataset_t *prev;
		VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev));
		dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx);
		dsl_dataset_rele(prev, FTAG);
	}
}

static void
dsl_destroy_head_sync(void *arg, dmu_tx_t *tx)
{
	dsl_destroy_head_arg_t *ddha = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;

	VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
	dsl_destroy_head_sync_impl(ds, tx);
	dsl_dataset_rele(ds, FTAG);
}

static void
dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx)
{
	dsl_destroy_head_arg_t *ddha = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;

	VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));

	/* Mark it as inconsistent on-disk, in case we crash */
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;

	spa_history_log_internal_ds(ds, "destroy begin", tx, "");
	dsl_dataset_rele(ds, FTAG);
}

int
dsl_destroy_head(const char *name)
{
	dsl_destroy_head_arg_t ddha;
	int error;
	spa_t *spa;
	boolean_t isenabled;

#ifdef _KERNEL
	zfs_destroy_unmount_origin(name);
#endif

	error = spa_open(name, &spa, FTAG);
	if (error != 0)
		return (error);
	isenabled = spa_feature_is_enabled(spa,
	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]);
	spa_close(spa, FTAG);

	ddha.ddha_name = name;

	if (!isenabled) {
		objset_t *os;

		error = dsl_sync_task(name, dsl_destroy_head_check,
		    dsl_destroy_head_begin_sync, &ddha, 0);
		if (error != 0)
			return (error);

		/*
		 * Head deletion is processed in one txg on old pools;
		 * remove the objects from open context so that the txg sync
		 * is not too long.
		 */
		error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os);
		if (error == 0) {
			uint64_t prev_snap_txg =
			    dmu_objset_ds(os)->ds_phys->ds_prev_snap_txg;
			for (uint64_t obj = 0; error == 0;
			    error = dmu_object_next(os, &obj, FALSE,
			    prev_snap_txg))
				(void) dmu_free_long_object(os, obj);
			/* sync out all frees */
			txg_wait_synced(dmu_objset_pool(os), 0);
			dmu_objset_disown(os, FTAG);
		}
	}

	return (dsl_sync_task(name, dsl_destroy_head_check,
	    dsl_destroy_head_sync, &ddha, 0));
}

/*
 * Note, this function is used as the callback for dmu_objset_find().  We
 * always return 0 so that we will continue to find and process
 * inconsistent datasets, even if we encounter an error trying to
 * process one of them.
 */
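/*
 * For reference, a hedged sketch of the typical caller context
 * (hypothetical, not part of this file): a pool sweeps its datasets with
 * something like
 *
 *	(void) dmu_objset_find(spa_name(spa),
 *	    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
 *
 * so that any dataset left DS_FLAG_INCONSISTENT (e.g. by an interrupted
 * destroy, as marked in dsl_destroy_head_begin_sync() above) is cleaned up.
 */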
/* ARGSUSED */
int
dsl_destroy_inconsistent(const char *dsname, void *arg)
{
	objset_t *os;

	if (dmu_objset_hold(dsname, FTAG, &os) == 0) {
		boolean_t inconsistent = DS_IS_INCONSISTENT(dmu_objset_ds(os));
		dmu_objset_rele(os, FTAG);
		if (inconsistent)
			(void) dsl_destroy_head(dsname);
	}
	return (0);
}
