1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23304139Savg * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
24251478Sdelphij * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25265744Sdelphij * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26288549Smav * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
27282756Savg * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
28288569Smav * Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
29297112Smav * Copyright (c) 2014 Integros [integros.com]
30168404Spjd */
31168404Spjd
32219089Spjd/* Portions Copyright 2010 Robert Milkowski */
33219089Spjd
34185029Spjd#include <sys/cred.h>
35168404Spjd#include <sys/zfs_context.h>
36168404Spjd#include <sys/dmu_objset.h>
37168404Spjd#include <sys/dsl_dir.h>
38168404Spjd#include <sys/dsl_dataset.h>
39168404Spjd#include <sys/dsl_prop.h>
40168404Spjd#include <sys/dsl_pool.h>
41168404Spjd#include <sys/dsl_synctask.h>
42185029Spjd#include <sys/dsl_deleg.h>
43168404Spjd#include <sys/dnode.h>
44168404Spjd#include <sys/dbuf.h>
45168404Spjd#include <sys/zvol.h>
46168404Spjd#include <sys/dmu_tx.h>
47168404Spjd#include <sys/zap.h>
48168404Spjd#include <sys/zil.h>
49168404Spjd#include <sys/dmu_impl.h>
50185029Spjd#include <sys/zfs_ioctl.h>
51219089Spjd#include <sys/sa.h>
52219089Spjd#include <sys/zfs_onexit.h>
53248571Smm#include <sys/dsl_destroy.h>
54288569Smav#include <sys/vdev.h>
55168404Spjd
/*
 * Global barrier lock.  dmu_objset_evict_done() briefly takes it as reader
 * before freeing an objset.  It is needed to close a window in dnode_move()
 * that allows the objset to be freed before it can be safely accessed.
 */
krwlock_t os_lock;

/*
 * Tunable to override the maximum number of threads for the parallelization
 * of dmu_objset_find_dp, needed to speed up the import of pools with many
 * datasets.
 * Default is 4 times the number of leaf vdevs.
 */
int dmu_find_threads = 0;

/* Forward declaration; the definition appears elsewhere in this file. */
static void dmu_objset_find_dp_cb(void *arg);
71288569Smav
/*
 * One-time module setup: initialize the global os_lock barrier lock.
 * Paired with dmu_objset_fini().
 */
void
dmu_objset_init(void)
{
	rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
}
77219089Spjd
/*
 * Module teardown: destroy the global os_lock initialized by
 * dmu_objset_init().
 */
void
dmu_objset_fini(void)
{
	rw_destroy(&os_lock);
}
83219089Spjd
84168404Spjdspa_t *
85168404Spjddmu_objset_spa(objset_t *os)
86168404Spjd{
87219089Spjd	return (os->os_spa);
88168404Spjd}
89168404Spjd
90168404Spjdzilog_t *
91168404Spjddmu_objset_zil(objset_t *os)
92168404Spjd{
93219089Spjd	return (os->os_zil);
94168404Spjd}
95168404Spjd
96168404Spjddsl_pool_t *
97168404Spjddmu_objset_pool(objset_t *os)
98168404Spjd{
99168404Spjd	dsl_dataset_t *ds;
100168404Spjd
101219089Spjd	if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)
102168404Spjd		return (ds->ds_dir->dd_pool);
103168404Spjd	else
104219089Spjd		return (spa_get_dsl(os->os_spa));
105168404Spjd}
106168404Spjd
107168404Spjddsl_dataset_t *
108168404Spjddmu_objset_ds(objset_t *os)
109168404Spjd{
110219089Spjd	return (os->os_dsl_dataset);
111168404Spjd}
112168404Spjd
113168404Spjddmu_objset_type_t
114168404Spjddmu_objset_type(objset_t *os)
115168404Spjd{
116219089Spjd	return (os->os_phys->os_type);
117168404Spjd}
118168404Spjd
119168404Spjdvoid
120168404Spjddmu_objset_name(objset_t *os, char *buf)
121168404Spjd{
122219089Spjd	dsl_dataset_name(os->os_dsl_dataset, buf);
123168404Spjd}
124168404Spjd
125168404Spjduint64_t
126168404Spjddmu_objset_id(objset_t *os)
127168404Spjd{
128219089Spjd	dsl_dataset_t *ds = os->os_dsl_dataset;
129168404Spjd
130168404Spjd	return (ds ? ds->ds_object : 0);
131168404Spjd}
132168404Spjd
133268647Sdelphijzfs_sync_type_t
134219089Spjddmu_objset_syncprop(objset_t *os)
135219089Spjd{
136219089Spjd	return (os->os_sync);
137219089Spjd}
138219089Spjd
139268647Sdelphijzfs_logbias_op_t
140219089Spjddmu_objset_logbias(objset_t *os)
141219089Spjd{
142219089Spjd	return (os->os_logbias);
143219089Spjd}
144219089Spjd
145168404Spjdstatic void
146168404Spjdchecksum_changed_cb(void *arg, uint64_t newval)
147168404Spjd{
148219089Spjd	objset_t *os = arg;
149168404Spjd
150168404Spjd	/*
151168404Spjd	 * Inheritance should have been done by now.
152168404Spjd	 */
153168404Spjd	ASSERT(newval != ZIO_CHECKSUM_INHERIT);
154168404Spjd
155219089Spjd	os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
156168404Spjd}
157168404Spjd
158168404Spjdstatic void
159168404Spjdcompression_changed_cb(void *arg, uint64_t newval)
160168404Spjd{
161219089Spjd	objset_t *os = arg;
162168404Spjd
163168404Spjd	/*
164168404Spjd	 * Inheritance and range checking should have been done by now.
165168404Spjd	 */
166168404Spjd	ASSERT(newval != ZIO_COMPRESS_INHERIT);
167168404Spjd
168288542Smav	os->os_compress = zio_compress_select(os->os_spa, newval,
169288542Smav	    ZIO_COMPRESS_ON);
170168404Spjd}
171168404Spjd
/*
 * Property callback for "copies" (registered in dmu_objset_open_impl()).
 * 'arg' is the objset_t; 'newval' is the new number of copies and is
 * cached in os_copies.
 */
static void
copies_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now:
	 * the value must be in 1 .. spa_max_replication().
	 */
	ASSERT(newval > 0);
	ASSERT(newval <= spa_max_replication(os->os_spa));

	os->os_copies = newval;
}
185168404Spjd
186185029Spjdstatic void
187219089Spjddedup_changed_cb(void *arg, uint64_t newval)
188219089Spjd{
189219089Spjd	objset_t *os = arg;
190219089Spjd	spa_t *spa = os->os_spa;
191219089Spjd	enum zio_checksum checksum;
192219089Spjd
193219089Spjd	/*
194219089Spjd	 * Inheritance should have been done by now.
195219089Spjd	 */
196219089Spjd	ASSERT(newval != ZIO_CHECKSUM_INHERIT);
197219089Spjd
198219089Spjd	checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);
199219089Spjd
200219089Spjd	os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
201219089Spjd	os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
202219089Spjd}
203219089Spjd
204219089Spjdstatic void
205185029Spjdprimary_cache_changed_cb(void *arg, uint64_t newval)
206185029Spjd{
207219089Spjd	objset_t *os = arg;
208185029Spjd
209185029Spjd	/*
210185029Spjd	 * Inheritance and range checking should have been done by now.
211185029Spjd	 */
212185029Spjd	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
213185029Spjd	    newval == ZFS_CACHE_METADATA);
214185029Spjd
215219089Spjd	os->os_primary_cache = newval;
216185029Spjd}
217185029Spjd
218185029Spjdstatic void
219185029Spjdsecondary_cache_changed_cb(void *arg, uint64_t newval)
220185029Spjd{
221219089Spjd	objset_t *os = arg;
222185029Spjd
223185029Spjd	/*
224185029Spjd	 * Inheritance and range checking should have been done by now.
225185029Spjd	 */
226185029Spjd	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
227185029Spjd	    newval == ZFS_CACHE_METADATA);
228185029Spjd
229219089Spjd	os->os_secondary_cache = newval;
230185029Spjd}
231185029Spjd
232219089Spjdstatic void
233219089Spjdsync_changed_cb(void *arg, uint64_t newval)
234219089Spjd{
235219089Spjd	objset_t *os = arg;
236219089Spjd
237219089Spjd	/*
238219089Spjd	 * Inheritance and range checking should have been done by now.
239219089Spjd	 */
240219089Spjd	ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
241219089Spjd	    newval == ZFS_SYNC_DISABLED);
242219089Spjd
243219089Spjd	os->os_sync = newval;
244219089Spjd	if (os->os_zil)
245219089Spjd		zil_set_sync(os->os_zil, newval);
246219089Spjd}
247219089Spjd
248219089Spjdstatic void
249268647Sdelphijredundant_metadata_changed_cb(void *arg, uint64_t newval)
250268647Sdelphij{
251268647Sdelphij	objset_t *os = arg;
252268647Sdelphij
253268647Sdelphij	/*
254268647Sdelphij	 * Inheritance and range checking should have been done by now.
255268647Sdelphij	 */
256268647Sdelphij	ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
257268647Sdelphij	    newval == ZFS_REDUNDANT_METADATA_MOST);
258268647Sdelphij
259268647Sdelphij	os->os_redundant_metadata = newval;
260268647Sdelphij}
261268647Sdelphij
262268647Sdelphijstatic void
263219089Spjdlogbias_changed_cb(void *arg, uint64_t newval)
264219089Spjd{
265219089Spjd	objset_t *os = arg;
266219089Spjd
267219089Spjd	ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
268219089Spjd	    newval == ZFS_LOGBIAS_THROUGHPUT);
269219089Spjd	os->os_logbias = newval;
270219089Spjd	if (os->os_zil)
271219089Spjd		zil_set_logbias(os->os_zil, newval);
272219089Spjd}
273219089Spjd
274276081Sdelphijstatic void
275276081Sdelphijrecordsize_changed_cb(void *arg, uint64_t newval)
276276081Sdelphij{
277276081Sdelphij	objset_t *os = arg;
278276081Sdelphij
279276081Sdelphij	os->os_recordsize = newval;
280276081Sdelphij}
281276081Sdelphij
282168404Spjdvoid
283168404Spjddmu_objset_byteswap(void *buf, size_t size)
284168404Spjd{
285168404Spjd	objset_phys_t *osp = buf;
286168404Spjd
287209962Smm	ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t));
288168404Spjd	dnode_byteswap(&osp->os_meta_dnode);
289168404Spjd	byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
290168404Spjd	osp->os_type = BSWAP_64(osp->os_type);
291209962Smm	osp->os_flags = BSWAP_64(osp->os_flags);
292209962Smm	if (size == sizeof (objset_phys_t)) {
293209962Smm		dnode_byteswap(&osp->os_userused_dnode);
294209962Smm		dnode_byteswap(&osp->os_groupused_dnode);
295209962Smm	}
296168404Spjd}
297168404Spjd
298168404Spjdint
299168404Spjddmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
300219089Spjd    objset_t **osp)
301168404Spjd{
302219089Spjd	objset_t *os;
303185029Spjd	int i, err;
304168404Spjd
305185029Spjd	ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
306185029Spjd
307219089Spjd	os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
308219089Spjd	os->os_dsl_dataset = ds;
309219089Spjd	os->os_spa = spa;
310219089Spjd	os->os_rootbp = bp;
311219089Spjd	if (!BP_IS_HOLE(os->os_rootbp)) {
312277586Sdelphij		arc_flags_t aflags = ARC_FLAG_WAIT;
313268657Sdelphij		zbookmark_phys_t zb;
314219089Spjd		SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
315219089Spjd		    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
316219089Spjd
317219089Spjd		if (DMU_OS_IS_L2CACHEABLE(os))
318277586Sdelphij			aflags |= ARC_FLAG_L2CACHE;
319168404Spjd
320219089Spjd		dprintf_bp(os->os_rootbp, "reading %s", "");
321246666Smm		err = arc_read(NULL, spa, os->os_rootbp,
322219089Spjd		    arc_getbuf_func, &os->os_phys_buf,
323168404Spjd		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
324248571Smm		if (err != 0) {
325219089Spjd			kmem_free(os, sizeof (objset_t));
326185029Spjd			/* convert checksum errors into IO errors */
327185029Spjd			if (err == ECKSUM)
328249195Smm				err = SET_ERROR(EIO);
329168404Spjd			return (err);
330168404Spjd		}
331209962Smm
332209962Smm		/* Increase the blocksize if we are permitted. */
333209962Smm		if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
334219089Spjd		    arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
335307266Smav			arc_buf_t *buf = arc_alloc_buf(spa,
336219089Spjd			    sizeof (objset_phys_t), &os->os_phys_buf,
337209962Smm			    ARC_BUFC_METADATA);
338209962Smm			bzero(buf->b_data, sizeof (objset_phys_t));
339219089Spjd			bcopy(os->os_phys_buf->b_data, buf->b_data,
340219089Spjd			    arc_buf_size(os->os_phys_buf));
341307266Smav			arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
342219089Spjd			os->os_phys_buf = buf;
343209962Smm		}
344209962Smm
345219089Spjd		os->os_phys = os->os_phys_buf->b_data;
346219089Spjd		os->os_flags = os->os_phys->os_flags;
347168404Spjd	} else {
348209962Smm		int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
349209962Smm		    sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
350307266Smav		os->os_phys_buf = arc_alloc_buf(spa, size,
351219089Spjd		    &os->os_phys_buf, ARC_BUFC_METADATA);
352219089Spjd		os->os_phys = os->os_phys_buf->b_data;
353219089Spjd		bzero(os->os_phys, size);
354168404Spjd	}
355168404Spjd
356168404Spjd	/*
357168404Spjd	 * Note: the changed_cb will be called once before the register
358168404Spjd	 * func returns, thus changing the checksum/compression from the
359185029Spjd	 * default (fletcher2/off).  Snapshots don't need to know about
360185029Spjd	 * checksum/compression/copies.
361168404Spjd	 */
362268649Sdelphij	if (ds != NULL) {
363290756Smav		boolean_t needlock = B_FALSE;
364290756Smav
365290756Smav		/*
366290756Smav		 * Note: it's valid to open the objset if the dataset is
367290756Smav		 * long-held, in which case the pool_config lock will not
368290756Smav		 * be held.
369290756Smav		 */
370290756Smav		if (!dsl_pool_config_held(dmu_objset_pool(os))) {
371290756Smav			needlock = B_TRUE;
372290756Smav			dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
373290756Smav		}
374248571Smm		err = dsl_prop_register(ds,
375248571Smm		    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
376219089Spjd		    primary_cache_changed_cb, os);
377248571Smm		if (err == 0) {
378248571Smm			err = dsl_prop_register(ds,
379248571Smm			    zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
380219089Spjd			    secondary_cache_changed_cb, os);
381248571Smm		}
382288549Smav		if (!ds->ds_is_snapshot) {
383248571Smm			if (err == 0) {
384248571Smm				err = dsl_prop_register(ds,
385248571Smm				    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
386219089Spjd				    checksum_changed_cb, os);
387248571Smm			}
388248571Smm			if (err == 0) {
389248571Smm				err = dsl_prop_register(ds,
390248571Smm				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
391219089Spjd				    compression_changed_cb, os);
392248571Smm			}
393248571Smm			if (err == 0) {
394248571Smm				err = dsl_prop_register(ds,
395248571Smm				    zfs_prop_to_name(ZFS_PROP_COPIES),
396219089Spjd				    copies_changed_cb, os);
397248571Smm			}
398248571Smm			if (err == 0) {
399248571Smm				err = dsl_prop_register(ds,
400248571Smm				    zfs_prop_to_name(ZFS_PROP_DEDUP),
401219089Spjd				    dedup_changed_cb, os);
402248571Smm			}
403248571Smm			if (err == 0) {
404248571Smm				err = dsl_prop_register(ds,
405248571Smm				    zfs_prop_to_name(ZFS_PROP_LOGBIAS),
406219089Spjd				    logbias_changed_cb, os);
407248571Smm			}
408248571Smm			if (err == 0) {
409248571Smm				err = dsl_prop_register(ds,
410248571Smm				    zfs_prop_to_name(ZFS_PROP_SYNC),
411219089Spjd				    sync_changed_cb, os);
412248571Smm			}
413268647Sdelphij			if (err == 0) {
414268647Sdelphij				err = dsl_prop_register(ds,
415268647Sdelphij				    zfs_prop_to_name(
416268647Sdelphij				    ZFS_PROP_REDUNDANT_METADATA),
417268647Sdelphij				    redundant_metadata_changed_cb, os);
418268647Sdelphij			}
419276081Sdelphij			if (err == 0) {
420276081Sdelphij				err = dsl_prop_register(ds,
421276081Sdelphij				    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
422276081Sdelphij				    recordsize_changed_cb, os);
423276081Sdelphij			}
424185029Spjd		}
425290756Smav		if (needlock)
426290756Smav			dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
427248571Smm		if (err != 0) {
428307266Smav			arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
429219089Spjd			kmem_free(os, sizeof (objset_t));
430168404Spjd			return (err);
431168404Spjd		}
432268649Sdelphij	} else {
433168404Spjd		/* It's the meta-objset. */
434219089Spjd		os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
435288542Smav		os->os_compress = ZIO_COMPRESS_ON;
436219089Spjd		os->os_copies = spa_max_replication(spa);
437219089Spjd		os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
438268647Sdelphij		os->os_dedup_verify = B_FALSE;
439268647Sdelphij		os->os_logbias = ZFS_LOGBIAS_LATENCY;
440268647Sdelphij		os->os_sync = ZFS_SYNC_STANDARD;
441219089Spjd		os->os_primary_cache = ZFS_CACHE_ALL;
442219089Spjd		os->os_secondary_cache = ZFS_CACHE_ALL;
443168404Spjd	}
444168404Spjd
445288549Smav	if (ds == NULL || !ds->ds_is_snapshot)
446219089Spjd		os->os_zil_header = os->os_phys->os_zil_header;
447219089Spjd	os->os_zil = zil_alloc(os, &os->os_zil_header);
448168404Spjd
449168404Spjd	for (i = 0; i < TXG_SIZE; i++) {
450219089Spjd		list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),
451168404Spjd		    offsetof(dnode_t, dn_dirty_link[i]));
452219089Spjd		list_create(&os->os_free_dnodes[i], sizeof (dnode_t),
453168404Spjd		    offsetof(dnode_t, dn_dirty_link[i]));
454168404Spjd	}
455219089Spjd	list_create(&os->os_dnodes, sizeof (dnode_t),
456168404Spjd	    offsetof(dnode_t, dn_link));
457219089Spjd	list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
458168404Spjd	    offsetof(dmu_buf_impl_t, db_link));
459168404Spjd
460219089Spjd	mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
461219089Spjd	mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
462219089Spjd	mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
463168404Spjd
464288549Smav	dnode_special_open(os, &os->os_phys->os_meta_dnode,
465288549Smav	    DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
466219089Spjd	if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
467288549Smav		dnode_special_open(os, &os->os_phys->os_userused_dnode,
468288549Smav		    DMU_USERUSED_OBJECT, &os->os_userused_dnode);
469288549Smav		dnode_special_open(os, &os->os_phys->os_groupused_dnode,
470288549Smav		    DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
471209962Smm	}
472168404Spjd
473219089Spjd	*osp = os;
474168404Spjd	return (0);
475168404Spjd}
476168404Spjd
477219089Spjdint
478219089Spjddmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
479168404Spjd{
480219089Spjd	int err = 0;
481168404Spjd
482290756Smav	/*
483290756Smav	 * We shouldn't be doing anything with dsl_dataset_t's unless the
484290756Smav	 * pool_config lock is held, or the dataset is long-held.
485290756Smav	 */
486290756Smav	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) ||
487290756Smav	    dsl_dataset_long_held(ds));
488290756Smav
489185029Spjd	mutex_enter(&ds->ds_opening_lock);
490268649Sdelphij	if (ds->ds_objset == NULL) {
491268649Sdelphij		objset_t *os;
492308083Smav		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
493168404Spjd		err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
494268649Sdelphij		    ds, dsl_dataset_get_blkptr(ds), &os);
495308083Smav		rrw_exit(&ds->ds_bp_rwlock, FTAG);
496268649Sdelphij
497268649Sdelphij		if (err == 0) {
498268649Sdelphij			mutex_enter(&ds->ds_lock);
499268649Sdelphij			ASSERT(ds->ds_objset == NULL);
500268649Sdelphij			ds->ds_objset = os;
501268649Sdelphij			mutex_exit(&ds->ds_lock);
502268649Sdelphij		}
503168404Spjd	}
504268649Sdelphij	*osp = ds->ds_objset;
505185029Spjd	mutex_exit(&ds->ds_opening_lock);
506219089Spjd	return (err);
507168404Spjd}
508168404Spjd
509248571Smm/*
510248571Smm * Holds the pool while the objset is held.  Therefore only one objset
511248571Smm * can be held at a time.
512248571Smm */
513185029Spjdint
514219089Spjddmu_objset_hold(const char *name, void *tag, objset_t **osp)
515185029Spjd{
516248571Smm	dsl_pool_t *dp;
517219089Spjd	dsl_dataset_t *ds;
518185029Spjd	int err;
519185029Spjd
520248571Smm	err = dsl_pool_hold(name, tag, &dp);
521248571Smm	if (err != 0)
522219089Spjd		return (err);
523248571Smm	err = dsl_dataset_hold(dp, name, tag, &ds);
524248571Smm	if (err != 0) {
525248571Smm		dsl_pool_rele(dp, tag);
526248571Smm		return (err);
527248571Smm	}
528219089Spjd
529219089Spjd	err = dmu_objset_from_ds(ds, osp);
530248571Smm	if (err != 0) {
531219089Spjd		dsl_dataset_rele(ds, tag);
532248571Smm		dsl_pool_rele(dp, tag);
533248571Smm	}
534219089Spjd
535185029Spjd	return (err);
536185029Spjd}
537185029Spjd
538288569Smavstatic int
539288569Smavdmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
540288569Smav    boolean_t readonly, void *tag, objset_t **osp)
541288569Smav{
542288569Smav	int err;
543288569Smav
544288569Smav	err = dmu_objset_from_ds(ds, osp);
545288569Smav	if (err != 0) {
546288569Smav		dsl_dataset_disown(ds, tag);
547288569Smav	} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
548288569Smav		dsl_dataset_disown(ds, tag);
549288569Smav		return (SET_ERROR(EINVAL));
550288569Smav	} else if (!readonly && dsl_dataset_is_snapshot(ds)) {
551288569Smav		dsl_dataset_disown(ds, tag);
552288569Smav		return (SET_ERROR(EROFS));
553288569Smav	}
554288569Smav	return (err);
555288569Smav}
556288569Smav
557248571Smm/*
558248571Smm * dsl_pool must not be held when this is called.
559248571Smm * Upon successful return, there will be a longhold on the dataset,
560248571Smm * and the dsl_pool will not be held.
561248571Smm */
562185029Spjdint
563219089Spjddmu_objset_own(const char *name, dmu_objset_type_t type,
564219089Spjd    boolean_t readonly, void *tag, objset_t **osp)
565185029Spjd{
566248571Smm	dsl_pool_t *dp;
567185029Spjd	dsl_dataset_t *ds;
568185029Spjd	int err;
569185029Spjd
570248571Smm	err = dsl_pool_hold(name, FTAG, &dp);
571248571Smm	if (err != 0)
572185029Spjd		return (err);
573248571Smm	err = dsl_dataset_own(dp, name, tag, &ds);
574248571Smm	if (err != 0) {
575248571Smm		dsl_pool_rele(dp, FTAG);
576248571Smm		return (err);
577248571Smm	}
578288569Smav	err = dmu_objset_own_impl(ds, type, readonly, tag, osp);
579288569Smav	dsl_pool_rele(dp, FTAG);
580185029Spjd
581185029Spjd	return (err);
582185029Spjd}
583185029Spjd
584288569Smavint
585288569Smavdmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
586288569Smav    boolean_t readonly, void *tag, objset_t **osp)
587288569Smav{
588288569Smav	dsl_dataset_t *ds;
589288569Smav	int err;
590288569Smav
591288569Smav	err = dsl_dataset_own_obj(dp, obj, tag, &ds);
592288569Smav	if (err != 0)
593288569Smav		return (err);
594288569Smav
595288569Smav	return (dmu_objset_own_impl(ds, type, readonly, tag, osp));
596288569Smav}
597288569Smav
598168404Spjdvoid
599219089Spjddmu_objset_rele(objset_t *os, void *tag)
600168404Spjd{
601248571Smm	dsl_pool_t *dp = dmu_objset_pool(os);
602219089Spjd	dsl_dataset_rele(os->os_dsl_dataset, tag);
603248571Smm	dsl_pool_rele(dp, tag);
604219089Spjd}
605185029Spjd
/*
 * When we are called, 'ds' MUST be a dataset that is owned by 'tag'; that
 * is, it is held and long held by 'tag' and ds_owner == tag.  We then
 * release and reacquire ownership of the dataset while holding the pool
 * config lock, so that no intervening namespace or ownership changes can
 * occur.  The reacquired dataset is returned through 'newds'.
 *
 * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
 * release the hold on its dataset and acquire a new one on the dataset of the
 * same name so that it can be partially torn down and reconstructed.
 */
void
dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds,
    void *tag)
{
	dsl_pool_t *dp;
	char name[ZFS_MAX_DATASET_NAME_LEN];

	VERIFY3P(ds, !=, NULL);
	VERIFY3P(ds->ds_owner, ==, tag);
	VERIFY(dsl_dataset_long_held(ds));

	/* Remember the name and pool before 'ds' is disowned. */
	dsl_dataset_name(ds, name);
	dp = ds->ds_dir->dd_pool;
	dsl_pool_config_enter(dp, FTAG);
	dsl_dataset_disown(ds, tag);
	/* Re-owning by name is required to succeed. */
	VERIFY0(dsl_dataset_own(dp, name, tag, newds));
	dsl_pool_config_exit(dp, FTAG);
}
635253816Sdelphij
636253816Sdelphijvoid
637219089Spjddmu_objset_disown(objset_t *os, void *tag)
638219089Spjd{
639219089Spjd	dsl_dataset_disown(os->os_dsl_dataset, tag);
640168404Spjd}
641168404Spjd
642248571Smmvoid
643185029Spjddmu_objset_evict_dbufs(objset_t *os)
644168404Spjd{
645288549Smav	dnode_t dn_marker;
646168404Spjd	dnode_t *dn;
647168404Spjd
648219089Spjd	mutex_enter(&os->os_lock);
649288549Smav	dn = list_head(&os->os_dnodes);
650288549Smav	while (dn != NULL) {
651288549Smav		/*
652288549Smav		 * Skip dnodes without holds.  We have to do this dance
653288549Smav		 * because dnode_add_ref() only works if there is already a
654288549Smav		 * hold.  If the dnode has no holds, then it has no dbufs.
655288549Smav		 */
656288549Smav		if (dnode_add_ref(dn, FTAG)) {
657288549Smav			list_insert_after(&os->os_dnodes, dn, &dn_marker);
658288549Smav			mutex_exit(&os->os_lock);
659168404Spjd
660288549Smav			dnode_evict_dbufs(dn);
661288549Smav			dnode_rele(dn, FTAG);
662168404Spjd
663288549Smav			mutex_enter(&os->os_lock);
664288549Smav			dn = list_next(&os->os_dnodes, &dn_marker);
665288549Smav			list_remove(&os->os_dnodes, &dn_marker);
666288549Smav		} else {
667288549Smav			dn = list_next(&os->os_dnodes, dn);
668288549Smav		}
669288549Smav	}
670288549Smav	mutex_exit(&os->os_lock);
671168404Spjd
672288549Smav	if (DMU_USERUSED_DNODE(os) != NULL) {
673288549Smav		dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
674288549Smav		dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
675168404Spjd	}
676288549Smav	dnode_evict_dbufs(DMU_META_DNODE(os));
677168404Spjd}
678168404Spjd
679288549Smav/*
680288549Smav * Objset eviction processing is split into into two pieces.
681288549Smav * The first marks the objset as evicting, evicts any dbufs that
682288549Smav * have a refcount of zero, and then queues up the objset for the
683288549Smav * second phase of eviction.  Once os->os_dnodes has been cleared by
684288549Smav * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
685288549Smav * The second phase closes the special dnodes, dequeues the objset from
686288549Smav * the list of those undergoing eviction, and finally frees the objset.
687288549Smav *
688288549Smav * NOTE: Due to asynchronous eviction processing (invocation of
689288549Smav *       dnode_buf_pageout()), it is possible for the meta dnode for the
690288549Smav *       objset to have no holds even though os->os_dnodes is not empty.
691288549Smav */
692168404Spjdvoid
693219089Spjddmu_objset_evict(objset_t *os)
694168404Spjd{
695219089Spjd	dsl_dataset_t *ds = os->os_dsl_dataset;
696168404Spjd
697219089Spjd	for (int t = 0; t < TXG_SIZE; t++)
698219089Spjd		ASSERT(!dmu_objset_is_dirty(os, t));
699168404Spjd
700289100Sdelphij	if (ds)
701289100Sdelphij		dsl_prop_unregister_all(ds, os);
702168404Spjd
703219089Spjd	if (os->os_sa)
704219089Spjd		sa_tear_down(os);
705219089Spjd
706248571Smm	dmu_objset_evict_dbufs(os);
707168404Spjd
708288549Smav	mutex_enter(&os->os_lock);
709288549Smav	spa_evicting_os_register(os->os_spa, os);
710288549Smav	if (list_is_empty(&os->os_dnodes)) {
711288549Smav		mutex_exit(&os->os_lock);
712288549Smav		dmu_objset_evict_done(os);
713288549Smav	} else {
714288549Smav		mutex_exit(&os->os_lock);
715288549Smav	}
716288549Smav}
717288549Smav
/*
 * Second phase of objset eviction (see dmu_objset_evict()): called once
 * os->os_dnodes is empty.  Closes the special dnodes, frees the ZIL and the
 * physical header buffer, deregisters the objset from the spa's evicting
 * list and frees the objset_t itself.  'os' is invalid on return.
 */
void
dmu_objset_evict_done(objset_t *os)
{
	ASSERT3P(list_head(&os->os_dnodes), ==, NULL);

	dnode_special_close(&os->os_meta_dnode);
	/* The accounting dnodes are only open when they exist. */
	if (DMU_USERUSED_DNODE(os)) {
		dnode_special_close(&os->os_userused_dnode);
		dnode_special_close(&os->os_groupused_dnode);
	}
	zil_free(os->os_zil);

	arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);

	/*
	 * This is a barrier to prevent the objset from going away in
	 * dnode_move() until we can safely ensure that the objset is still in
	 * use. We consider the objset valid before the barrier and invalid
	 * after the barrier.
	 */
	rw_enter(&os_lock, RW_READER);
	rw_exit(&os_lock);

	mutex_destroy(&os->os_lock);
	mutex_destroy(&os->os_obj_lock);
	mutex_destroy(&os->os_user_ptr_lock);
	spa_evicting_os_deregister(os->os_spa, os);
	kmem_free(os, sizeof (objset_t));
}
747168404Spjd
748219089Spjdtimestruc_t
749219089Spjddmu_objset_snap_cmtime(objset_t *os)
750219089Spjd{
751219089Spjd	return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
752219089Spjd}
753219089Spjd
754168404Spjd/* called from dsl for meta-objset */
755219089Spjdobjset_t *
756168404Spjddmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
757168404Spjd    dmu_objset_type_t type, dmu_tx_t *tx)
758168404Spjd{
759219089Spjd	objset_t *os;
760168404Spjd	dnode_t *mdn;
761168404Spjd
762168404Spjd	ASSERT(dmu_tx_is_syncing(tx));
763248571Smm
764219089Spjd	if (ds != NULL)
765248571Smm		VERIFY0(dmu_objset_from_ds(ds, &os));
766219089Spjd	else
767248571Smm		VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));
768168404Spjd
769219089Spjd	mdn = DMU_META_DNODE(os);
770219089Spjd
771168404Spjd	dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
772168404Spjd	    DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
773168404Spjd
774168404Spjd	/*
775168404Spjd	 * We don't want to have to increase the meta-dnode's nlevels
776168404Spjd	 * later, because then we could do it in quescing context while
777168404Spjd	 * we are also accessing it in open context.
778168404Spjd	 *
779168404Spjd	 * This precaution is not necessary for the MOS (ds == NULL),
780168404Spjd	 * because the MOS is only updated in syncing context.
781168404Spjd	 * This is most fortunate: the MOS is the only objset that
782168404Spjd	 * needs to be synced multiple times as spa_sync() iterates
783168404Spjd	 * to convergence, so minimizing its dn_nlevels matters.
784168404Spjd	 */
785168404Spjd	if (ds != NULL) {
786168404Spjd		int levels = 1;
787168404Spjd
788168404Spjd		/*
789168404Spjd		 * Determine the number of levels necessary for the meta-dnode
790307126Smav		 * to contain DN_MAX_OBJECT dnodes.  Note that in order to
791307126Smav		 * ensure that we do not overflow 64 bits, there has to be
792307126Smav		 * a nlevels that gives us a number of blocks > DN_MAX_OBJECT
793307126Smav		 * but < 2^64.  Therefore,
794307126Smav		 * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT) (10) must be
795307126Smav		 * less than (64 - log2(DN_MAX_OBJECT)) (16).
796168404Spjd		 */
797307126Smav		while ((uint64_t)mdn->dn_nblkptr <<
798307126Smav		    (mdn->dn_datablkshift - DNODE_SHIFT +
799168404Spjd		    (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
800307126Smav		    DN_MAX_OBJECT)
801168404Spjd			levels++;
802168404Spjd
803168404Spjd		mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
804168404Spjd		    mdn->dn_nlevels = levels;
805168404Spjd	}
806168404Spjd
807168404Spjd	ASSERT(type != DMU_OST_NONE);
808168404Spjd	ASSERT(type != DMU_OST_ANY);
809168404Spjd	ASSERT(type < DMU_OST_NUMTYPES);
810219089Spjd	os->os_phys->os_type = type;
811219089Spjd	if (dmu_objset_userused_enabled(os)) {
812219089Spjd		os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
813219089Spjd		os->os_flags = os->os_phys->os_flags;
814209962Smm	}
815168404Spjd
816168404Spjd	dsl_dataset_dirty(ds, tx);
817168404Spjd
818219089Spjd	return (os);
819168404Spjd}
820168404Spjd
/*
 * Argument bundle handed (as 'arg') to dmu_objset_create_check() and
 * dmu_objset_create_sync().
 */
typedef struct dmu_objset_create_arg {
	/* Full name of the dataset to create; rejected if it contains '@'. */
	const char *doca_name;
	/* Credentials used for the filesystem-limit check and the create. */
	cred_t *doca_cred;
	/* NOTE(review): presumably a caller hook run on the new objset - confirm. */
	void (*doca_userfunc)(objset_t *os, void *arg,
	    cred_t *cr, dmu_tx_t *tx);
	/* NOTE(review): presumably passed to doca_userfunc as 'arg' - confirm. */
	void *doca_userarg;
	/* NOTE(review): presumably the type of the objset to create - confirm. */
	dmu_objset_type_t doca_type;
	/* Flags forwarded to dsl_dataset_create_sync(). */
	uint64_t doca_flags;
} dmu_objset_create_arg_t;
830168404Spjd
831185029Spjd/*ARGSUSED*/
832168404Spjdstatic int
833248571Smmdmu_objset_create_check(void *arg, dmu_tx_t *tx)
834168404Spjd{
835248571Smm	dmu_objset_create_arg_t *doca = arg;
836248571Smm	dsl_pool_t *dp = dmu_tx_pool(tx);
837248571Smm	dsl_dir_t *pdd;
838248571Smm	const char *tail;
839248571Smm	int error;
840168404Spjd
841248571Smm	if (strchr(doca->doca_name, '@') != NULL)
842249195Smm		return (SET_ERROR(EINVAL));
843168404Spjd
844307122Smav	if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN)
845307122Smav		return (SET_ERROR(ENAMETOOLONG));
846307122Smav
847248571Smm	error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
848248571Smm	if (error != 0)
849248571Smm		return (error);
850248571Smm	if (tail == NULL) {
851248571Smm		dsl_dir_rele(pdd, FTAG);
852249195Smm		return (SET_ERROR(EEXIST));
853168404Spjd	}
854265744Sdelphij	error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
855265744Sdelphij	    doca->doca_cred);
856248571Smm	dsl_dir_rele(pdd, FTAG);
857185029Spjd
858265744Sdelphij	return (error);
859168404Spjd}
860168404Spjd
861168404Spjdstatic void
862248571Smmdmu_objset_create_sync(void *arg, dmu_tx_t *tx)
863168404Spjd{
864248571Smm	dmu_objset_create_arg_t *doca = arg;
865248571Smm	dsl_pool_t *dp = dmu_tx_pool(tx);
866248571Smm	dsl_dir_t *pdd;
867248571Smm	const char *tail;
868248571Smm	dsl_dataset_t *ds;
869219089Spjd	uint64_t obj;
870248571Smm	blkptr_t *bp;
871248571Smm	objset_t *os;
872168404Spjd
873248571Smm	VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));
874168404Spjd
875248571Smm	obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
876248571Smm	    doca->doca_cred, tx);
877168404Spjd
878248571Smm	VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
879308083Smav	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
880248571Smm	bp = dsl_dataset_get_blkptr(ds);
881248571Smm	os = dmu_objset_create_impl(pdd->dd_pool->dp_spa,
882248571Smm	    ds, bp, doca->doca_type, tx);
883308083Smav	rrw_exit(&ds->ds_bp_rwlock, FTAG);
884168404Spjd
885248571Smm	if (doca->doca_userfunc != NULL) {
886248571Smm		doca->doca_userfunc(os, doca->doca_userarg,
887248571Smm		    doca->doca_cred, tx);
888168404Spjd	}
889185029Spjd
890248571Smm	spa_history_log_internal_ds(ds, "create", tx, "");
891248571Smm	dsl_dataset_rele(ds, FTAG);
892248571Smm	dsl_dir_rele(pdd, FTAG);
893168404Spjd}
894168404Spjd
895168404Spjdint
896219089Spjddmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
897185029Spjd    void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
898168404Spjd{
899248571Smm	dmu_objset_create_arg_t doca;
900168404Spjd
901248571Smm	doca.doca_name = name;
902248571Smm	doca.doca_cred = CRED();
903248571Smm	doca.doca_flags = flags;
904248571Smm	doca.doca_userfunc = func;
905248571Smm	doca.doca_userarg = arg;
906248571Smm	doca.doca_type = type;
907168404Spjd
908248571Smm	return (dsl_sync_task(name,
909269006Sdelphij	    dmu_objset_create_check, dmu_objset_create_sync, &doca,
910269006Sdelphij	    5, ZFS_SPACE_CHECK_NORMAL));
911168404Spjd}
912168404Spjd
/*
 * Argument bundle shared by dmu_objset_clone_check() and
 * dmu_objset_clone_sync(); filled in by dmu_objset_clone().
 */
913248571Smmtypedef struct dmu_objset_clone_arg {
	/* name of the clone to create */
914248571Smm	const char *doca_clone;
	/* name of the origin dataset; the check requires it to be a snapshot */
915248571Smm	const char *doca_origin;
	/* credentials passed to the limit check and to dataset creation */
916248571Smm	cred_t *doca_cred;
917248571Smm} dmu_objset_clone_arg_t;
918248571Smm
919248571Smm/*ARGSUSED*/
920248571Smmstatic int
921248571Smmdmu_objset_clone_check(void *arg, dmu_tx_t *tx)
922168404Spjd{
923248571Smm	dmu_objset_clone_arg_t *doca = arg;
924219089Spjd	dsl_dir_t *pdd;
925219089Spjd	const char *tail;
926248571Smm	int error;
927248571Smm	dsl_dataset_t *origin;
928248571Smm	dsl_pool_t *dp = dmu_tx_pool(tx);
929168404Spjd
930248571Smm	if (strchr(doca->doca_clone, '@') != NULL)
931249195Smm		return (SET_ERROR(EINVAL));
932248571Smm
933307122Smav	if (strlen(doca->doca_clone) >= ZFS_MAX_DATASET_NAME_LEN)
934307122Smav		return (SET_ERROR(ENAMETOOLONG));
935307122Smav
936248571Smm	error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail);
937248571Smm	if (error != 0)
938248571Smm		return (error);
939219089Spjd	if (tail == NULL) {
940248571Smm		dsl_dir_rele(pdd, FTAG);
941249195Smm		return (SET_ERROR(EEXIST));
942168404Spjd	}
943282756Savg
944265744Sdelphij	error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
945265744Sdelphij	    doca->doca_cred);
946265744Sdelphij	if (error != 0) {
947265744Sdelphij		dsl_dir_rele(pdd, FTAG);
948265744Sdelphij		return (SET_ERROR(EDQUOT));
949265744Sdelphij	}
950248571Smm	dsl_dir_rele(pdd, FTAG);
951185029Spjd
952248571Smm	error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin);
953248571Smm	if (error != 0)
954219089Spjd		return (error);
955219089Spjd
956248571Smm	/* You can only clone snapshots, not the head datasets. */
957288549Smav	if (!origin->ds_is_snapshot) {
958248571Smm		dsl_dataset_rele(origin, FTAG);
959249195Smm		return (SET_ERROR(EINVAL));
960219089Spjd	}
961248571Smm	dsl_dataset_rele(origin, FTAG);
962248571Smm
963248571Smm	return (0);
964209962Smm}
965209962Smm
966209962Smmstatic void
967248571Smmdmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
968209962Smm{
969248571Smm	dmu_objset_clone_arg_t *doca = arg;
970248571Smm	dsl_pool_t *dp = dmu_tx_pool(tx);
971248571Smm	dsl_dir_t *pdd;
972248571Smm	const char *tail;
973248571Smm	dsl_dataset_t *origin, *ds;
974248571Smm	uint64_t obj;
975307122Smav	char namebuf[ZFS_MAX_DATASET_NAME_LEN];
976209962Smm
977248571Smm	VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
978248571Smm	VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));
979209962Smm
980248571Smm	obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
981248571Smm	    doca->doca_cred, tx);
982219089Spjd
983248571Smm	VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
984248571Smm	dsl_dataset_name(origin, namebuf);
985248571Smm	spa_history_log_internal_ds(ds, "clone", tx,
986248571Smm	    "origin=%s (%llu)", namebuf, origin->ds_object);
987248571Smm	dsl_dataset_rele(ds, FTAG);
988248571Smm	dsl_dataset_rele(origin, FTAG);
989248571Smm	dsl_dir_rele(pdd, FTAG);
990209962Smm}
991209962Smm
992248571Smmint
993248571Smmdmu_objset_clone(const char *clone, const char *origin)
994168404Spjd{
995248571Smm	dmu_objset_clone_arg_t doca;
996168404Spjd
997248571Smm	doca.doca_clone = clone;
998248571Smm	doca.doca_origin = origin;
999248571Smm	doca.doca_cred = CRED();
1000219089Spjd
1001248571Smm	return (dsl_sync_task(clone,
1002269006Sdelphij	    dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
1003269006Sdelphij	    5, ZFS_SPACE_CHECK_NORMAL));
1004168404Spjd}
1005168404Spjd
1006168404Spjdint
1007248571Smmdmu_objset_snapshot_one(const char *fsname, const char *snapname)
1008168404Spjd{
1009168404Spjd	int err;
1010248571Smm	char *longsnap = kmem_asprintf("%s@%s", fsname, snapname);
1011248571Smm	nvlist_t *snaps = fnvlist_alloc();
1012168404Spjd
1013248571Smm	fnvlist_add_boolean(snaps, longsnap);
1014248571Smm	strfree(longsnap);
1015248571Smm	err = dsl_dataset_snapshot(snaps, NULL, NULL);
1016248571Smm	fnvlist_free(snaps);
1017168404Spjd	return (err);
1018168404Spjd}
1019168404Spjd
1020168404Spjdstatic void
1021209962Smmdmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx)
1022168404Spjd{
1023168404Spjd	dnode_t *dn;
1024168404Spjd
1025168404Spjd	while (dn = list_head(list)) {
1026168404Spjd		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
1027168404Spjd		ASSERT(dn->dn_dbuf->db_data_pending);
1028168404Spjd		/*
1029209962Smm		 * Initialize dn_zio outside dnode_sync() because the
1030209962Smm		 * meta-dnode needs to set it ouside dnode_sync().
1031168404Spjd		 */
1032168404Spjd		dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
1033168404Spjd		ASSERT(dn->dn_zio);
1034168404Spjd
1035168404Spjd		ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
1036168404Spjd		list_remove(list, dn);
1037209962Smm
1038209962Smm		if (newlist) {
1039209962Smm			(void) dnode_add_ref(dn, newlist);
1040209962Smm			list_insert_tail(newlist, dn);
1041209962Smm		}
1042209962Smm
1043168404Spjd		dnode_sync(dn, tx);
1044168404Spjd	}
1045168404Spjd}
1046168404Spjd
1047168404Spjd/* ARGSUSED */
1048168404Spjdstatic void
1049219089Spjddmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
1050168404Spjd{
1051185029Spjd	blkptr_t *bp = zio->io_bp;
1052219089Spjd	objset_t *os = arg;
1053168404Spjd	dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
1054168404Spjd
1055268649Sdelphij	ASSERT(!BP_IS_EMBEDDED(bp));
1056248571Smm	ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
1057248571Smm	ASSERT0(BP_GET_LEVEL(bp));
1058185029Spjd
1059168404Spjd	/*
1060209962Smm	 * Update rootbp fill count: it should be the number of objects
1061209962Smm	 * allocated in the object set (not counting the "special"
1062209962Smm	 * objects that are stored in the objset_phys_t -- the meta
1063209962Smm	 * dnode and user/group accounting objects).
1064168404Spjd	 */
1065209962Smm	bp->blk_fill = 0;
1066185029Spjd	for (int i = 0; i < dnp->dn_nblkptr; i++)
1067268649Sdelphij		bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
1068308083Smav	if (os->os_dsl_dataset != NULL)
1069308083Smav		rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG);
1070308083Smav	*os->os_rootbp = *bp;
1071308083Smav	if (os->os_dsl_dataset != NULL)
1072308083Smav		rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
1073219089Spjd}
1074168404Spjd
1075219089Spjd/* ARGSUSED */
1076219089Spjdstatic void
1077219089Spjddmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
1078219089Spjd{
1079219089Spjd	blkptr_t *bp = zio->io_bp;
1080219089Spjd	blkptr_t *bp_orig = &zio->io_bp_orig;
1081219089Spjd	objset_t *os = arg;
1082219089Spjd
1083185029Spjd	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
1084219089Spjd		ASSERT(BP_EQUAL(bp, bp_orig));
1085185029Spjd	} else {
1086219089Spjd		dsl_dataset_t *ds = os->os_dsl_dataset;
1087219089Spjd		dmu_tx_t *tx = os->os_synctx;
1088219089Spjd
1089219089Spjd		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
1090219089Spjd		dsl_dataset_block_born(ds, bp, tx);
1091168404Spjd	}
1092308083Smav	kmem_free(bp, sizeof (*bp));
1093168404Spjd}
1094168404Spjd
/*
 * Write out the dirty state of objset `os` for the syncing transaction
 * `tx`.  A root block write is created as a child of `pio`; the meta
 * dnode, the user/group accounting dnodes (when allocated) and every
 * dnode on this txg's free and dirty lists are synced against it, the
 * meta-dnode's pending dirty-record zios are issued, the ZIL is synced,
 * and finally the root block zio itself is issued.
 */
1095168404Spjd/* called from dsl */
1096168404Spjdvoid
1097219089Spjddmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
1098168404Spjd{
1099168404Spjd	int txgoff;
1100268657Sdelphij	zbookmark_phys_t zb;
1101219089Spjd	zio_prop_t zp;
1102168404Spjd	zio_t *zio;
1103168404Spjd	list_t *list;
1104209962Smm	list_t *newlist = NULL;
1105168404Spjd	dbuf_dirty_record_t *dr;
	/*
	 * Private copy of the root bp handed to arc_write() below.
	 * NOTE(review): presumably this is the zio->io_bp that
	 * dmu_objset_write_done() kmem_free()s -- confirm against arc_write().
	 */
1106308083Smav	blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
1107308083Smav	*blkptr_copy = *os->os_rootbp;
1108168404Spjd
1109168404Spjd	dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
1110168404Spjd
1111168404Spjd	ASSERT(dmu_tx_is_syncing(tx));
1112168404Spjd	/* XXX the write_done callback should really give us the tx... */
1113168404Spjd	os->os_synctx = tx;
1114168404Spjd
1115168404Spjd	if (os->os_dsl_dataset == NULL) {
1116168404Spjd		/*
1117168404Spjd		 * This is the MOS.  If we have upgraded,
1118168404Spjd		 * spa_max_replication() could change, so reset
1119168404Spjd		 * os_copies here.
1120168404Spjd		 */
1121168404Spjd		os->os_copies = spa_max_replication(os->os_spa);
1122168404Spjd	}
1123168404Spjd
1124168404Spjd	/*
1125168404Spjd	 * Create the root block IO
1126168404Spjd	 */
1127219089Spjd	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
1128219089Spjd	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
1129219089Spjd	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
1130246666Smm	arc_release(os->os_phys_buf, &os->os_phys_buf);
1131185029Spjd
1132219089Spjd	dmu_write_policy(os, NULL, 0, 0, &zp);
1133185029Spjd
1134219089Spjd	zio = arc_write(pio, os->os_spa, tx->tx_txg,
1135308083Smav	    blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
1136304139Savg	    &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
1137304139Savg	    os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
1138185029Spjd
1139168404Spjd	/*
1140209962Smm	 * Sync special dnodes - the parent IO for the sync is the root block
1141168404Spjd	 */
1142219089Spjd	DMU_META_DNODE(os)->dn_zio = zio;
1143219089Spjd	dnode_sync(DMU_META_DNODE(os), tx);
1144168404Spjd
	/* Carry the in-core objset flags into the on-disk objset header. */
1145209962Smm	os->os_phys->os_flags = os->os_flags;
1146209962Smm
1147219089Spjd	if (DMU_USERUSED_DNODE(os) &&
1148219089Spjd	    DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
1149219089Spjd		DMU_USERUSED_DNODE(os)->dn_zio = zio;
1150219089Spjd		dnode_sync(DMU_USERUSED_DNODE(os), tx);
1151219089Spjd		DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
1152219089Spjd		dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
1153209962Smm	}
1154209962Smm
1155168404Spjd	txgoff = tx->tx_txg & TXG_MASK;
1156168404Spjd
1157209962Smm	if (dmu_objset_userused_enabled(os)) {
1158209962Smm		newlist = &os->os_synced_dnodes;
1159209962Smm		/*
1160209962Smm		 * We must create the list here because it uses the
1161209962Smm		 * dn_dirty_link[] of this txg.
1162209962Smm		 */
1163209962Smm		list_create(newlist, sizeof (dnode_t),
1164209962Smm		    offsetof(dnode_t, dn_dirty_link[txgoff]));
1165209962Smm	}
1166168404Spjd
	/*
	 * Sync the freed dnodes before the dirty ones; with accounting
	 * enabled each synced dnode is also moved onto os_synced_dnodes.
	 */
1167209962Smm	dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
1168209962Smm	dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);
1169209962Smm
	/* Issue the pending writes for the meta-dnode's level-0 blocks. */
1170219089Spjd	list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
1171168404Spjd	while (dr = list_head(list)) {
1172248571Smm		ASSERT0(dr->dr_dbuf->db_level);
1173168404Spjd		list_remove(list, dr);
1174168404Spjd		if (dr->dr_zio)
1175168404Spjd			zio_nowait(dr->dr_zio);
1176168404Spjd	}
1177168404Spjd	/*
1178168404Spjd	 * Free intent log blocks up to this tx.
1179168404Spjd	 */
1180168404Spjd	zil_sync(os->os_zil, tx);
1181185029Spjd	os->os_phys->os_zil_header = os->os_zil_header;
	/* Everything is queued beneath the root block write; issue it. */
1182168404Spjd	zio_nowait(zio);
1183168404Spjd}
1184168404Spjd
1185219089Spjdboolean_t
1186219089Spjddmu_objset_is_dirty(objset_t *os, uint64_t txg)
1187219089Spjd{
1188219089Spjd	return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) ||
1189219089Spjd	    !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK]));
1190219089Spjd}
1191219089Spjd
1192209962Smmstatic objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
1193209962Smm
1194168404Spjdvoid
1195209962Smmdmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
1196209962Smm{
1197209962Smm	used_cbs[ost] = cb;
1198209962Smm}
1199209962Smm
1200209962Smmboolean_t
1201219089Spjddmu_objset_userused_enabled(objset_t *os)
1202209962Smm{
1203209962Smm	return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
1204219089Spjd	    used_cbs[os->os_phys->os_type] != NULL &&
1205219089Spjd	    DMU_USERUSED_DNODE(os) != NULL);
1206209962Smm}
1207209962Smm
/*
 * One accumulated accounting delta, keyed by id, kept in an AVL tree
 * ordered by userquota_compare().
 */
1208308586Smavtypedef struct userquota_node {
	/* user or group id this delta applies to (the AVL key) */
1209308586Smav	uint64_t uqn_id;
	/* signed sum of the deltas recorded for uqn_id */
1210308586Smav	int64_t uqn_delta;
	/* AVL linkage */
1211308586Smav	avl_node_t uqn_node;
1212308586Smav} userquota_node_t;
1213308586Smav
/*
 * Pending per-user and per-group deltas, collected by
 * do_userquota_update() and written out by do_userquota_cacheflush().
 */
1214308586Smavtypedef struct userquota_cache {
1215308586Smav	avl_tree_t uqc_user_deltas;
1216308586Smav	avl_tree_t uqc_group_deltas;
1217308586Smav} userquota_cache_t;
1218308586Smav
1219308586Smavstatic int
1220308586Smavuserquota_compare(const void *l, const void *r)
1221308586Smav{
1222308586Smav	const userquota_node_t *luqn = l;
1223308586Smav	const userquota_node_t *ruqn = r;
1224308586Smav
1225308586Smav	if (luqn->uqn_id < ruqn->uqn_id)
1226308586Smav		return (-1);
1227308586Smav	if (luqn->uqn_id > ruqn->uqn_id)
1228308586Smav		return (1);
1229308586Smav	return (0);
1230308586Smav}
1231308586Smav
1232219089Spjdstatic void
1233308586Smavdo_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx)
1234219089Spjd{
1235308586Smav	void *cookie;
1236308586Smav	userquota_node_t *uqn;
1237308586Smav
1238308586Smav	ASSERT(dmu_tx_is_syncing(tx));
1239308586Smav
1240308586Smav	cookie = NULL;
1241308586Smav	while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas,
1242308586Smav	    &cookie)) != NULL) {
1243308586Smav		VERIFY0(zap_increment_int(os, DMU_USERUSED_OBJECT,
1244308586Smav		    uqn->uqn_id, uqn->uqn_delta, tx));
1245308586Smav		kmem_free(uqn, sizeof (*uqn));
1246308586Smav	}
1247308586Smav	avl_destroy(&cache->uqc_user_deltas);
1248308586Smav
1249308586Smav	cookie = NULL;
1250308586Smav	while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas,
1251308586Smav	    &cookie)) != NULL) {
1252308586Smav		VERIFY0(zap_increment_int(os, DMU_GROUPUSED_OBJECT,
1253308586Smav		    uqn->uqn_id, uqn->uqn_delta, tx));
1254308586Smav		kmem_free(uqn, sizeof (*uqn));
1255308586Smav	}
1256308586Smav	avl_destroy(&cache->uqc_group_deltas);
1257308586Smav}
1258308586Smav
1259308586Smavstatic void
1260308586Smavuserquota_update_cache(avl_tree_t *avl, uint64_t id, int64_t delta)
1261308586Smav{
1262308586Smav	userquota_node_t search = { .uqn_id = id };
1263308586Smav	avl_index_t idx;
1264308586Smav
1265308586Smav	userquota_node_t *uqn = avl_find(avl, &search, &idx);
1266308586Smav	if (uqn == NULL) {
1267308586Smav		uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP);
1268308586Smav		uqn->uqn_id = id;
1269308586Smav		avl_insert(avl, uqn, idx);
1270308586Smav	}
1271308586Smav	uqn->uqn_delta += delta;
1272308586Smav}
1273308586Smav
1274308586Smavstatic void
1275308586Smavdo_userquota_update(userquota_cache_t *cache, uint64_t used, uint64_t flags,
1276308586Smav    uint64_t user, uint64_t group, boolean_t subtract)
1277308586Smav{
1278219089Spjd	if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
1279219089Spjd		int64_t delta = DNODE_SIZE + used;
1280219089Spjd		if (subtract)
1281219089Spjd			delta = -delta;
1282308586Smav
1283308586Smav		userquota_update_cache(&cache->uqc_user_deltas, user, delta);
1284308586Smav		userquota_update_cache(&cache->uqc_group_deltas, group, delta);
1285219089Spjd	}
1286219089Spjd}
1287219089Spjd
1288209962Smmvoid
1289219089Spjddmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
1290209962Smm{
1291209962Smm	dnode_t *dn;
1292209962Smm	list_t *list = &os->os_synced_dnodes;
1293308586Smav	userquota_cache_t cache = { 0 };
1294209962Smm
1295209962Smm	ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));
1296209962Smm
1297308586Smav	avl_create(&cache.uqc_user_deltas, userquota_compare,
1298308586Smav	    sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
1299308586Smav	avl_create(&cache.uqc_group_deltas, userquota_compare,
1300308586Smav	    sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
1301308586Smav
1302209962Smm	while (dn = list_head(list)) {
1303219089Spjd		int flags;
1304209962Smm		ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
1305209962Smm		ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
1306209962Smm		    dn->dn_phys->dn_flags &
1307209962Smm		    DNODE_FLAG_USERUSED_ACCOUNTED);
1308209962Smm
1309209962Smm		/* Allocate the user/groupused objects if necessary. */
1310219089Spjd		if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
1311308586Smav			VERIFY0(zap_create_claim(os,
1312209962Smm			    DMU_USERUSED_OBJECT,
1313209962Smm			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
1314308586Smav			VERIFY0(zap_create_claim(os,
1315209962Smm			    DMU_GROUPUSED_OBJECT,
1316209962Smm			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
1317209962Smm		}
1318209962Smm
1319219089Spjd		flags = dn->dn_id_flags;
1320219089Spjd		ASSERT(flags);
1321219089Spjd		if (flags & DN_ID_OLD_EXIST)  {
1322308586Smav			do_userquota_update(&cache,
1323308586Smav			    dn->dn_oldused, dn->dn_oldflags,
1324308586Smav			    dn->dn_olduid, dn->dn_oldgid, B_TRUE);
1325209962Smm		}
1326219089Spjd		if (flags & DN_ID_NEW_EXIST) {
1327308586Smav			do_userquota_update(&cache,
1328308586Smav			    DN_USED_BYTES(dn->dn_phys),
1329219089Spjd			    dn->dn_phys->dn_flags,  dn->dn_newuid,
1330308586Smav			    dn->dn_newgid, B_FALSE);
1331219089Spjd		}
1332209962Smm
1333209962Smm		mutex_enter(&dn->dn_mtx);
1334219089Spjd		dn->dn_oldused = 0;
1335219089Spjd		dn->dn_oldflags = 0;
1336219089Spjd		if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
1337219089Spjd			dn->dn_olduid = dn->dn_newuid;
1338219089Spjd			dn->dn_oldgid = dn->dn_newgid;
1339219089Spjd			dn->dn_id_flags |= DN_ID_OLD_EXIST;
1340219089Spjd			if (dn->dn_bonuslen == 0)
1341219089Spjd				dn->dn_id_flags |= DN_ID_CHKED_SPILL;
1342219089Spjd			else
1343219089Spjd				dn->dn_id_flags |= DN_ID_CHKED_BONUS;
1344219089Spjd		}
1345219089Spjd		dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
1346209962Smm		mutex_exit(&dn->dn_mtx);
1347209962Smm
1348209962Smm		list_remove(list, dn);
1349209962Smm		dnode_rele(dn, list);
1350209962Smm	}
1351308586Smav	do_userquota_cacheflush(os, &cache, tx);
1352209962Smm}
1353209962Smm
1354219089Spjd/*
1355219089Spjd * Returns a pointer to data to find uid/gid from
1356219089Spjd *
1357219089Spjd * If a dirty record for transaction group that is syncing can't
1358219089Spjd * be found then NULL is returned.  In the NULL case it is assumed
1359219089Spjd * the uid/gid aren't changing.
1360219089Spjd */
1361219089Spjdstatic void *
1362219089Spjddmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
1363219089Spjd{
1364219089Spjd	dbuf_dirty_record_t *dr, **drp;
1365219089Spjd	void *data;
1366219089Spjd
1367219089Spjd	if (db->db_dirtycnt == 0)
1368219089Spjd		return (db->db.db_data);  /* Nothing is changing */
1369219089Spjd
1370219089Spjd	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1371219089Spjd		if (dr->dr_txg == tx->tx_txg)
1372219089Spjd			break;
1373219089Spjd
1374219089Spjd	if (dr == NULL) {
1375219089Spjd		data = NULL;
1376219089Spjd	} else {
1377219089Spjd		dnode_t *dn;
1378219089Spjd
1379219089Spjd		DB_DNODE_ENTER(dr->dr_dbuf);
1380219089Spjd		dn = DB_DNODE(dr->dr_dbuf);
1381219089Spjd
1382219089Spjd		if (dn->dn_bonuslen == 0 &&
1383219089Spjd		    dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
1384219089Spjd			data = dr->dt.dl.dr_data->b_data;
1385219089Spjd		else
1386219089Spjd			data = dr->dt.dl.dr_data;
1387219089Spjd
1388219089Spjd		DB_DNODE_EXIT(dr->dr_dbuf);
1389219089Spjd	}
1390219089Spjd
1391219089Spjd	return (data);
1392219089Spjd}
1393219089Spjd
/*
 * Ask the objset type's accounting callback for the uid/gid owning
 * dnode `dn` and store them in dn_olduid/dn_oldgid (`before` set) or
 * dn_newuid/dn_newgid (`before` clear).  The ids are read from the
 * bonus buffer or, for SA dnodes with no bonus data, from the spill
 * block.  On success DN_ID_OLD_EXIST or DN_ID_NEW_EXIST is set, and the
 * source that was examined is recorded with DN_ID_CHKED_BONUS or
 * DN_ID_CHKED_SPILL.  Does nothing when accounting is not enabled.
 */
1394219089Spjdvoid
1395219089Spjddmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
1396219089Spjd{
1397219089Spjd	objset_t *os = dn->dn_objset;
1398219089Spjd	void *data = NULL;
1399219089Spjd	dmu_buf_impl_t *db = NULL;
1400247187Smm	uint64_t *user = NULL;
1401247187Smm	uint64_t *group = NULL;
1402219089Spjd	int flags = dn->dn_id_flags;
1403219089Spjd	int error;
1404219089Spjd	boolean_t have_spill = B_FALSE;
1405219089Spjd
1406219089Spjd	if (!dmu_objset_userused_enabled(dn->dn_objset))
1407219089Spjd		return;
1408219089Spjd
	/* The "before" ids only need to be captured once. */
1409219089Spjd	if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
1410219089Spjd	    DN_ID_CHKED_SPILL)))
1411219089Spjd		return;
1412219089Spjd
	/*
	 * Pick the buffer to read the ids from.  When a dbuf is involved
	 * its db_mtx is taken here and released after the callback runs.
	 */
1413219089Spjd	if (before && dn->dn_bonuslen != 0)
1414219089Spjd		data = DN_BONUS(dn->dn_phys);
1415219089Spjd	else if (!before && dn->dn_bonuslen != 0) {
1416219089Spjd		if (dn->dn_bonus) {
1417219089Spjd			db = dn->dn_bonus;
1418219089Spjd			mutex_enter(&db->db_mtx);
1419219089Spjd			data = dmu_objset_userquota_find_data(db, tx);
1420219089Spjd		} else {
1421219089Spjd			data = DN_BONUS(dn->dn_phys);
1422219089Spjd		}
1423219089Spjd	} else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
1424219089Spjd			int rf = 0;
1425219089Spjd
1426219089Spjd			if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
1427219089Spjd				rf |= DB_RF_HAVESTRUCT;
1428219089Spjd			error = dmu_spill_hold_by_dnode(dn,
1429219089Spjd			    rf | DB_RF_MUST_SUCCEED,
1430219089Spjd			    FTAG, (dmu_buf_t **)&db);
1431219089Spjd			ASSERT(error == 0);
1432219089Spjd			mutex_enter(&db->db_mtx);
1433219089Spjd			data = (before) ? db->db.db_data :
1434219089Spjd			    dmu_objset_userquota_find_data(db, tx);
1435219089Spjd			have_spill = B_TRUE;
1436219089Spjd	} else {
		/* No bonus data and not an SA dnode: nothing to look at. */
1437219089Spjd		mutex_enter(&dn->dn_mtx);
1438219089Spjd		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
1439219089Spjd		mutex_exit(&dn->dn_mtx);
1440219089Spjd		return;
1441219089Spjd	}
1442219089Spjd
	/*
	 * Choose where the callback stores the ids.  In the "after" case
	 * with no data, user and group stay NULL.
	 */
1443219089Spjd	if (before) {
1444219089Spjd		ASSERT(data);
1445219089Spjd		user = &dn->dn_olduid;
1446219089Spjd		group = &dn->dn_oldgid;
1447219089Spjd	} else if (data) {
1448219089Spjd		user = &dn->dn_newuid;
1449219089Spjd		group = &dn->dn_newgid;
1450219089Spjd	}
1451219089Spjd
1452219089Spjd	/*
1453219089Spjd	 * Must always call the callback in case the object
1454219089Spjd	 * type has changed and that type isn't an object type to track
1455219089Spjd	 */
1456219089Spjd	error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data,
1457219089Spjd	    user, group);
1458219089Spjd
1459219089Spjd	/*
1460219089Spjd	 * Preserve existing uid/gid when the callback can't determine
1461219089Spjd	 * what the new uid/gid are and the callback returned EEXIST.
1462219089Spjd	 * The EEXIST error tells us to just use the existing uid/gid.
1463219089Spjd	 * If we don't know what the old values are then just assign
1464219089Spjd	 * them to 0, since that is a new file  being created.
1465219089Spjd	 */
1466219089Spjd	if (!before && data == NULL && error == EEXIST) {
1467219089Spjd		if (flags & DN_ID_OLD_EXIST) {
1468219089Spjd			dn->dn_newuid = dn->dn_olduid;
1469219089Spjd			dn->dn_newgid = dn->dn_oldgid;
1470219089Spjd		} else {
1471219089Spjd			dn->dn_newuid = 0;
1472219089Spjd			dn->dn_newgid = 0;
1473219089Spjd		}
1474219089Spjd		error = 0;
1475219089Spjd	}
1476219089Spjd
1477219089Spjd	if (db)
1478219089Spjd		mutex_exit(&db->db_mtx);
1479219089Spjd
	/* Publish the outcome in dn_id_flags under dn_mtx. */
1480219089Spjd	mutex_enter(&dn->dn_mtx);
1481219089Spjd	if (error == 0 && before)
1482219089Spjd		dn->dn_id_flags |= DN_ID_OLD_EXIST;
1483219089Spjd	if (error == 0 && !before)
1484219089Spjd		dn->dn_id_flags |= DN_ID_NEW_EXIST;
1485219089Spjd
1486219089Spjd	if (have_spill) {
1487219089Spjd		dn->dn_id_flags |= DN_ID_CHKED_SPILL;
1488219089Spjd	} else {
1489219089Spjd		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
1490219089Spjd	}
1491219089Spjd	mutex_exit(&dn->dn_mtx);
	/* Drop the spill hold taken by dmu_spill_hold_by_dnode() above. */
1492219089Spjd	if (have_spill)
1493219089Spjd		dmu_buf_rele((dmu_buf_t *)db, FTAG);
1494219089Spjd}
1495219089Spjd
1496209962Smmboolean_t
1497209962Smmdmu_objset_userspace_present(objset_t *os)
1498209962Smm{
1499219089Spjd	return (os->os_phys->os_flags &
1500209962Smm	    OBJSET_FLAG_USERACCOUNTING_COMPLETE);
1501209962Smm}
1502209962Smm
1503209962Smmint
1504209962Smmdmu_objset_userspace_upgrade(objset_t *os)
1505209962Smm{
1506209962Smm	uint64_t obj;
1507209962Smm	int err = 0;
1508209962Smm
1509209962Smm	if (dmu_objset_userspace_present(os))
1510209962Smm		return (0);
1511219089Spjd	if (!dmu_objset_userused_enabled(os))
1512249195Smm		return (SET_ERROR(ENOTSUP));
1513209962Smm	if (dmu_objset_is_snapshot(os))
1514249195Smm		return (SET_ERROR(EINVAL));
1515209962Smm
1516209962Smm	/*
1517209962Smm	 * We simply need to mark every object dirty, so that it will be
1518209962Smm	 * synced out and now accounted.  If this is called
1519209962Smm	 * concurrently, or if we already did some work before crashing,
1520209962Smm	 * that's fine, since we track each object's accounted state
1521209962Smm	 * independently.
1522209962Smm	 */
1523209962Smm
1524209962Smm	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
1525209962Smm		dmu_tx_t *tx;
1526209962Smm		dmu_buf_t *db;
1527209962Smm		int objerr;
1528209962Smm
1529209962Smm		if (issig(JUSTLOOKING) && issig(FORREAL))
1530249195Smm			return (SET_ERROR(EINTR));
1531209962Smm
1532209962Smm		objerr = dmu_bonus_hold(os, obj, FTAG, &db);
1533248571Smm		if (objerr != 0)
1534209962Smm			continue;
1535209962Smm		tx = dmu_tx_create(os);
1536209962Smm		dmu_tx_hold_bonus(tx, obj);
1537209962Smm		objerr = dmu_tx_assign(tx, TXG_WAIT);
1538248571Smm		if (objerr != 0) {
1539209962Smm			dmu_tx_abort(tx);
1540209962Smm			continue;
1541209962Smm		}
1542209962Smm		dmu_buf_will_dirty(db, tx);
1543209962Smm		dmu_buf_rele(db, FTAG);
1544209962Smm		dmu_tx_commit(tx);
1545209962Smm	}
1546209962Smm
1547219089Spjd	os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
1548209962Smm	txg_wait_synced(dmu_objset_pool(os), 0);
1549209962Smm	return (0);
1550209962Smm}
1551209962Smm
1552209962Smmvoid
1553168404Spjddmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
1554168404Spjd    uint64_t *usedobjsp, uint64_t *availobjsp)
1555168404Spjd{
1556219089Spjd	dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
1557168404Spjd	    usedobjsp, availobjsp);
1558168404Spjd}
1559168404Spjd
1560168404Spjduint64_t
1561168404Spjddmu_objset_fsid_guid(objset_t *os)
1562168404Spjd{
1563219089Spjd	return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
1564168404Spjd}
1565168404Spjd
1566168404Spjdvoid
1567168404Spjddmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
1568168404Spjd{
1569219089Spjd	stat->dds_type = os->os_phys->os_type;
1570219089Spjd	if (os->os_dsl_dataset)
1571219089Spjd		dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
1572168404Spjd}
1573168404Spjd
1574168404Spjdvoid
1575168404Spjddmu_objset_stats(objset_t *os, nvlist_t *nv)
1576168404Spjd{
1577219089Spjd	ASSERT(os->os_dsl_dataset ||
1578219089Spjd	    os->os_phys->os_type == DMU_OST_META);
1579168404Spjd
1580219089Spjd	if (os->os_dsl_dataset != NULL)
1581219089Spjd		dsl_dataset_stats(os->os_dsl_dataset, nv);
1582168404Spjd
1583168404Spjd	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
1584219089Spjd	    os->os_phys->os_type);
1585209962Smm	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
1586209962Smm	    dmu_objset_userspace_present(os));
1587168404Spjd}
1588168404Spjd
1589168404Spjdint
1590168404Spjddmu_objset_is_snapshot(objset_t *os)
1591168404Spjd{
1592219089Spjd	if (os->os_dsl_dataset != NULL)
1593288549Smav		return (os->os_dsl_dataset->ds_is_snapshot);
1594168404Spjd	else
1595168404Spjd		return (B_FALSE);
1596168404Spjd}
1597168404Spjd
1598168404Spjdint
1599185029Spjddmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
1600185029Spjd    boolean_t *conflict)
1601185029Spjd{
1602219089Spjd	dsl_dataset_t *ds = os->os_dsl_dataset;
1603185029Spjd	uint64_t ignored;
1604185029Spjd
1605277585Sdelphij	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
1606249195Smm		return (SET_ERROR(ENOENT));
1607185029Spjd
1608185029Spjd	return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
1609277585Sdelphij	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored,
1610277585Sdelphij	    MT_FIRST, real, maxlen, conflict));
1611185029Spjd}
1612185029Spjd
1613185029Spjdint
1614168404Spjddmu_snapshot_list_next(objset_t *os, int namelen, char *name,
1615185029Spjd    uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
1616168404Spjd{
1617219089Spjd	dsl_dataset_t *ds = os->os_dsl_dataset;
1618168404Spjd	zap_cursor_t cursor;
1619168404Spjd	zap_attribute_t attr;
1620168404Spjd
1621248571Smm	ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
1622248571Smm
1623277585Sdelphij	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
1624249195Smm		return (SET_ERROR(ENOENT));
1625168404Spjd
1626168404Spjd	zap_cursor_init_serialized(&cursor,
1627168404Spjd	    ds->ds_dir->dd_pool->dp_meta_objset,
1628277585Sdelphij	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);
1629168404Spjd
1630168404Spjd	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
1631168404Spjd		zap_cursor_fini(&cursor);
1632249195Smm		return (SET_ERROR(ENOENT));
1633168404Spjd	}
1634168404Spjd
1635168404Spjd	if (strlen(attr.za_name) + 1 > namelen) {
1636168404Spjd		zap_cursor_fini(&cursor);
1637249195Smm		return (SET_ERROR(ENAMETOOLONG));
1638168404Spjd	}
1639168404Spjd
1640168404Spjd	(void) strcpy(name, attr.za_name);
1641168404Spjd	if (idp)
1642168404Spjd		*idp = attr.za_first_integer;
1643185029Spjd	if (case_conflict)
1644185029Spjd		*case_conflict = attr.za_normalization_conflict;
1645168404Spjd	zap_cursor_advance(&cursor);
1646168404Spjd	*offp = zap_cursor_serialize(&cursor);
1647168404Spjd	zap_cursor_fini(&cursor);
1648168404Spjd
1649168404Spjd	return (0);
1650168404Spjd}
1651168404Spjd
1652168404Spjdint
1653168404Spjddmu_dir_list_next(objset_t *os, int namelen, char *name,
1654168404Spjd    uint64_t *idp, uint64_t *offp)
1655168404Spjd{
1656219089Spjd	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
1657168404Spjd	zap_cursor_t cursor;
1658168404Spjd	zap_attribute_t attr;
1659168404Spjd
1660168404Spjd	/* there is no next dir on a snapshot! */
1661219089Spjd	if (os->os_dsl_dataset->ds_object !=
1662277585Sdelphij	    dsl_dir_phys(dd)->dd_head_dataset_obj)
1663249195Smm		return (SET_ERROR(ENOENT));
1664168404Spjd
1665168404Spjd	zap_cursor_init_serialized(&cursor,
1666168404Spjd	    dd->dd_pool->dp_meta_objset,
1667277585Sdelphij	    dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);
1668168404Spjd
1669168404Spjd	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
1670168404Spjd		zap_cursor_fini(&cursor);
1671249195Smm		return (SET_ERROR(ENOENT));
1672168404Spjd	}
1673168404Spjd
1674168404Spjd	if (strlen(attr.za_name) + 1 > namelen) {
1675168404Spjd		zap_cursor_fini(&cursor);
1676249195Smm		return (SET_ERROR(ENAMETOOLONG));
1677168404Spjd	}
1678168404Spjd
1679168404Spjd	(void) strcpy(name, attr.za_name);
1680168404Spjd	if (idp)
1681168404Spjd		*idp = attr.za_first_integer;
1682168404Spjd	zap_cursor_advance(&cursor);
1683168404Spjd	*offp = zap_cursor_serialize(&cursor);
1684168404Spjd	zap_cursor_fini(&cursor);
1685168404Spjd
1686168404Spjd	return (0);
1687168404Spjd}
1688168404Spjd
1689288569Smavtypedef struct dmu_objset_find_ctx {
1690288569Smav	taskq_t		*dc_tq;
1691288569Smav	dsl_pool_t	*dc_dp;
1692288569Smav	uint64_t	dc_ddobj;
1693288569Smav	int		(*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
1694288569Smav	void		*dc_arg;
1695288569Smav	int		dc_flags;
1696288569Smav	kmutex_t	*dc_error_lock;
1697288569Smav	int		*dc_error;
1698288569Smav} dmu_objset_find_ctx_t;
1699288569Smav
1700288569Smavstatic void
1701288569Smavdmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
1702168404Spjd{
1703288569Smav	dsl_pool_t *dp = dcp->dc_dp;
1704288569Smav	dmu_objset_find_ctx_t *child_dcp;
1705248571Smm	dsl_dir_t *dd;
1706248571Smm	dsl_dataset_t *ds;
1707248571Smm	zap_cursor_t zc;
1708248571Smm	zap_attribute_t *attr;
1709248571Smm	uint64_t thisobj;
1710288569Smav	int err = 0;
1711248571Smm
1712288569Smav	/* don't process if there already was an error */
1713288569Smav	if (*dcp->dc_error != 0)
1714288569Smav		goto out;
1715248571Smm
1716288569Smav	err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, NULL, FTAG, &dd);
1717248571Smm	if (err != 0)
1718288569Smav		goto out;
1719248571Smm
1720248571Smm	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
1721248571Smm	if (dd->dd_myname[0] == '$') {
1722248571Smm		dsl_dir_rele(dd, FTAG);
1723288569Smav		goto out;
1724248571Smm	}
1725248571Smm
1726277585Sdelphij	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
1727248571Smm	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
1728248571Smm
1729248571Smm	/*
1730248571Smm	 * Iterate over all children.
1731248571Smm	 */
1732288569Smav	if (dcp->dc_flags & DS_FIND_CHILDREN) {
1733248571Smm		for (zap_cursor_init(&zc, dp->dp_meta_objset,
1734277585Sdelphij		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
1735248571Smm		    zap_cursor_retrieve(&zc, attr) == 0;
1736248571Smm		    (void) zap_cursor_advance(&zc)) {
1737248571Smm			ASSERT3U(attr->za_integer_length, ==,
1738248571Smm			    sizeof (uint64_t));
1739248571Smm			ASSERT3U(attr->za_num_integers, ==, 1);
1740248571Smm
1741288569Smav			child_dcp = kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
1742288569Smav			*child_dcp = *dcp;
1743288569Smav			child_dcp->dc_ddobj = attr->za_first_integer;
1744288569Smav			if (dcp->dc_tq != NULL)
1745288569Smav				(void) taskq_dispatch(dcp->dc_tq,
1746288569Smav				    dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
1747288569Smav			else
1748288569Smav				dmu_objset_find_dp_impl(child_dcp);
1749248571Smm		}
1750248571Smm		zap_cursor_fini(&zc);
1751248571Smm	}
1752248571Smm
1753248571Smm	/*
1754248571Smm	 * Iterate over all snapshots.
1755248571Smm	 */
1756288569Smav	if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
1757248571Smm		dsl_dataset_t *ds;
1758248571Smm		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
1759248571Smm
1760248571Smm		if (err == 0) {
1761277585Sdelphij			uint64_t snapobj;
1762277585Sdelphij
1763277585Sdelphij			snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
1764248571Smm			dsl_dataset_rele(ds, FTAG);
1765248571Smm
1766248571Smm			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
1767248571Smm			    zap_cursor_retrieve(&zc, attr) == 0;
1768248571Smm			    (void) zap_cursor_advance(&zc)) {
1769248571Smm				ASSERT3U(attr->za_integer_length, ==,
1770248571Smm				    sizeof (uint64_t));
1771248571Smm				ASSERT3U(attr->za_num_integers, ==, 1);
1772248571Smm
1773248571Smm				err = dsl_dataset_hold_obj(dp,
1774248571Smm				    attr->za_first_integer, FTAG, &ds);
1775248571Smm				if (err != 0)
1776248571Smm					break;
1777288569Smav				err = dcp->dc_func(dp, ds, dcp->dc_arg);
1778248571Smm				dsl_dataset_rele(ds, FTAG);
1779248571Smm				if (err != 0)
1780248571Smm					break;
1781248571Smm			}
1782248571Smm			zap_cursor_fini(&zc);
1783248571Smm		}
1784248571Smm	}
1785248571Smm
1786248571Smm	dsl_dir_rele(dd, FTAG);
1787248571Smm	kmem_free(attr, sizeof (zap_attribute_t));
1788248571Smm
1789248571Smm	if (err != 0)
1790288569Smav		goto out;
1791248571Smm
1792248571Smm	/*
1793248571Smm	 * Apply to self.
1794248571Smm	 */
1795248571Smm	err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
1796248571Smm	if (err != 0)
1797288569Smav		goto out;
1798288569Smav	err = dcp->dc_func(dp, ds, dcp->dc_arg);
1799248571Smm	dsl_dataset_rele(ds, FTAG);
1800288569Smav
1801288569Smavout:
1802288569Smav	if (err != 0) {
1803288569Smav		mutex_enter(dcp->dc_error_lock);
1804288569Smav		/* only keep first error */
1805288569Smav		if (*dcp->dc_error == 0)
1806288569Smav			*dcp->dc_error = err;
1807288569Smav		mutex_exit(dcp->dc_error_lock);
1808288569Smav	}
1809288569Smav
1810288569Smav	kmem_free(dcp, sizeof (*dcp));
1811185029Spjd}
1812185029Spjd
1813288569Smavstatic void
1814288569Smavdmu_objset_find_dp_cb(void *arg)
1815288569Smav{
1816288569Smav	dmu_objset_find_ctx_t *dcp = arg;
1817288569Smav	dsl_pool_t *dp = dcp->dc_dp;
1818288569Smav
1819288570Smav	/*
1820288570Smav	 * We need to get a pool_config_lock here, as there are several
1821288570Smav	 * asssert(pool_config_held) down the stack. Getting a lock via
1822288570Smav	 * dsl_pool_config_enter is risky, as it might be stalled by a
1823288570Smav	 * pending writer. This would deadlock, as the write lock can
1824288570Smav	 * only be granted when our parent thread gives up the lock.
1825288570Smav	 * The _prio interface gives us priority over a pending writer.
1826288570Smav	 */
1827288570Smav	dsl_pool_config_enter_prio(dp, FTAG);
1828288569Smav
1829288569Smav	dmu_objset_find_dp_impl(dcp);
1830288569Smav
1831288569Smav	dsl_pool_config_exit(dp, FTAG);
1832288569Smav}
1833288569Smav
1834185029Spjd/*
1835288569Smav * Find objsets under and including ddobj, call func(ds) on each.
1836288569Smav * The order for the enumeration is completely undefined.
1837288569Smav * func is called with dsl_pool_config held.
1838288569Smav */
1839288569Smavint
1840288569Smavdmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
1841288569Smav    int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
1842288569Smav{
1843288569Smav	int error = 0;
1844288569Smav	taskq_t *tq = NULL;
1845288569Smav	int ntasks;
1846288569Smav	dmu_objset_find_ctx_t *dcp;
1847288569Smav	kmutex_t err_lock;
1848288569Smav
1849288569Smav	mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
1850288569Smav	dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
1851288569Smav	dcp->dc_tq = NULL;
1852288569Smav	dcp->dc_dp = dp;
1853288569Smav	dcp->dc_ddobj = ddobj;
1854288569Smav	dcp->dc_func = func;
1855288569Smav	dcp->dc_arg = arg;
1856288569Smav	dcp->dc_flags = flags;
1857288569Smav	dcp->dc_error_lock = &err_lock;
1858288569Smav	dcp->dc_error = &error;
1859288569Smav
1860288569Smav	if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
1861288569Smav		/*
1862288569Smav		 * In case a write lock is held we can't make use of
1863288569Smav		 * parallelism, as down the stack of the worker threads
1864288569Smav		 * the lock is asserted via dsl_pool_config_held.
1865288569Smav		 * In case of a read lock this is solved by getting a read
1866288569Smav		 * lock in each worker thread, which isn't possible in case
1867288569Smav		 * of a writer lock. So we fall back to the synchronous path
1868288569Smav		 * here.
1869288569Smav		 * In the future it might be possible to get some magic into
1870288569Smav		 * dsl_pool_config_held in a way that it returns true for
1871288569Smav		 * the worker threads so that a single lock held from this
1872288569Smav		 * thread suffices. For now, stay single threaded.
1873288569Smav		 */
1874288569Smav		dmu_objset_find_dp_impl(dcp);
1875297104Smav		mutex_destroy(&err_lock);
1876288569Smav
1877288569Smav		return (error);
1878288569Smav	}
1879288569Smav
1880288569Smav	ntasks = dmu_find_threads;
1881288569Smav	if (ntasks == 0)
1882288569Smav		ntasks = vdev_count_leaves(dp->dp_spa) * 4;
1883288569Smav	tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks,
1884288569Smav	    INT_MAX, 0);
1885288569Smav	if (tq == NULL) {
1886288569Smav		kmem_free(dcp, sizeof (*dcp));
1887297104Smav		mutex_destroy(&err_lock);
1888297104Smav
1889288569Smav		return (SET_ERROR(ENOMEM));
1890288569Smav	}
1891288569Smav	dcp->dc_tq = tq;
1892288569Smav
1893288569Smav	/* dcp will be freed by task */
1894288569Smav	(void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
1895288569Smav
1896288569Smav	/*
1897288569Smav	 * PORTING: this code relies on the property of taskq_wait to wait
1898288569Smav	 * until no more tasks are queued and no more tasks are active. As
1899288569Smav	 * we always queue new tasks from within other tasks, task_wait
1900288569Smav	 * reliably waits for the full recursion to finish, even though we
1901288569Smav	 * enqueue new tasks after taskq_wait has been called.
1902288569Smav	 * On platforms other than illumos, taskq_wait may not have this
1903288569Smav	 * property.
1904288569Smav	 */
1905288569Smav	taskq_wait(tq);
1906288569Smav	taskq_destroy(tq);
1907288569Smav	mutex_destroy(&err_lock);
1908288569Smav
1909288569Smav	return (error);
1910288569Smav}
1911288569Smav
1912288569Smav/*
1913248571Smm * Find all objsets under name, and for each, call 'func(child_name, arg)'.
1914248571Smm * The dp_config_rwlock must not be held when this is called, and it
1915248571Smm * will not be held when the callback is called.
1916248571Smm * Therefore this function should only be used when the pool is not changing
1917248571Smm * (e.g. in syncing context), or the callback can deal with the possible races.
1918185029Spjd */
1919248571Smmstatic int
1920248571Smmdmu_objset_find_impl(spa_t *spa, const char *name,
1921248571Smm    int func(const char *, void *), void *arg, int flags)
1922185029Spjd{
1923168404Spjd	dsl_dir_t *dd;
1924248571Smm	dsl_pool_t *dp = spa_get_dsl(spa);
1925185029Spjd	dsl_dataset_t *ds;
1926168404Spjd	zap_cursor_t zc;
1927168498Spjd	zap_attribute_t *attr;
1928168404Spjd	char *child;
1929185029Spjd	uint64_t thisobj;
1930185029Spjd	int err;
1931168404Spjd
1932248571Smm	dsl_pool_config_enter(dp, FTAG);
1933248571Smm
1934248571Smm	err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
1935248571Smm	if (err != 0) {
1936248571Smm		dsl_pool_config_exit(dp, FTAG);
1937168404Spjd		return (err);
1938248571Smm	}
1939168404Spjd
1940185029Spjd	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
1941185029Spjd	if (dd->dd_myname[0] == '$') {
1942248571Smm		dsl_dir_rele(dd, FTAG);
1943248571Smm		dsl_pool_config_exit(dp, FTAG);
1944185029Spjd		return (0);
1945185029Spjd	}
1946185029Spjd
1947277585Sdelphij	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
1948168498Spjd	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
1949168404Spjd
1950168404Spjd	/*
1951168404Spjd	 * Iterate over all children.
1952168404Spjd	 */
1953168404Spjd	if (flags & DS_FIND_CHILDREN) {
1954185029Spjd		for (zap_cursor_init(&zc, dp->dp_meta_objset,
1955277585Sdelphij		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
1956168498Spjd		    zap_cursor_retrieve(&zc, attr) == 0;
1957168404Spjd		    (void) zap_cursor_advance(&zc)) {
1958248571Smm			ASSERT3U(attr->za_integer_length, ==,
1959248571Smm			    sizeof (uint64_t));
1960248571Smm			ASSERT3U(attr->za_num_integers, ==, 1);
1961168404Spjd
1962219089Spjd			child = kmem_asprintf("%s/%s", name, attr->za_name);
1963248571Smm			dsl_pool_config_exit(dp, FTAG);
1964248571Smm			err = dmu_objset_find_impl(spa, child,
1965248571Smm			    func, arg, flags);
1966248571Smm			dsl_pool_config_enter(dp, FTAG);
1967219089Spjd			strfree(child);
1968248571Smm			if (err != 0)
1969168404Spjd				break;
1970168404Spjd		}
1971168404Spjd		zap_cursor_fini(&zc);
1972168404Spjd
1973248571Smm		if (err != 0) {
1974248571Smm			dsl_dir_rele(dd, FTAG);
1975248571Smm			dsl_pool_config_exit(dp, FTAG);
1976168498Spjd			kmem_free(attr, sizeof (zap_attribute_t));
1977168404Spjd			return (err);
1978168404Spjd		}
1979168404Spjd	}
1980168404Spjd
1981168404Spjd	/*
1982168404Spjd	 * Iterate over all snapshots.
1983168404Spjd	 */
1984185029Spjd	if (flags & DS_FIND_SNAPSHOTS) {
1985185029Spjd		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
1986168404Spjd
1987185029Spjd		if (err == 0) {
1988277585Sdelphij			uint64_t snapobj;
1989277585Sdelphij
1990277585Sdelphij			snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
1991185029Spjd			dsl_dataset_rele(ds, FTAG);
1992168404Spjd
1993185029Spjd			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
1994185029Spjd			    zap_cursor_retrieve(&zc, attr) == 0;
1995185029Spjd			    (void) zap_cursor_advance(&zc)) {
1996248571Smm				ASSERT3U(attr->za_integer_length, ==,
1997185029Spjd				    sizeof (uint64_t));
1998248571Smm				ASSERT3U(attr->za_num_integers, ==, 1);
1999168404Spjd
2000219089Spjd				child = kmem_asprintf("%s@%s",
2001219089Spjd				    name, attr->za_name);
2002248571Smm				dsl_pool_config_exit(dp, FTAG);
2003248571Smm				err = func(child, arg);
2004248571Smm				dsl_pool_config_enter(dp, FTAG);
2005219089Spjd				strfree(child);
2006248571Smm				if (err != 0)
2007185029Spjd					break;
2008185029Spjd			}
2009185029Spjd			zap_cursor_fini(&zc);
2010168404Spjd		}
2011168404Spjd	}
2012168404Spjd
2013248571Smm	dsl_dir_rele(dd, FTAG);
2014168498Spjd	kmem_free(attr, sizeof (zap_attribute_t));
2015248571Smm	dsl_pool_config_exit(dp, FTAG);
2016168404Spjd
2017248571Smm	if (err != 0)
2018168404Spjd		return (err);
2019168404Spjd
2020248571Smm	/* Apply to self. */
2021248571Smm	return (func(name, arg));
2022168404Spjd}
2023185029Spjd
2024248571Smm/*
2025248571Smm * See comment above dmu_objset_find_impl().
2026248571Smm */
2027207626Smmint
2028248571Smmdmu_objset_find(char *name, int func(const char *, void *), void *arg,
2029248571Smm    int flags)
2030207626Smm{
2031248571Smm	spa_t *spa;
2032248571Smm	int error;
2033207626Smm
2034248571Smm	error = spa_open(name, &spa, FTAG);
2035248571Smm	if (error != 0)
2036248571Smm		return (error);
2037248571Smm	error = dmu_objset_find_impl(spa, name, func, arg, flags);
2038248571Smm	spa_close(spa, FTAG);
2039248571Smm	return (error);
2040207626Smm}
2041207626Smm
2042185029Spjdvoid
2043185029Spjddmu_objset_set_user(objset_t *os, void *user_ptr)
2044185029Spjd{
2045219089Spjd	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
2046219089Spjd	os->os_user_ptr = user_ptr;
2047185029Spjd}
2048185029Spjd
2049185029Spjdvoid *
2050185029Spjddmu_objset_get_user(objset_t *os)
2051185029Spjd{
2052219089Spjd	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
2053219089Spjd	return (os->os_user_ptr);
2054185029Spjd}
2055248571Smm
2056248571Smm/*
2057248571Smm * Determine name of filesystem, given name of snapshot.
2058307122Smav * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes
2059248571Smm */
2060248571Smmint
2061248571Smmdmu_fsname(const char *snapname, char *buf)
2062248571Smm{
2063248571Smm	char *atp = strchr(snapname, '@');
2064248571Smm	if (atp == NULL)
2065249195Smm		return (SET_ERROR(EINVAL));
2066307122Smav	if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN)
2067249195Smm		return (SET_ERROR(ENAMETOOLONG));
2068248571Smm	(void) strlcpy(buf, snapname, atp - snapname + 1);
2069248571Smm	return (0);
2070248571Smm}
2071