zfs_vfsops.c revision 222199
1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23168404Spjd */
24168404Spjd
25219089Spjd/* Portions Copyright 2010 Robert Milkowski */
26219089Spjd
27168404Spjd#include <sys/types.h>
28168404Spjd#include <sys/param.h>
29168404Spjd#include <sys/systm.h>
30168404Spjd#include <sys/kernel.h>
31168404Spjd#include <sys/sysmacros.h>
32168404Spjd#include <sys/kmem.h>
33168404Spjd#include <sys/acl.h>
34168404Spjd#include <sys/vnode.h>
35168404Spjd#include <sys/vfs.h>
36168404Spjd#include <sys/mntent.h>
37168404Spjd#include <sys/mount.h>
38168404Spjd#include <sys/cmn_err.h>
39168404Spjd#include <sys/zfs_znode.h>
40168404Spjd#include <sys/zfs_dir.h>
41168404Spjd#include <sys/zil.h>
42168404Spjd#include <sys/fs/zfs.h>
43168404Spjd#include <sys/dmu.h>
44168404Spjd#include <sys/dsl_prop.h>
45168404Spjd#include <sys/dsl_dataset.h>
46185029Spjd#include <sys/dsl_deleg.h>
47168404Spjd#include <sys/spa.h>
48168404Spjd#include <sys/zap.h>
49219089Spjd#include <sys/sa.h>
50168404Spjd#include <sys/varargs.h>
51168962Spjd#include <sys/policy.h>
52168404Spjd#include <sys/atomic.h>
53168404Spjd#include <sys/zfs_ioctl.h>
54168404Spjd#include <sys/zfs_ctldir.h>
55185029Spjd#include <sys/zfs_fuid.h>
56168962Spjd#include <sys/sunddi.h>
57168404Spjd#include <sys/dnlc.h>
58185029Spjd#include <sys/dmu_objset.h>
59185029Spjd#include <sys/spa_boot.h>
60219089Spjd#include <sys/sa.h>
61219089Spjd#include "zfs_comutil.h"
62168404Spjd
/*
 * Mutex registered under the name "zfs_debug"; MTX_SYSINIT arranges for it
 * to be initialized as a default (MTX_DEF) mutex.
 * NOTE(review): presumably serializes ZFS debug output -- confirm against
 * its users, which are not visible in this part of the file.
 */
63168404Spjdstruct mtx zfs_debug_mtx;
64168404SpjdMTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
65185029Spjd
/* Root node of the vfs.zfs sysctl subtree. */
66168404SpjdSYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
67185029Spjd
/* vfs.zfs.super_owner: read/write integer knob (see description string). */
68219089Spjdint zfs_super_owner;
69185029SpjdSYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
70185029Spjd    "File system owner can perform privileged operation on his file systems");
71185029Spjd
/*
 * vfs.zfs.debug: read/write debug level, also exposed as the loader
 * tunable "vfs.zfs.debug".
 */
72219089Spjdint zfs_debug_level;
73168713SpjdTUNABLE_INT("vfs.zfs.debug", &zfs_debug_level);
74168404SpjdSYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0,
75168404Spjd    "Debug level");
76168404Spjd
/*
 * vfs.zfs.version.{acl,spa,zpl}: read-only sysctls exporting the
 * ZFS_ACL_VERSION, SPA_VERSION and ZPL_VERSION constants this file was
 * compiled with.
 */
77185029SpjdSYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
78185029Spjdstatic int zfs_version_acl = ZFS_ACL_VERSION;
79185029SpjdSYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
80185029Spjd    "ZFS_ACL_VERSION");
81185029Spjdstatic int zfs_version_spa = SPA_VERSION;
82185029SpjdSYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
83185029Spjd    "SPA_VERSION");
84185029Spjdstatic int zfs_version_zpl = ZPL_VERSION;
85185029SpjdSYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
86185029Spjd    "ZPL_VERSION");
87185029Spjd
/*
 * Forward declarations: the VFS entry points installed in zfs_vfsops,
 * followed by two file-local helpers (zfs_objset_close, zfs_freevfs).
 */
88191990Sattiliostatic int zfs_mount(vfs_t *vfsp);
89191990Sattiliostatic int zfs_umount(vfs_t *vfsp, int fflag);
90191990Sattiliostatic int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
91191990Sattiliostatic int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
92168404Spjdstatic int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
93191990Sattiliostatic int zfs_sync(vfs_t *vfsp, int waitfor);
94196982Spjdstatic int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
95196982Spjd    struct ucred **credanonp, int *numsecflavors, int **secflavors);
96222167Srmacklemstatic int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
97168404Spjdstatic void zfs_objset_close(zfsvfs_t *zfsvfs);
98168404Spjdstatic void zfs_freevfs(vfs_t *vfsp);
99168404Spjd
/*
 * VFS operations vector for ZFS: each vfs_* slot points at the matching
 * zfs_* implementation declared above.
 */
100168404Spjdstatic struct vfsops zfs_vfsops = {
101168404Spjd	.vfs_mount =		zfs_mount,
102168404Spjd	.vfs_unmount =		zfs_umount,
103168404Spjd	.vfs_root =		zfs_root,
104168404Spjd	.vfs_statfs =		zfs_statfs,
105168404Spjd	.vfs_vget =		zfs_vget,
106168404Spjd	.vfs_sync =		zfs_sync,
107196982Spjd	.vfs_checkexp =		zfs_checkexp,
108168404Spjd	.vfs_fhtovp =		zfs_fhtovp,
109168404Spjd};
110168404Spjd
/* Register the "zfs" file system type with the JAIL and DELEGADMIN flags. */
111185029SpjdVFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
112168404Spjd
113168404Spjd/*
 * Count of active ZFS file systems.  Keeping this count is what lets us
 * prevent the module from being unloaded after a forced unmount
 * (umount -f).
 */
118168404Spjdstatic uint32_t	zfs_active_fs_count = 0;
119168404Spjd
119168404Spjd
120168404Spjd/*ARGSUSED*/
121168404Spjdstatic int
122191990Sattiliozfs_sync(vfs_t *vfsp, int waitfor)
123168404Spjd{
124168404Spjd
125168404Spjd	/*
126168404Spjd	 * Data integrity is job one.  We don't want a compromised kernel
127168404Spjd	 * writing to the storage pool, so we never sync during panic.
128168404Spjd	 */
129168404Spjd	if (panicstr)
130168404Spjd		return (0);
131168404Spjd
132168404Spjd	if (vfsp != NULL) {
133168404Spjd		/*
134168404Spjd		 * Sync a specific filesystem.
135168404Spjd		 */
136168404Spjd		zfsvfs_t *zfsvfs = vfsp->vfs_data;
137209962Smm		dsl_pool_t *dp;
138168404Spjd		int error;
139168404Spjd
140191990Sattilio		error = vfs_stdsync(vfsp, waitfor);
141168404Spjd		if (error != 0)
142168404Spjd			return (error);
143168404Spjd
144168404Spjd		ZFS_ENTER(zfsvfs);
145209962Smm		dp = dmu_objset_pool(zfsvfs->z_os);
146209962Smm
147209962Smm		/*
148209962Smm		 * If the system is shutting down, then skip any
149209962Smm		 * filesystems which may exist on a suspended pool.
150209962Smm		 */
151209962Smm		if (sys_shutdown && spa_suspended(dp->dp_spa)) {
152209962Smm			ZFS_EXIT(zfsvfs);
153209962Smm			return (0);
154209962Smm		}
155209962Smm
156168404Spjd		if (zfsvfs->z_log != NULL)
157219089Spjd			zil_commit(zfsvfs->z_log, 0);
158219089Spjd
159168404Spjd		ZFS_EXIT(zfsvfs);
160168404Spjd	} else {
161168404Spjd		/*
162168404Spjd		 * Sync all ZFS filesystems.  This is what happens when you
163168404Spjd		 * run sync(1M).  Unlike other filesystems, ZFS honors the
164168404Spjd		 * request by waiting for all pools to commit all dirty data.
165168404Spjd		 */
166168404Spjd		spa_sync_allpools();
167168404Spjd	}
168168404Spjd
169168404Spjd	return (0);
170168404Spjd}
171168404Spjd
172219089Spjd#ifndef __FreeBSD__
173219089Spjdstatic int
174219089Spjdzfs_create_unique_device(dev_t *dev)
175219089Spjd{
176219089Spjd	major_t new_major;
177219089Spjd
178219089Spjd	do {
179219089Spjd		ASSERT3U(zfs_minor, <=, MAXMIN32);
180219089Spjd		minor_t start = zfs_minor;
181219089Spjd		do {
182219089Spjd			mutex_enter(&zfs_dev_mtx);
183219089Spjd			if (zfs_minor >= MAXMIN32) {
184219089Spjd				/*
185219089Spjd				 * If we're still using the real major
186219089Spjd				 * keep out of /dev/zfs and /dev/zvol minor
187219089Spjd				 * number space.  If we're using a getudev()'ed
188219089Spjd				 * major number, we can use all of its minors.
189219089Spjd				 */
190219089Spjd				if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
191219089Spjd					zfs_minor = ZFS_MIN_MINOR;
192219089Spjd				else
193219089Spjd					zfs_minor = 0;
194219089Spjd			} else {
195219089Spjd				zfs_minor++;
196219089Spjd			}
197219089Spjd			*dev = makedevice(zfs_major, zfs_minor);
198219089Spjd			mutex_exit(&zfs_dev_mtx);
199219089Spjd		} while (vfs_devismounted(*dev) && zfs_minor != start);
200219089Spjd		if (zfs_minor == start) {
201219089Spjd			/*
202219089Spjd			 * We are using all ~262,000 minor numbers for the
203219089Spjd			 * current major number.  Create a new major number.
204219089Spjd			 */
205219089Spjd			if ((new_major = getudev()) == (major_t)-1) {
206219089Spjd				cmn_err(CE_WARN,
207219089Spjd				    "zfs_mount: Can't get unique major "
208219089Spjd				    "device number.");
209219089Spjd				return (-1);
210219089Spjd			}
211219089Spjd			mutex_enter(&zfs_dev_mtx);
212219089Spjd			zfs_major = new_major;
213219089Spjd			zfs_minor = 0;
214219089Spjd
215219089Spjd			mutex_exit(&zfs_dev_mtx);
216219089Spjd		} else {
217219089Spjd			break;
218219089Spjd		}
219219089Spjd		/* CONSTANTCONDITION */
220219089Spjd	} while (1);
221219089Spjd
222219089Spjd	return (0);
223219089Spjd}
224219089Spjd#endif	/* !__FreeBSD__ */
225219089Spjd
226168404Spjdstatic void
227168404Spjdatime_changed_cb(void *arg, uint64_t newval)
228168404Spjd{
229168404Spjd	zfsvfs_t *zfsvfs = arg;
230168404Spjd
231168404Spjd	if (newval == TRUE) {
232168404Spjd		zfsvfs->z_atime = TRUE;
233168404Spjd		zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
234168404Spjd		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
235168404Spjd		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
236168404Spjd	} else {
237168404Spjd		zfsvfs->z_atime = FALSE;
238168404Spjd		zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
239168404Spjd		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
240168404Spjd		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
241168404Spjd	}
242168404Spjd}
243168404Spjd
244168404Spjdstatic void
245168404Spjdxattr_changed_cb(void *arg, uint64_t newval)
246168404Spjd{
247168404Spjd	zfsvfs_t *zfsvfs = arg;
248168404Spjd
249168404Spjd	if (newval == TRUE) {
250168404Spjd		/* XXX locking on vfs_flag? */
251168404Spjd#ifdef TODO
252168404Spjd		zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
253168404Spjd#endif
254168404Spjd		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
255168404Spjd		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
256168404Spjd	} else {
257168404Spjd		/* XXX locking on vfs_flag? */
258168404Spjd#ifdef TODO
259168404Spjd		zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
260168404Spjd#endif
261168404Spjd		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
262168404Spjd		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
263168404Spjd	}
264168404Spjd}
265168404Spjd
266168404Spjdstatic void
267168404Spjdblksz_changed_cb(void *arg, uint64_t newval)
268168404Spjd{
269168404Spjd	zfsvfs_t *zfsvfs = arg;
270168404Spjd
271168404Spjd	if (newval < SPA_MINBLOCKSIZE ||
272168404Spjd	    newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
273168404Spjd		newval = SPA_MAXBLOCKSIZE;
274168404Spjd
275168404Spjd	zfsvfs->z_max_blksz = newval;
276204101Spjd	zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
277168404Spjd}
278168404Spjd
279168404Spjdstatic void
280168404Spjdreadonly_changed_cb(void *arg, uint64_t newval)
281168404Spjd{
282168404Spjd	zfsvfs_t *zfsvfs = arg;
283168404Spjd
284168404Spjd	if (newval) {
285168404Spjd		/* XXX locking on vfs_flag? */
286168404Spjd		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
287168404Spjd		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
288168404Spjd		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
289168404Spjd	} else {
290168404Spjd		/* XXX locking on vfs_flag? */
291168404Spjd		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
292168404Spjd		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
293168404Spjd		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
294168404Spjd	}
295168404Spjd}
296168404Spjd
297168404Spjdstatic void
298168404Spjdsetuid_changed_cb(void *arg, uint64_t newval)
299168404Spjd{
300168404Spjd	zfsvfs_t *zfsvfs = arg;
301168404Spjd
302168404Spjd	if (newval == FALSE) {
303168404Spjd		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
304168404Spjd		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
305168404Spjd		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
306168404Spjd	} else {
307168404Spjd		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
308168404Spjd		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
309168404Spjd		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
310168404Spjd	}
311168404Spjd}
312168404Spjd
313168404Spjdstatic void
314168404Spjdexec_changed_cb(void *arg, uint64_t newval)
315168404Spjd{
316168404Spjd	zfsvfs_t *zfsvfs = arg;
317168404Spjd
318168404Spjd	if (newval == FALSE) {
319168404Spjd		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
320168404Spjd		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
321168404Spjd		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
322168404Spjd	} else {
323168404Spjd		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
324168404Spjd		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
325168404Spjd		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
326168404Spjd	}
327168404Spjd}
328168404Spjd
329185029Spjd/*
330185029Spjd * The nbmand mount option can be changed at mount time.
331185029Spjd * We can't allow it to be toggled on live file systems or incorrect
332185029Spjd * behavior may be seen from cifs clients
333185029Spjd *
334185029Spjd * This property isn't registered via dsl_prop_register(), but this callback
335185029Spjd * will be called when a file system is first mounted
336185029Spjd */
337168404Spjdstatic void
338185029Spjdnbmand_changed_cb(void *arg, uint64_t newval)
339185029Spjd{
340185029Spjd	zfsvfs_t *zfsvfs = arg;
341185029Spjd	if (newval == FALSE) {
342185029Spjd		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
343185029Spjd		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
344185029Spjd	} else {
345185029Spjd		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
346185029Spjd		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
347185029Spjd	}
348185029Spjd}
349185029Spjd
350185029Spjdstatic void
351168404Spjdsnapdir_changed_cb(void *arg, uint64_t newval)
352168404Spjd{
353168404Spjd	zfsvfs_t *zfsvfs = arg;
354168404Spjd
355168404Spjd	zfsvfs->z_show_ctldir = newval;
356168404Spjd}
357168404Spjd
358168404Spjdstatic void
359185029Spjdvscan_changed_cb(void *arg, uint64_t newval)
360185029Spjd{
361185029Spjd	zfsvfs_t *zfsvfs = arg;
362185029Spjd
363185029Spjd	zfsvfs->z_vscan = newval;
364185029Spjd}
365185029Spjd
366185029Spjdstatic void
367168404Spjdacl_inherit_changed_cb(void *arg, uint64_t newval)
368168404Spjd{
369168404Spjd	zfsvfs_t *zfsvfs = arg;
370168404Spjd
371168404Spjd	zfsvfs->z_acl_inherit = newval;
372168404Spjd}
373168404Spjd
374168404Spjdstatic int
375168404Spjdzfs_register_callbacks(vfs_t *vfsp)
376168404Spjd{
377168404Spjd	struct dsl_dataset *ds = NULL;
378168404Spjd	objset_t *os = NULL;
379168404Spjd	zfsvfs_t *zfsvfs = NULL;
380185029Spjd	uint64_t nbmand;
381219089Spjd	int readonly, do_readonly = B_FALSE;
382219089Spjd	int setuid, do_setuid = B_FALSE;
383219089Spjd	int exec, do_exec = B_FALSE;
384219089Spjd	int xattr, do_xattr = B_FALSE;
385219089Spjd	int atime, do_atime = B_FALSE;
386168404Spjd	int error = 0;
387168404Spjd
388168404Spjd	ASSERT(vfsp);
389168404Spjd	zfsvfs = vfsp->vfs_data;
390168404Spjd	ASSERT(zfsvfs);
391168404Spjd	os = zfsvfs->z_os;
392168404Spjd
393168404Spjd	/*
394196965Spjd	 * This function can be called for a snapshot when we update snapshot's
395196965Spjd	 * mount point, which isn't really supported.
396196965Spjd	 */
397196965Spjd	if (dmu_objset_is_snapshot(os))
398196965Spjd		return (EOPNOTSUPP);
399196965Spjd
400196965Spjd	/*
401168404Spjd	 * The act of registering our callbacks will destroy any mount
402168404Spjd	 * options we may have.  In order to enable temporary overrides
403168404Spjd	 * of mount options, we stash away the current values and
404168404Spjd	 * restore them after we register the callbacks.
405168404Spjd	 */
406219089Spjd	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
407219089Spjd	    !spa_writeable(dmu_objset_spa(os))) {
408168404Spjd		readonly = B_TRUE;
409168404Spjd		do_readonly = B_TRUE;
410168404Spjd	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
411168404Spjd		readonly = B_FALSE;
412168404Spjd		do_readonly = B_TRUE;
413168404Spjd	}
414168404Spjd	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
415168404Spjd		setuid = B_FALSE;
416168404Spjd		do_setuid = B_TRUE;
417168404Spjd	} else {
418168404Spjd		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
419168404Spjd			setuid = B_FALSE;
420168404Spjd			do_setuid = B_TRUE;
421168404Spjd		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
422168404Spjd			setuid = B_TRUE;
423168404Spjd			do_setuid = B_TRUE;
424168404Spjd		}
425168404Spjd	}
426168404Spjd	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
427168404Spjd		exec = B_FALSE;
428168404Spjd		do_exec = B_TRUE;
429168404Spjd	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
430168404Spjd		exec = B_TRUE;
431168404Spjd		do_exec = B_TRUE;
432168404Spjd	}
433168404Spjd	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
434168404Spjd		xattr = B_FALSE;
435168404Spjd		do_xattr = B_TRUE;
436168404Spjd	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
437168404Spjd		xattr = B_TRUE;
438168404Spjd		do_xattr = B_TRUE;
439168404Spjd	}
440185029Spjd	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
441185029Spjd		atime = B_FALSE;
442185029Spjd		do_atime = B_TRUE;
443185029Spjd	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
444185029Spjd		atime = B_TRUE;
445185029Spjd		do_atime = B_TRUE;
446185029Spjd	}
447168404Spjd
448168404Spjd	/*
449185029Spjd	 * nbmand is a special property.  It can only be changed at
450185029Spjd	 * mount time.
451185029Spjd	 *
452185029Spjd	 * This is weird, but it is documented to only be changeable
453185029Spjd	 * at mount time.
454185029Spjd	 */
455185029Spjd	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
456185029Spjd		nbmand = B_FALSE;
457185029Spjd	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
458185029Spjd		nbmand = B_TRUE;
459185029Spjd	} else {
460185029Spjd		char osname[MAXNAMELEN];
461185029Spjd
462185029Spjd		dmu_objset_name(os, osname);
463185029Spjd		if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
464185029Spjd		    NULL)) {
465185029Spjd			return (error);
466185029Spjd		}
467185029Spjd	}
468185029Spjd
469185029Spjd	/*
470168404Spjd	 * Register property callbacks.
471168404Spjd	 *
472168404Spjd	 * It would probably be fine to just check for i/o error from
473168404Spjd	 * the first prop_register(), but I guess I like to go
474168404Spjd	 * overboard...
475168404Spjd	 */
476168404Spjd	ds = dmu_objset_ds(os);
477168404Spjd	error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
478168404Spjd	error = error ? error : dsl_prop_register(ds,
479168404Spjd	    "xattr", xattr_changed_cb, zfsvfs);
480168404Spjd	error = error ? error : dsl_prop_register(ds,
481168404Spjd	    "recordsize", blksz_changed_cb, zfsvfs);
482168404Spjd	error = error ? error : dsl_prop_register(ds,
483168404Spjd	    "readonly", readonly_changed_cb, zfsvfs);
484168404Spjd	error = error ? error : dsl_prop_register(ds,
485168404Spjd	    "setuid", setuid_changed_cb, zfsvfs);
486168404Spjd	error = error ? error : dsl_prop_register(ds,
487168404Spjd	    "exec", exec_changed_cb, zfsvfs);
488168404Spjd	error = error ? error : dsl_prop_register(ds,
489168404Spjd	    "snapdir", snapdir_changed_cb, zfsvfs);
490168404Spjd	error = error ? error : dsl_prop_register(ds,
491168404Spjd	    "aclinherit", acl_inherit_changed_cb, zfsvfs);
492185029Spjd	error = error ? error : dsl_prop_register(ds,
493185029Spjd	    "vscan", vscan_changed_cb, zfsvfs);
494168404Spjd	if (error)
495168404Spjd		goto unregister;
496168404Spjd
497168404Spjd	/*
498168404Spjd	 * Invoke our callbacks to restore temporary mount options.
499168404Spjd	 */
500168404Spjd	if (do_readonly)
501168404Spjd		readonly_changed_cb(zfsvfs, readonly);
502168404Spjd	if (do_setuid)
503168404Spjd		setuid_changed_cb(zfsvfs, setuid);
504168404Spjd	if (do_exec)
505168404Spjd		exec_changed_cb(zfsvfs, exec);
506168404Spjd	if (do_xattr)
507168404Spjd		xattr_changed_cb(zfsvfs, xattr);
508185029Spjd	if (do_atime)
509185029Spjd		atime_changed_cb(zfsvfs, atime);
510168404Spjd
511185029Spjd	nbmand_changed_cb(zfsvfs, nbmand);
512185029Spjd
513168404Spjd	return (0);
514168404Spjd
515168404Spjdunregister:
516168404Spjd	/*
517168404Spjd	 * We may attempt to unregister some callbacks that are not
518168404Spjd	 * registered, but this is OK; it will simply return ENOMSG,
519168404Spjd	 * which we will ignore.
520168404Spjd	 */
521168404Spjd	(void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
522168404Spjd	(void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
523168404Spjd	(void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
524168404Spjd	(void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
525168404Spjd	(void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
526168404Spjd	(void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
527168404Spjd	(void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
528168404Spjd	(void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
529168404Spjd	    zfsvfs);
530185029Spjd	(void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
531168404Spjd	return (error);
532168404Spjd
533168404Spjd}
534168404Spjd
535219089Spjdstatic int
536219089Spjdzfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
537219089Spjd    uint64_t *userp, uint64_t *groupp)
538209962Smm{
539219089Spjd	znode_phys_t *znp = data;
540219089Spjd	int error = 0;
541209962Smm
542219089Spjd	/*
543219089Spjd	 * Is it a valid type of object to track?
544219089Spjd	 */
545219089Spjd	if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
546219089Spjd		return (ENOENT);
547209962Smm
548219089Spjd	/*
549219089Spjd	 * If we have a NULL data pointer
550219089Spjd	 * then assume the id's aren't changing and
551219089Spjd	 * return EEXIST to the dmu to let it know to
552219089Spjd	 * use the same ids
553219089Spjd	 */
554219089Spjd	if (data == NULL)
555219089Spjd		return (EEXIST);
556209962Smm
557219089Spjd	if (bonustype == DMU_OT_ZNODE) {
558219089Spjd		*userp = znp->zp_uid;
559219089Spjd		*groupp = znp->zp_gid;
560219089Spjd	} else {
561219089Spjd		int hdrsize;
562209962Smm
563219089Spjd		ASSERT(bonustype == DMU_OT_SA);
564219089Spjd		hdrsize = sa_hdrsize(data);
565209962Smm
566219089Spjd		if (hdrsize != 0) {
567219089Spjd			*userp = *((uint64_t *)((uintptr_t)data + hdrsize +
568219089Spjd			    SA_UID_OFFSET));
569219089Spjd			*groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
570219089Spjd			    SA_GID_OFFSET));
571219089Spjd		} else {
572219089Spjd			/*
573219089Spjd			 * This should only happen for newly created
574219089Spjd			 * files that haven't had the znode data filled
575219089Spjd			 * in yet.
576219089Spjd			 */
577219089Spjd			*userp = 0;
578219089Spjd			*groupp = 0;
579219089Spjd		}
580209962Smm	}
581219089Spjd	return (error);
582209962Smm}
583209962Smm
584209962Smmstatic void
585209962Smmfuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
586209962Smm    char *domainbuf, int buflen, uid_t *ridp)
587209962Smm{
588209962Smm	uint64_t fuid;
589209962Smm	const char *domain;
590209962Smm
591209962Smm	fuid = strtonum(fuidstr, NULL);
592209962Smm
593209962Smm	domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
594209962Smm	if (domain)
595209962Smm		(void) strlcpy(domainbuf, domain, buflen);
596209962Smm	else
597209962Smm		domainbuf[0] = '\0';
598209962Smm	*ridp = FUID_RID(fuid);
599209962Smm}
600209962Smm
601209962Smmstatic uint64_t
602209962Smmzfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
603209962Smm{
604209962Smm	switch (type) {
605209962Smm	case ZFS_PROP_USERUSED:
606209962Smm		return (DMU_USERUSED_OBJECT);
607209962Smm	case ZFS_PROP_GROUPUSED:
608209962Smm		return (DMU_GROUPUSED_OBJECT);
609209962Smm	case ZFS_PROP_USERQUOTA:
610209962Smm		return (zfsvfs->z_userquota_obj);
611209962Smm	case ZFS_PROP_GROUPQUOTA:
612209962Smm		return (zfsvfs->z_groupquota_obj);
613209962Smm	}
614209962Smm	return (0);
615209962Smm}
616209962Smm
617209962Smmint
618209962Smmzfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
619209962Smm    uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
620209962Smm{
621209962Smm	int error;
622209962Smm	zap_cursor_t zc;
623209962Smm	zap_attribute_t za;
624209962Smm	zfs_useracct_t *buf = vbuf;
625209962Smm	uint64_t obj;
626209962Smm
627209962Smm	if (!dmu_objset_userspace_present(zfsvfs->z_os))
628209962Smm		return (ENOTSUP);
629209962Smm
630209962Smm	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
631209962Smm	if (obj == 0) {
632209962Smm		*bufsizep = 0;
633209962Smm		return (0);
634209962Smm	}
635209962Smm
636209962Smm	for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
637209962Smm	    (error = zap_cursor_retrieve(&zc, &za)) == 0;
638209962Smm	    zap_cursor_advance(&zc)) {
639209962Smm		if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
640209962Smm		    *bufsizep)
641209962Smm			break;
642209962Smm
643209962Smm		fuidstr_to_sid(zfsvfs, za.za_name,
644209962Smm		    buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
645209962Smm
646209962Smm		buf->zu_space = za.za_first_integer;
647209962Smm		buf++;
648209962Smm	}
649209962Smm	if (error == ENOENT)
650209962Smm		error = 0;
651209962Smm
652209962Smm	ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
653209962Smm	*bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
654209962Smm	*cookiep = zap_cursor_serialize(&zc);
655209962Smm	zap_cursor_fini(&zc);
656209962Smm	return (error);
657209962Smm}
658209962Smm
659209962Smm/*
660209962Smm * buf must be big enough (eg, 32 bytes)
661209962Smm */
662168404Spjdstatic int
663209962Smmid_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
664209962Smm    char *buf, boolean_t addok)
665209962Smm{
666209962Smm	uint64_t fuid;
667209962Smm	int domainid = 0;
668209962Smm
669209962Smm	if (domain && domain[0]) {
670209962Smm		domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
671209962Smm		if (domainid == -1)
672209962Smm			return (ENOENT);
673209962Smm	}
674209962Smm	fuid = FUID_ENCODE(domainid, rid);
675209962Smm	(void) sprintf(buf, "%llx", (longlong_t)fuid);
676209962Smm	return (0);
677209962Smm}
678209962Smm
679209962Smmint
680209962Smmzfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
681209962Smm    const char *domain, uint64_t rid, uint64_t *valp)
682209962Smm{
683209962Smm	char buf[32];
684209962Smm	int err;
685209962Smm	uint64_t obj;
686209962Smm
687209962Smm	*valp = 0;
688209962Smm
689209962Smm	if (!dmu_objset_userspace_present(zfsvfs->z_os))
690209962Smm		return (ENOTSUP);
691209962Smm
692209962Smm	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
693209962Smm	if (obj == 0)
694209962Smm		return (0);
695209962Smm
696209962Smm	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
697209962Smm	if (err)
698209962Smm		return (err);
699209962Smm
700209962Smm	err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
701209962Smm	if (err == ENOENT)
702209962Smm		err = 0;
703209962Smm	return (err);
704209962Smm}
705209962Smm
706209962Smmint
707209962Smmzfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
708209962Smm    const char *domain, uint64_t rid, uint64_t quota)
709209962Smm{
710209962Smm	char buf[32];
711209962Smm	int err;
712209962Smm	dmu_tx_t *tx;
713209962Smm	uint64_t *objp;
714209962Smm	boolean_t fuid_dirtied;
715209962Smm
716209962Smm	if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
717209962Smm		return (EINVAL);
718209962Smm
719209962Smm	if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
720209962Smm		return (ENOTSUP);
721209962Smm
722209962Smm	objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
723209962Smm	    &zfsvfs->z_groupquota_obj;
724209962Smm
725209962Smm	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
726209962Smm	if (err)
727209962Smm		return (err);
728209962Smm	fuid_dirtied = zfsvfs->z_fuid_dirty;
729209962Smm
730209962Smm	tx = dmu_tx_create(zfsvfs->z_os);
731209962Smm	dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
732209962Smm	if (*objp == 0) {
733209962Smm		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
734209962Smm		    zfs_userquota_prop_prefixes[type]);
735209962Smm	}
736209962Smm	if (fuid_dirtied)
737209962Smm		zfs_fuid_txhold(zfsvfs, tx);
738209962Smm	err = dmu_tx_assign(tx, TXG_WAIT);
739209962Smm	if (err) {
740209962Smm		dmu_tx_abort(tx);
741209962Smm		return (err);
742209962Smm	}
743209962Smm
744209962Smm	mutex_enter(&zfsvfs->z_lock);
745209962Smm	if (*objp == 0) {
746209962Smm		*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
747209962Smm		    DMU_OT_NONE, 0, tx);
748209962Smm		VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
749209962Smm		    zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
750209962Smm	}
751209962Smm	mutex_exit(&zfsvfs->z_lock);
752209962Smm
753209962Smm	if (quota == 0) {
754209962Smm		err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
755209962Smm		if (err == ENOENT)
756209962Smm			err = 0;
757209962Smm	} else {
758209962Smm		err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
759209962Smm	}
760209962Smm	ASSERT(err == 0);
761209962Smm	if (fuid_dirtied)
762209962Smm		zfs_fuid_sync(zfsvfs, tx);
763209962Smm	dmu_tx_commit(tx);
764209962Smm	return (err);
765209962Smm}
766209962Smm
767209962Smmboolean_t
768219089Spjdzfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
769209962Smm{
770209962Smm	char buf[32];
771209962Smm	uint64_t used, quota, usedobj, quotaobj;
772209962Smm	int err;
773209962Smm
774209962Smm	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
775209962Smm	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
776209962Smm
777209962Smm	if (quotaobj == 0 || zfsvfs->z_replay)
778209962Smm		return (B_FALSE);
779209962Smm
780209962Smm	(void) sprintf(buf, "%llx", (longlong_t)fuid);
781209962Smm	err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
782209962Smm	if (err != 0)
783209962Smm		return (B_FALSE);
784209962Smm
785209962Smm	err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
786209962Smm	if (err != 0)
787209962Smm		return (B_FALSE);
788209962Smm	return (used >= quota);
789209962Smm}
790209962Smm
791219089Spjdboolean_t
792219089Spjdzfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
793219089Spjd{
794219089Spjd	uint64_t fuid;
795219089Spjd	uint64_t quotaobj;
796219089Spjd
797219089Spjd	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
798219089Spjd
799219089Spjd	fuid = isgroup ? zp->z_gid : zp->z_uid;
800219089Spjd
801219089Spjd	if (quotaobj == 0 || zfsvfs->z_replay)
802219089Spjd		return (B_FALSE);
803219089Spjd
804219089Spjd	return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
805219089Spjd}
806219089Spjd
807209962Smmint
808219089Spjdzfsvfs_create(const char *osname, zfsvfs_t **zfvp)
809209962Smm{
810209962Smm	objset_t *os;
811209962Smm	zfsvfs_t *zfsvfs;
812209962Smm	uint64_t zval;
813209962Smm	int i, error;
814219089Spjd	uint64_t sa_obj;
815209962Smm
816219089Spjd	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
817219089Spjd
818219089Spjd	/*
819219089Spjd	 * We claim to always be readonly so we can open snapshots;
820219089Spjd	 * other ZPL code will prevent us from writing to snapshots.
821219089Spjd	 */
822219089Spjd	error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
823219089Spjd	if (error) {
824219089Spjd		kmem_free(zfsvfs, sizeof (zfsvfs_t));
825209962Smm		return (error);
826209962Smm	}
827209962Smm
828209962Smm	/*
829209962Smm	 * Initialize the zfs-specific filesystem structure.
830209962Smm	 * Should probably make this a kmem cache, shuffle fields,
831209962Smm	 * and just bzero up to z_hold_mtx[].
832209962Smm	 */
833209962Smm	zfsvfs->z_vfs = NULL;
834209962Smm	zfsvfs->z_parent = zfsvfs;
835209962Smm	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
836209962Smm	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
837209962Smm	zfsvfs->z_os = os;
838209962Smm
839209962Smm	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
840209962Smm	if (error) {
841209962Smm		goto out;
842219089Spjd	} else if (zfsvfs->z_version >
843219089Spjd	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
844219089Spjd		(void) printf("Can't mount a version %lld file system "
845219089Spjd		    "on a version %lld pool\n. Pool must be upgraded to mount "
846219089Spjd		    "this file system.", (u_longlong_t)zfsvfs->z_version,
847219089Spjd		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
848209962Smm		error = ENOTSUP;
849209962Smm		goto out;
850209962Smm	}
851209962Smm	if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
852209962Smm		goto out;
853209962Smm	zfsvfs->z_norm = (int)zval;
854209962Smm
855209962Smm	if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
856209962Smm		goto out;
857209962Smm	zfsvfs->z_utf8 = (zval != 0);
858209962Smm
859209962Smm	if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
860209962Smm		goto out;
861209962Smm	zfsvfs->z_case = (uint_t)zval;
862209962Smm
863209962Smm	/*
864209962Smm	 * Fold case on file systems that are always or sometimes case
865209962Smm	 * insensitive.
866209962Smm	 */
867209962Smm	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
868209962Smm	    zfsvfs->z_case == ZFS_CASE_MIXED)
869209962Smm		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
870209962Smm
871209962Smm	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
872219089Spjd	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
873209962Smm
874219089Spjd	if (zfsvfs->z_use_sa) {
875219089Spjd		/* should either have both of these objects or none */
876219089Spjd		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
877219089Spjd		    &sa_obj);
878219089Spjd		if (error)
879219089Spjd			return (error);
880219089Spjd	} else {
881219089Spjd		/*
882219089Spjd		 * Pre SA versions file systems should never touch
883219089Spjd		 * either the attribute registration or layout objects.
884219089Spjd		 */
885219089Spjd		sa_obj = 0;
886219089Spjd	}
887219089Spjd
888219089Spjd	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
889219089Spjd	    &zfsvfs->z_attr_table);
890219089Spjd	if (error)
891219089Spjd		goto out;
892219089Spjd
893219089Spjd	if (zfsvfs->z_version >= ZPL_VERSION_SA)
894219089Spjd		sa_register_update_callback(os, zfs_sa_upgrade);
895219089Spjd
896209962Smm	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
897209962Smm	    &zfsvfs->z_root);
898209962Smm	if (error)
899209962Smm		goto out;
900209962Smm	ASSERT(zfsvfs->z_root != 0);
901209962Smm
902209962Smm	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
903209962Smm	    &zfsvfs->z_unlinkedobj);
904209962Smm	if (error)
905209962Smm		goto out;
906209962Smm
907209962Smm	error = zap_lookup(os, MASTER_NODE_OBJ,
908209962Smm	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
909209962Smm	    8, 1, &zfsvfs->z_userquota_obj);
910209962Smm	if (error && error != ENOENT)
911209962Smm		goto out;
912209962Smm
913209962Smm	error = zap_lookup(os, MASTER_NODE_OBJ,
914209962Smm	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
915209962Smm	    8, 1, &zfsvfs->z_groupquota_obj);
916209962Smm	if (error && error != ENOENT)
917209962Smm		goto out;
918209962Smm
919209962Smm	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
920209962Smm	    &zfsvfs->z_fuid_obj);
921209962Smm	if (error && error != ENOENT)
922209962Smm		goto out;
923209962Smm
924209962Smm	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
925209962Smm	    &zfsvfs->z_shares_dir);
926209962Smm	if (error && error != ENOENT)
927209962Smm		goto out;
928209962Smm
929209962Smm	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
930209962Smm	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
931209962Smm	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
932209962Smm	    offsetof(znode_t, z_link_node));
933209962Smm	rrw_init(&zfsvfs->z_teardown_lock);
934209962Smm	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
935209962Smm	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
936209962Smm	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
937209962Smm		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
938209962Smm
939219089Spjd	*zfvp = zfsvfs;
940209962Smm	return (0);
941209962Smm
942209962Smmout:
943219089Spjd	dmu_objset_disown(os, zfsvfs);
944219089Spjd	*zfvp = NULL;
945209962Smm	kmem_free(zfsvfs, sizeof (zfsvfs_t));
946209962Smm	return (error);
947209962Smm}
948209962Smm
949209962Smmstatic int
950185029Spjdzfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
951168404Spjd{
952185029Spjd	int error;
953185029Spjd
954185029Spjd	error = zfs_register_callbacks(zfsvfs->z_vfs);
955185029Spjd	if (error)
956185029Spjd		return (error);
957185029Spjd
958185029Spjd	/*
959185029Spjd	 * Set the objset user_ptr to track its zfsvfs.
960185029Spjd	 */
961219089Spjd	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
962185029Spjd	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
963219089Spjd	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
964185029Spjd
965208689Smm	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
966208689Smm
967185029Spjd	/*
968185029Spjd	 * If we are not mounting (ie: online recv), then we don't
969185029Spjd	 * have to worry about replaying the log as we blocked all
970185029Spjd	 * operations out since we closed the ZIL.
971185029Spjd	 */
972185029Spjd	if (mounting) {
973185029Spjd		boolean_t readonly;
974185029Spjd
975185029Spjd		/*
976185029Spjd		 * During replay we remove the read only flag to
977185029Spjd		 * allow replays to succeed.
978185029Spjd		 */
979185029Spjd		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
980208689Smm		if (readonly != 0)
981208689Smm			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
982208689Smm		else
983208689Smm			zfs_unlinked_drain(zfsvfs);
984185029Spjd
985219089Spjd		/*
986219089Spjd		 * Parse and replay the intent log.
987219089Spjd		 *
988219089Spjd		 * Because of ziltest, this must be done after
989219089Spjd		 * zfs_unlinked_drain().  (Further note: ziltest
990219089Spjd		 * doesn't use readonly mounts, where
991219089Spjd		 * zfs_unlinked_drain() isn't called.)  This is because
992219089Spjd		 * ziltest causes spa_sync() to think it's committed,
993219089Spjd		 * but actually it is not, so the intent log contains
994219089Spjd		 * many txg's worth of changes.
995219089Spjd		 *
996219089Spjd		 * In particular, if object N is in the unlinked set in
997219089Spjd		 * the last txg to actually sync, then it could be
998219089Spjd		 * actually freed in a later txg and then reallocated
999219089Spjd		 * in a yet later txg.  This would write a "create
1000219089Spjd		 * object N" record to the intent log.  Normally, this
1001219089Spjd		 * would be fine because the spa_sync() would have
1002219089Spjd		 * written out the fact that object N is free, before
1003219089Spjd		 * we could write the "create object N" intent log
1004219089Spjd		 * record.
1005219089Spjd		 *
1006219089Spjd		 * But when we are in ziltest mode, we advance the "open
1007219089Spjd		 * txg" without actually spa_sync()-ing the changes to
1008219089Spjd		 * disk.  So we would see that object N is still
1009219089Spjd		 * allocated and in the unlinked set, and there is an
1010219089Spjd		 * intent log record saying to allocate it.
1011219089Spjd		 */
1012219089Spjd		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
1013219089Spjd			if (zil_replay_disable) {
1014219089Spjd				zil_destroy(zfsvfs->z_log, B_FALSE);
1015219089Spjd			} else {
1016219089Spjd				zfsvfs->z_replay = B_TRUE;
1017219089Spjd				zil_replay(zfsvfs->z_os, zfsvfs,
1018219089Spjd				    zfs_replay_vector);
1019219089Spjd				zfsvfs->z_replay = B_FALSE;
1020219089Spjd			}
1021208689Smm		}
1022185029Spjd		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
1023185029Spjd	}
1024185029Spjd
1025185029Spjd	return (0);
1026185029Spjd}
1027185029Spjd
1028210470Smmextern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
1029210470Smm
1030209962Smmvoid
1031209962Smmzfsvfs_free(zfsvfs_t *zfsvfs)
1032185029Spjd{
1033209962Smm	int i;
1034209962Smm
1035210470Smm	/*
1036210470Smm	 * This is a barrier to prevent the filesystem from going away in
1037210470Smm	 * zfs_znode_move() until we can safely ensure that the filesystem is
1038210470Smm	 * not unmounted. We consider the filesystem valid before the barrier
1039210470Smm	 * and invalid after the barrier.
1040210470Smm	 */
1041210470Smm	rw_enter(&zfsvfs_lock, RW_READER);
1042210470Smm	rw_exit(&zfsvfs_lock);
1043210470Smm
1044209962Smm	zfs_fuid_destroy(zfsvfs);
1045209962Smm
1046185029Spjd	mutex_destroy(&zfsvfs->z_znodes_lock);
1047209962Smm	mutex_destroy(&zfsvfs->z_lock);
1048185029Spjd	list_destroy(&zfsvfs->z_all_znodes);
1049185029Spjd	rrw_destroy(&zfsvfs->z_teardown_lock);
1050185029Spjd	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
1051185029Spjd	rw_destroy(&zfsvfs->z_fuid_lock);
1052209962Smm	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1053209962Smm		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1054185029Spjd	kmem_free(zfsvfs, sizeof (zfsvfs_t));
1055185029Spjd}
1056185029Spjd
1057209962Smmstatic void
1058209962Smmzfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1059209962Smm{
1060209962Smm	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1061219089Spjd	if (zfsvfs->z_vfs) {
1062219089Spjd		if (zfsvfs->z_use_fuids) {
1063219089Spjd			vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1064219089Spjd			vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1065219089Spjd			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1066219089Spjd			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1067219089Spjd			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1068219089Spjd			vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1069219089Spjd		} else {
1070219089Spjd			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
1071219089Spjd			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
1072219089Spjd			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
1073219089Spjd			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
1074219089Spjd			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
1075219089Spjd			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
1076219089Spjd		}
1077209962Smm	}
1078219089Spjd	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1079209962Smm}
1080209962Smm
1081185029Spjdstatic int
1082185029Spjdzfs_domount(vfs_t *vfsp, char *osname)
1083185029Spjd{
1084209962Smm	uint64_t recordsize, fsid_guid;
1085168404Spjd	int error = 0;
1086168404Spjd	zfsvfs_t *zfsvfs;
1087209962Smm	vnode_t *vp;
1088168404Spjd
1089168404Spjd	ASSERT(vfsp);
1090168404Spjd	ASSERT(osname);
1091168404Spjd
1092219089Spjd	error = zfsvfs_create(osname, &zfsvfs);
1093209962Smm	if (error)
1094209962Smm		return (error);
1095168404Spjd	zfsvfs->z_vfs = vfsp;
1096168404Spjd
1097168404Spjd	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
1098168404Spjd	    NULL))
1099168404Spjd		goto out;
1100204101Spjd	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
1101204101Spjd	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
1102168404Spjd
1103168404Spjd	vfsp->vfs_data = zfsvfs;
1104218386Strasz	vfsp->mnt_flag |= MNT_LOCAL;
1105168404Spjd	vfsp->mnt_kern_flag |= MNTK_MPSAFE;
1106168404Spjd	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
1107193440Sps	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
1108168404Spjd
1109209962Smm	/*
1110209962Smm	 * The fsid is 64 bits, composed of an 8-bit fs type, which
1111209962Smm	 * separates our fsid from any other filesystem types, and a
1112209962Smm	 * 56-bit objset unique ID.  The objset unique ID is unique to
1113209962Smm	 * all objsets open on this system, provided by unique_create().
1114209962Smm	 * The 8-bit fs type must be put in the low bits of fsid[1]
1115209962Smm	 * because that's where other Solaris filesystems put it.
1116209962Smm	 */
1117209962Smm	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
1118209962Smm	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
1119209962Smm	vfsp->vfs_fsid.val[0] = fsid_guid;
1120209962Smm	vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
1121209962Smm	    vfsp->mnt_vfc->vfc_typenum & 0xFF;
1122168404Spjd
1123185029Spjd	/*
1124185029Spjd	 * Set features for file system.
1125185029Spjd	 */
1126209962Smm	zfs_set_fuid_feature(zfsvfs);
1127185029Spjd	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
1128185029Spjd		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
1129185029Spjd		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
1130185029Spjd		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
1131185029Spjd	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
1132185029Spjd		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
1133185029Spjd		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
1134185029Spjd	}
1135219089Spjd	vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
1136185029Spjd
1137168404Spjd	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
1138185029Spjd		uint64_t pval;
1139168404Spjd
1140168404Spjd		atime_changed_cb(zfsvfs, B_FALSE);
1141168404Spjd		readonly_changed_cb(zfsvfs, B_TRUE);
1142185029Spjd		if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
1143168404Spjd			goto out;
1144185029Spjd		xattr_changed_cb(zfsvfs, pval);
1145168404Spjd		zfsvfs->z_issnap = B_TRUE;
1146219089Spjd		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
1147209962Smm
1148219089Spjd		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1149209962Smm		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1150219089Spjd		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1151168404Spjd	} else {
1152185029Spjd		error = zfsvfs_setup(zfsvfs, B_TRUE);
1153168404Spjd	}
1154168404Spjd
1155168404Spjd	vfs_mountedfrom(vfsp, osname);
1156209962Smm	/* Grab extra reference. */
1157209962Smm	VERIFY(VFS_ROOT(vfsp, LK_EXCLUSIVE, &vp) == 0);
1158209962Smm	VOP_UNLOCK(vp, 0);
1159168404Spjd
1160168404Spjd	if (!zfsvfs->z_issnap)
1161168404Spjd		zfsctl_create(zfsvfs);
1162168404Spjdout:
1163168404Spjd	if (error) {
1164219089Spjd		dmu_objset_disown(zfsvfs->z_os, zfsvfs);
1165209962Smm		zfsvfs_free(zfsvfs);
1166168404Spjd	} else {
1167168404Spjd		atomic_add_32(&zfs_active_fs_count, 1);
1168168404Spjd	}
1169168404Spjd
1170168404Spjd	return (error);
1171168404Spjd}
1172168404Spjd
1173168404Spjdvoid
1174168404Spjdzfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1175168404Spjd{
1176168404Spjd	objset_t *os = zfsvfs->z_os;
1177168404Spjd	struct dsl_dataset *ds;
1178168404Spjd
1179168404Spjd	/*
1180168404Spjd	 * Unregister properties.
1181168404Spjd	 */
1182168404Spjd	if (!dmu_objset_is_snapshot(os)) {
1183168404Spjd		ds = dmu_objset_ds(os);
1184168404Spjd		VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
1185168404Spjd		    zfsvfs) == 0);
1186168404Spjd
1187168404Spjd		VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
1188168404Spjd		    zfsvfs) == 0);
1189168404Spjd
1190168404Spjd		VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
1191168404Spjd		    zfsvfs) == 0);
1192168404Spjd
1193168404Spjd		VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
1194168404Spjd		    zfsvfs) == 0);
1195168404Spjd
1196168404Spjd		VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
1197168404Spjd		    zfsvfs) == 0);
1198168404Spjd
1199168404Spjd		VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
1200168404Spjd		    zfsvfs) == 0);
1201168404Spjd
1202168404Spjd		VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
1203168404Spjd		    zfsvfs) == 0);
1204168404Spjd
1205168404Spjd		VERIFY(dsl_prop_unregister(ds, "aclinherit",
1206168404Spjd		    acl_inherit_changed_cb, zfsvfs) == 0);
1207185029Spjd
1208185029Spjd		VERIFY(dsl_prop_unregister(ds, "vscan",
1209185029Spjd		    vscan_changed_cb, zfsvfs) == 0);
1210168404Spjd	}
1211168404Spjd}
1212168404Spjd
1213219089Spjd#ifdef SECLABEL
/*
 * Convert a decimal digit string to a uint64_t integer.
 *
 * Returns 0 and stores the value in *objnum on success.  Returns EINVAL,
 * leaving *objnum untouched, when the string contains a character that
 * is not a decimal digit or when the value does not fit in a uint64_t.
 * An empty string converts to 0.
 */
static int
str_to_uint64(char *str, uint64_t *objnum)
{
	const uint64_t max = ~(uint64_t)0;
	uint64_t num = 0;

	while (*str) {
		uint64_t digit;

		if (*str < '0' || *str > '9')
			return (EINVAL);
		digit = (uint64_t)(*str++ - '0');

		/*
		 * num * 10 + digit must not wrap: a silently truncated
		 * value would be taken for a different, valid number.
		 */
		if (num > (max - digit) / 10)
			return (EINVAL);

		num = num * 10 + digit;
	}

	*objnum = num;
	return (0);
}
1232219089Spjd
/*
 * The boot loader hands over the boot path in the form
 * "rootpool-name/root-filesystem-object-number".  Turn that string into
 * a dataset name of the form "rootpool-name/root-filesystem-name".
 *
 * outpath always starts out as a copy of bpath; it is handed to
 * dsl_dsobj_to_dsname() only when the component after the first '/' is
 * a decimal number.  bpath is cut at that '/' for the duration of the
 * lookup and restored afterwards.  outpath must be large enough for the
 * copy; no bound is checked here.
 *
 * Returns EINVAL for an empty path or one starting with '/', otherwise
 * 0 or the result of dsl_dsobj_to_dsname().
 */
static int
zfs_parse_bootfs(char *bpath, char *outpath)
{
	char *sep;
	uint64_t dsobj;
	int rc;

	if (bpath[0] == '\0' || bpath[0] == '/')
		return (EINVAL);

	(void) strcpy(outpath, bpath);

	/*
	 * With no '/' the copy is just the pool name; with a non-numeric
	 * tail it is already a dataset name.  Either way we are done.
	 */
	sep = strchr(bpath, '/');
	if (sep == NULL || str_to_uint64(sep + 1, &dsobj) != 0)
		return (0);

	*sep = '\0';
	rc = dsl_dsobj_to_dsname(bpath, dsobj, outpath);
	*sep = '/';

	return (rc);
}
1268219089Spjd
1269219089Spjd/*
1270219089Spjd * zfs_check_global_label:
1271219089Spjd *	Check that the hex label string is appropriate for the dataset
1272219089Spjd *	being mounted into the global_zone proper.
1273219089Spjd *
1274219089Spjd *	Return an error if the hex label string is not default or
1275219089Spjd *	admin_low/admin_high.  For admin_low labels, the corresponding
1276219089Spjd *	dataset must be readonly.
1277219089Spjd */
1278219089Spjdint
1279219089Spjdzfs_check_global_label(const char *dsname, const char *hexsl)
1280219089Spjd{
1281219089Spjd	if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
1282219089Spjd		return (0);
1283219089Spjd	if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
1284219089Spjd		return (0);
1285219089Spjd	if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
1286219089Spjd		/* must be readonly */
1287219089Spjd		uint64_t rdonly;
1288219089Spjd
1289219089Spjd		if (dsl_prop_get_integer(dsname,
1290219089Spjd		    zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
1291219089Spjd			return (EACCES);
1292219089Spjd		return (rdonly ? 0 : EACCES);
1293219089Spjd	}
1294219089Spjd	return (EACCES);
1295219089Spjd}
1296219089Spjd
1297219089Spjd/*
1298219089Spjd * zfs_mount_label_policy:
1299219089Spjd *	Determine whether the mount is allowed according to MAC check.
1300219089Spjd *	by comparing (where appropriate) label of the dataset against
1301219089Spjd *	the label of the zone being mounted into.  If the dataset has
1302219089Spjd *	no label, create one.
1303219089Spjd *
1304219089Spjd *	Returns:
1305219089Spjd *		 0 :	access allowed
1306219089Spjd *		>0 :	error code, such as EACCES
1307219089Spjd */
1308219089Spjdstatic int
1309219089Spjdzfs_mount_label_policy(vfs_t *vfsp, char *osname)
1310219089Spjd{
1311219089Spjd	int		error, retv;
1312219089Spjd	zone_t		*mntzone = NULL;
1313219089Spjd	ts_label_t	*mnt_tsl;
1314219089Spjd	bslabel_t	*mnt_sl;
1315219089Spjd	bslabel_t	ds_sl;
1316219089Spjd	char		ds_hexsl[MAXNAMELEN];
1317219089Spjd
1318219089Spjd	retv = EACCES;				/* assume the worst */
1319219089Spjd
1320219089Spjd	/*
1321219089Spjd	 * Start by getting the dataset label if it exists.
1322219089Spjd	 */
1323219089Spjd	error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
1324219089Spjd	    1, sizeof (ds_hexsl), &ds_hexsl, NULL);
1325219089Spjd	if (error)
1326219089Spjd		return (EACCES);
1327219089Spjd
1328219089Spjd	/*
1329219089Spjd	 * If labeling is NOT enabled, then disallow the mount of datasets
1330219089Spjd	 * which have a non-default label already.  No other label checks
1331219089Spjd	 * are needed.
1332219089Spjd	 */
1333219089Spjd	if (!is_system_labeled()) {
1334219089Spjd		if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
1335219089Spjd			return (0);
1336219089Spjd		return (EACCES);
1337219089Spjd	}
1338219089Spjd
1339219089Spjd	/*
1340219089Spjd	 * Get the label of the mountpoint.  If mounting into the global
1341219089Spjd	 * zone (i.e. mountpoint is not within an active zone and the
1342219089Spjd	 * zoned property is off), the label must be default or
1343219089Spjd	 * admin_low/admin_high only; no other checks are needed.
1344219089Spjd	 */
1345219089Spjd	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
1346219089Spjd	if (mntzone->zone_id == GLOBAL_ZONEID) {
1347219089Spjd		uint64_t zoned;
1348219089Spjd
1349219089Spjd		zone_rele(mntzone);
1350219089Spjd
1351219089Spjd		if (dsl_prop_get_integer(osname,
1352219089Spjd		    zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
1353219089Spjd			return (EACCES);
1354219089Spjd		if (!zoned)
1355219089Spjd			return (zfs_check_global_label(osname, ds_hexsl));
1356219089Spjd		else
1357219089Spjd			/*
1358219089Spjd			 * This is the case of a zone dataset being mounted
1359219089Spjd			 * initially, before the zone has been fully created;
1360219089Spjd			 * allow this mount into global zone.
1361219089Spjd			 */
1362219089Spjd			return (0);
1363219089Spjd	}
1364219089Spjd
1365219089Spjd	mnt_tsl = mntzone->zone_slabel;
1366219089Spjd	ASSERT(mnt_tsl != NULL);
1367219089Spjd	label_hold(mnt_tsl);
1368219089Spjd	mnt_sl = label2bslabel(mnt_tsl);
1369219089Spjd
1370219089Spjd	if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
1371219089Spjd		/*
1372219089Spjd		 * The dataset doesn't have a real label, so fabricate one.
1373219089Spjd		 */
1374219089Spjd		char *str = NULL;
1375219089Spjd
1376219089Spjd		if (l_to_str_internal(mnt_sl, &str) == 0 &&
1377219089Spjd		    dsl_prop_set(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
1378219089Spjd		    ZPROP_SRC_LOCAL, 1, strlen(str) + 1, str) == 0)
1379219089Spjd			retv = 0;
1380219089Spjd		if (str != NULL)
1381219089Spjd			kmem_free(str, strlen(str) + 1);
1382219089Spjd	} else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
1383219089Spjd		/*
1384219089Spjd		 * Now compare labels to complete the MAC check.  If the
1385219089Spjd		 * labels are equal then allow access.  If the mountpoint
1386219089Spjd		 * label dominates the dataset label, allow readonly access.
1387219089Spjd		 * Otherwise, access is denied.
1388219089Spjd		 */
1389219089Spjd		if (blequal(mnt_sl, &ds_sl))
1390219089Spjd			retv = 0;
1391219089Spjd		else if (bldominates(mnt_sl, &ds_sl)) {
1392219089Spjd			vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1393219089Spjd			retv = 0;
1394219089Spjd		}
1395219089Spjd	}
1396219089Spjd
1397219089Spjd	label_rele(mnt_tsl);
1398219089Spjd	zone_rele(mntzone);
1399219089Spjd	return (retv);
1400219089Spjd}
1401219089Spjd#endif	/* SECLABEL */
1402219089Spjd
1403219089Spjd#ifdef OPENSOLARIS_MOUNTROOT
1404219089Spjdstatic int
1405219089Spjdzfs_mountroot(vfs_t *vfsp, enum whymountroot why)
1406219089Spjd{
1407219089Spjd	int error = 0;
1408219089Spjd	static int zfsrootdone = 0;
1409219089Spjd	zfsvfs_t *zfsvfs = NULL;
1410219089Spjd	znode_t *zp = NULL;
1411219089Spjd	vnode_t *vp = NULL;
1412219089Spjd	char *zfs_bootfs;
1413219089Spjd	char *zfs_devid;
1414219089Spjd
1415219089Spjd	ASSERT(vfsp);
1416219089Spjd
1417219089Spjd	/*
1418219089Spjd	 * The filesystem that we mount as root is defined in the
1419219089Spjd	 * boot property "zfs-bootfs" with a format of
1420219089Spjd	 * "poolname/root-dataset-objnum".
1421219089Spjd	 */
1422219089Spjd	if (why == ROOT_INIT) {
1423219089Spjd		if (zfsrootdone++)
1424219089Spjd			return (EBUSY);
1425219089Spjd		/*
1426219089Spjd		 * the process of doing a spa_load will require the
1427219089Spjd		 * clock to be set before we could (for example) do
1428219089Spjd		 * something better by looking at the timestamp on
1429219089Spjd		 * an uberblock, so just set it to -1.
1430219089Spjd		 */
1431219089Spjd		clkset(-1);
1432219089Spjd
1433219089Spjd		if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
1434219089Spjd			cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
1435219089Spjd			    "bootfs name");
1436219089Spjd			return (EINVAL);
1437219089Spjd		}
1438219089Spjd		zfs_devid = spa_get_bootprop("diskdevid");
1439219089Spjd		error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
1440219089Spjd		if (zfs_devid)
1441219089Spjd			spa_free_bootprop(zfs_devid);
1442219089Spjd		if (error) {
1443219089Spjd			spa_free_bootprop(zfs_bootfs);
1444219089Spjd			cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
1445219089Spjd			    error);
1446219089Spjd			return (error);
1447219089Spjd		}
1448219089Spjd		if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
1449219089Spjd			spa_free_bootprop(zfs_bootfs);
1450219089Spjd			cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
1451219089Spjd			    error);
1452219089Spjd			return (error);
1453219089Spjd		}
1454219089Spjd
1455219089Spjd		spa_free_bootprop(zfs_bootfs);
1456219089Spjd
1457219089Spjd		if (error = vfs_lock(vfsp))
1458219089Spjd			return (error);
1459219089Spjd
1460219089Spjd		if (error = zfs_domount(vfsp, rootfs.bo_name)) {
1461219089Spjd			cmn_err(CE_NOTE, "zfs_domount: error %d", error);
1462219089Spjd			goto out;
1463219089Spjd		}
1464219089Spjd
1465219089Spjd		zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
1466219089Spjd		ASSERT(zfsvfs);
1467219089Spjd		if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
1468219089Spjd			cmn_err(CE_NOTE, "zfs_zget: error %d", error);
1469219089Spjd			goto out;
1470219089Spjd		}
1471219089Spjd
1472219089Spjd		vp = ZTOV(zp);
1473219089Spjd		mutex_enter(&vp->v_lock);
1474219089Spjd		vp->v_flag |= VROOT;
1475219089Spjd		mutex_exit(&vp->v_lock);
1476219089Spjd		rootvp = vp;
1477219089Spjd
1478219089Spjd		/*
1479219089Spjd		 * Leave rootvp held.  The root file system is never unmounted.
1480219089Spjd		 */
1481219089Spjd
1482219089Spjd		vfs_add((struct vnode *)0, vfsp,
1483219089Spjd		    (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
1484219089Spjdout:
1485219089Spjd		vfs_unlock(vfsp);
1486219089Spjd		return (error);
1487219089Spjd	} else if (why == ROOT_REMOUNT) {
1488219089Spjd		readonly_changed_cb(vfsp->vfs_data, B_FALSE);
1489219089Spjd		vfsp->vfs_flag |= VFS_REMOUNT;
1490219089Spjd
1491219089Spjd		/* refresh mount options */
1492219089Spjd		zfs_unregister_callbacks(vfsp->vfs_data);
1493219089Spjd		return (zfs_register_callbacks(vfsp));
1494219089Spjd
1495219089Spjd	} else if (why == ROOT_UNMOUNT) {
1496219089Spjd		zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
1497219089Spjd		(void) zfs_sync(vfsp, 0, 0);
1498219089Spjd		return (0);
1499219089Spjd	}
1500219089Spjd
1501219089Spjd	/*
1502219089Spjd	 * if "why" is equal to anything else other than ROOT_INIT,
1503219089Spjd	 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
1504219089Spjd	 */
1505219089Spjd	return (ENOTSUP);
1506219089Spjd}
1507219089Spjd#endif	/* OPENSOLARIS_MOUNTROOT */
1508219089Spjd
1509168404Spjd/*ARGSUSED*/
1510168404Spjdstatic int
1511191990Sattiliozfs_mount(vfs_t *vfsp)
1512168404Spjd{
1513191990Sattilio	kthread_t	*td = curthread;
1514185029Spjd	vnode_t		*mvp = vfsp->mnt_vnodecovered;
1515185029Spjd	cred_t		*cr = td->td_ucred;
1516185029Spjd	char		*osname;
1517185029Spjd	int		error = 0;
1518185029Spjd	int		canwrite;
1519168404Spjd
1520185029Spjd	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
1521185029Spjd		return (EINVAL);
1522185029Spjd
1523168404Spjd	/*
1524185029Spjd	 * If full-owner-access is enabled and delegated administration is
1525185029Spjd	 * turned on, we must set nosuid.
1526185029Spjd	 */
1527185029Spjd	if (zfs_super_owner &&
1528185029Spjd	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
1529185029Spjd		secpolicy_fs_mount_clearopts(cr, vfsp);
1530185029Spjd	}
1531185029Spjd
1532185029Spjd	/*
1533185029Spjd	 * Check for mount privilege?
1534185029Spjd	 *
1535185029Spjd	 * If we don't have privilege then see if
1536185029Spjd	 * we have local permission to allow it
1537185029Spjd	 */
1538185029Spjd	error = secpolicy_fs_mount(cr, mvp, vfsp);
1539185029Spjd	if (error) {
1540212694Smm		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
1541196944Spjd			goto out;
1542196944Spjd
1543196944Spjd		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
1544185029Spjd			vattr_t		vattr;
1545185029Spjd
1546185029Spjd			/*
1547185029Spjd			 * Make sure user is the owner of the mount point
1548185029Spjd			 * or has sufficient privileges.
1549185029Spjd			 */
1550185029Spjd
1551185029Spjd			vattr.va_mask = AT_UID;
1552185029Spjd
1553196662Spjd			vn_lock(mvp, LK_SHARED | LK_RETRY);
1554212694Smm			if (VOP_GETATTR(mvp, &vattr, cr)) {
1555196662Spjd				VOP_UNLOCK(mvp, 0);
1556185029Spjd				goto out;
1557185029Spjd			}
1558185029Spjd
1559185029Spjd			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
1560185029Spjd			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
1561196662Spjd				VOP_UNLOCK(mvp, 0);
1562185029Spjd				goto out;
1563185029Spjd			}
1564196662Spjd			VOP_UNLOCK(mvp, 0);
1565196944Spjd		}
1566185029Spjd
1567196944Spjd		secpolicy_fs_mount_clearopts(cr, vfsp);
1568185029Spjd	}
1569185029Spjd
1570185029Spjd	/*
1571185029Spjd	 * Refuse to mount a filesystem if we are in a local zone and the
1572185029Spjd	 * dataset is not visible.
1573185029Spjd	 */
1574185029Spjd	if (!INGLOBALZONE(curthread) &&
1575185029Spjd	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
1576185029Spjd		error = EPERM;
1577185029Spjd		goto out;
1578185029Spjd	}
1579185029Spjd
1580219089Spjd#ifdef SECLABEL
1581219089Spjd	error = zfs_mount_label_policy(vfsp, osname);
1582219089Spjd	if (error)
1583219089Spjd		goto out;
1584219089Spjd#endif
1585219089Spjd
1586218386Strasz	vfsp->vfs_flag |= MNT_NFS4ACLS;
1587218386Strasz
1588185029Spjd	/*
1589168404Spjd	 * When doing a remount, we simply refresh our temporary properties
1590168404Spjd	 * according to those options set in the current VFS options.
1591168404Spjd	 */
1592185029Spjd	if (vfsp->vfs_flag & MS_REMOUNT) {
1593185029Spjd		/* refresh mount options */
1594185029Spjd		zfs_unregister_callbacks(vfsp->vfs_data);
1595185029Spjd		error = zfs_register_callbacks(vfsp);
1596185029Spjd		goto out;
1597185029Spjd	}
1598168404Spjd
1599168510Spjd	DROP_GIANT();
1600185029Spjd	error = zfs_domount(vfsp, osname);
1601168510Spjd	PICKUP_GIANT();
1602209962Smm
1603215260Smm#ifdef sun
1604209962Smm	/*
1605209962Smm	 * Add an extra VFS_HOLD on our parent vfs so that it can't
1606209962Smm	 * disappear due to a forced unmount.
1607209962Smm	 */
1608209962Smm	if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
1609209962Smm		VFS_HOLD(mvp->v_vfsp);
1610215260Smm#endif	/* sun */
1611209962Smm
1612185029Spjdout:
1613168510Spjd	return (error);
1614168404Spjd}
1615168404Spjd
1616168404Spjdstatic int
1617191990Sattiliozfs_statfs(vfs_t *vfsp, struct statfs *statp)
1618169170Spjd{
1619168404Spjd	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1620168404Spjd	uint64_t refdbytes, availbytes, usedobjs, availobjs;
1621168404Spjd
1622168404Spjd	statp->f_version = STATFS_VERSION;
1623168404Spjd
1624168404Spjd	ZFS_ENTER(zfsvfs);
1625168404Spjd
1626168404Spjd	dmu_objset_space(zfsvfs->z_os,
1627168404Spjd	    &refdbytes, &availbytes, &usedobjs, &availobjs);
1628168404Spjd
1629168404Spjd	/*
1630168404Spjd	 * The underlying storage pool actually uses multiple block sizes.
1631168404Spjd	 * We report the fragsize as the smallest block size we support,
1632168404Spjd	 * and we report our blocksize as the filesystem's maximum blocksize.
1633168404Spjd	 */
1634204101Spjd	statp->f_bsize = SPA_MINBLOCKSIZE;
1635204101Spjd	statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
1636168404Spjd
1637168404Spjd	/*
1638168404Spjd	 * The following report "total" blocks of various kinds in the
1639168404Spjd	 * file system, but reported in terms of f_frsize - the
1640168404Spjd	 * "fragment" size.
1641168404Spjd	 */
1642168404Spjd
1643204101Spjd	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1644168404Spjd	statp->f_bfree = availbytes / statp->f_bsize;
1645168404Spjd	statp->f_bavail = statp->f_bfree; /* no root reservation */
1646168404Spjd
1647168404Spjd	/*
1648168404Spjd	 * statvfs() should really be called statufs(), because it assumes
1649168404Spjd	 * static metadata.  ZFS doesn't preallocate files, so the best
1650168404Spjd	 * we can do is report the max that could possibly fit in f_files,
1651168404Spjd	 * and that minus the number actually used in f_ffree.
1652168404Spjd	 * For f_ffree, report the smaller of the number of object available
1653168404Spjd	 * and the number of blocks (each object will take at least a block).
1654168404Spjd	 */
1655168404Spjd	statp->f_ffree = MIN(availobjs, statp->f_bfree);
1656168404Spjd	statp->f_files = statp->f_ffree + usedobjs;
1657168404Spjd
1658168404Spjd	/*
1659168404Spjd	 * We're a zfs filesystem.
1660168404Spjd	 */
1661168404Spjd	(void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));
1662168404Spjd
1663168404Spjd	strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
1664168404Spjd	    sizeof(statp->f_mntfromname));
1665168404Spjd	strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
1666168404Spjd	    sizeof(statp->f_mntonname));
1667168404Spjd
1668168404Spjd	statp->f_namemax = ZFS_MAXNAMELEN;
1669168404Spjd
1670168404Spjd	ZFS_EXIT(zfsvfs);
1671168404Spjd	return (0);
1672168404Spjd}
1673168404Spjd
1674219089Spjdint
1675219089Spjdzfs_vnode_lock(vnode_t *vp, int flags)
1676219089Spjd{
1677219089Spjd	int error;
1678219089Spjd
1679219089Spjd	ASSERT(vp != NULL);
1680219089Spjd
1681219089Spjd	/*
1682219089Spjd	 * Check if the file system wasn't forcibly unmounted in the meantime.
1683219089Spjd	 */
1684219089Spjd	error = vn_lock(vp, flags);
1685219089Spjd	if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) {
1686219089Spjd		VOP_UNLOCK(vp, 0);
1687219089Spjd		error = ENOENT;
1688219089Spjd	}
1689219089Spjd
1690219089Spjd	return (error);
1691219089Spjd}
1692219089Spjd
1693168404Spjdstatic int
1694191990Sattiliozfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
1695168404Spjd{
1696168404Spjd	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1697168404Spjd	znode_t *rootzp;
1698168404Spjd	int error;
1699168404Spjd
1700197459Spjd	ZFS_ENTER_NOERROR(zfsvfs);
1701168404Spjd
1702168404Spjd	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1703219089Spjd	if (error == 0)
1704219089Spjd		*vpp = ZTOV(rootzp);
1705206667Spjd
1706206667Spjd	ZFS_EXIT(zfsvfs);
1707206667Spjd
1708168404Spjd	if (error == 0) {
1709219089Spjd		error = zfs_vnode_lock(*vpp, flags);
1710219089Spjd		if (error == 0)
1711219089Spjd			(*vpp)->v_vflag |= VV_ROOT;
1712168404Spjd	}
1713219089Spjd	if (error != 0)
1714219089Spjd		*vpp = NULL;
1715168404Spjd
1716168404Spjd	return (error);
1717168404Spjd}
1718168404Spjd
1719185029Spjd/*
1720185029Spjd * Teardown the zfsvfs::z_os.
1721185029Spjd *
1722185029Spjd * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock'
1723185029Spjd * and 'z_teardown_inactive_lock' held.
1724185029Spjd */
1725185029Spjdstatic int
1726185029Spjdzfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
1727185029Spjd{
1728185029Spjd	znode_t	*zp;
1729185029Spjd
1730185029Spjd	rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
1731185029Spjd
1732185029Spjd	if (!unmounting) {
1733185029Spjd		/*
1734185029Spjd		 * We purge the parent filesystem's vfsp as the parent
1735185029Spjd		 * filesystem and all of its snapshots have their vnode's
1736185029Spjd		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
1737185029Spjd		 * 'z_parent' is self referential for non-snapshots.
1738185029Spjd		 */
1739185029Spjd		(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1740197351Spjd#ifdef FREEBSD_NAMECACHE
1741197351Spjd		cache_purgevfs(zfsvfs->z_parent->z_vfs);
1742197351Spjd#endif
1743185029Spjd	}
1744185029Spjd
1745185029Spjd	/*
1746185029Spjd	 * Close the zil. NB: Can't close the zil while zfs_inactive
1747185029Spjd	 * threads are blocked as zil_close can call zfs_inactive.
1748185029Spjd	 */
1749185029Spjd	if (zfsvfs->z_log) {
1750185029Spjd		zil_close(zfsvfs->z_log);
1751185029Spjd		zfsvfs->z_log = NULL;
1752185029Spjd	}
1753185029Spjd
1754185029Spjd	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
1755185029Spjd
1756185029Spjd	/*
1757185029Spjd	 * If we are not unmounting (ie: online recv) and someone already
1758185029Spjd	 * unmounted this file system while we were doing the switcheroo,
1759185029Spjd	 * or a reopen of z_os failed then just bail out now.
1760185029Spjd	 */
1761185029Spjd	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
1762185029Spjd		rw_exit(&zfsvfs->z_teardown_inactive_lock);
1763185029Spjd		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1764185029Spjd		return (EIO);
1765185029Spjd	}
1766185029Spjd
1767185029Spjd	/*
1768185029Spjd	 * At this point there are no vops active, and any new vops will
1769185029Spjd	 * fail with EIO since we have z_teardown_lock for writer (only
1770185029Spjd	 * relavent for forced unmount).
1771185029Spjd	 *
1772185029Spjd	 * Release all holds on dbufs.
1773185029Spjd	 */
1774185029Spjd	mutex_enter(&zfsvfs->z_znodes_lock);
1775185029Spjd	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
1776185029Spjd	    zp = list_next(&zfsvfs->z_all_znodes, zp))
1777219089Spjd		if (zp->z_sa_hdl) {
1778196297Spjd			ASSERT(ZTOV(zp)->v_count >= 0);
1779185029Spjd			zfs_znode_dmu_fini(zp);
1780185029Spjd		}
1781185029Spjd	mutex_exit(&zfsvfs->z_znodes_lock);
1782185029Spjd
1783185029Spjd	/*
1784185029Spjd	 * If we are unmounting, set the unmounted flag and let new vops
1785185029Spjd	 * unblock.  zfs_inactive will have the unmounted behavior, and all
1786185029Spjd	 * other vops will fail with EIO.
1787185029Spjd	 */
1788185029Spjd	if (unmounting) {
1789185029Spjd		zfsvfs->z_unmounted = B_TRUE;
1790185029Spjd		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1791185029Spjd		rw_exit(&zfsvfs->z_teardown_inactive_lock);
1792197133Spjd
1793197133Spjd#ifdef __FreeBSD__
1794197133Spjd		/*
1795197133Spjd		 * Some znodes might not be fully reclaimed, wait for them.
1796197133Spjd		 */
1797197133Spjd		mutex_enter(&zfsvfs->z_znodes_lock);
1798197133Spjd		while (list_head(&zfsvfs->z_all_znodes) != NULL) {
1799197133Spjd			msleep(zfsvfs, &zfsvfs->z_znodes_lock, 0,
1800197133Spjd			    "zteardown", 0);
1801197133Spjd		}
1802197133Spjd		mutex_exit(&zfsvfs->z_znodes_lock);
1803197133Spjd#endif
1804185029Spjd	}
1805185029Spjd
1806185029Spjd	/*
1807185029Spjd	 * z_os will be NULL if there was an error in attempting to reopen
1808185029Spjd	 * zfsvfs, so just return as the properties had already been
1809185029Spjd	 * unregistered and cached data had been evicted before.
1810185029Spjd	 */
1811185029Spjd	if (zfsvfs->z_os == NULL)
1812185029Spjd		return (0);
1813185029Spjd
1814185029Spjd	/*
1815185029Spjd	 * Unregister properties.
1816185029Spjd	 */
1817185029Spjd	zfs_unregister_callbacks(zfsvfs);
1818185029Spjd
1819185029Spjd	/*
1820185029Spjd	 * Evict cached data
1821185029Spjd	 */
1822219089Spjd	if (dmu_objset_is_dirty_anywhere(zfsvfs->z_os))
1823219089Spjd		if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
1824219089Spjd			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
1825219089Spjd	(void) dmu_objset_evict_dbufs(zfsvfs->z_os);
1826185029Spjd
1827185029Spjd	return (0);
1828185029Spjd}
1829185029Spjd
1830168404Spjd/*ARGSUSED*/
1831168404Spjdstatic int
1832191990Sattiliozfs_umount(vfs_t *vfsp, int fflag)
1833168404Spjd{
1834209962Smm	kthread_t *td = curthread;
1835168404Spjd	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1836185029Spjd	objset_t *os;
1837209962Smm	cred_t *cr = td->td_ucred;
1838168404Spjd	int ret;
1839168404Spjd
1840185029Spjd	ret = secpolicy_fs_unmount(cr, vfsp);
1841185029Spjd	if (ret) {
1842212694Smm		if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
1843212694Smm		    ZFS_DELEG_PERM_MOUNT, cr))
1844185029Spjd			return (ret);
1845185029Spjd	}
1846219089Spjd
1847185029Spjd	/*
1848185029Spjd	 * We purge the parent filesystem's vfsp as the parent filesystem
1849185029Spjd	 * and all of its snapshots have their vnode's v_vfsp set to the
1850185029Spjd	 * parent's filesystem's vfsp.  Note, 'z_parent' is self
1851185029Spjd	 * referential for non-snapshots.
1852185029Spjd	 */
1853185029Spjd	(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1854168404Spjd
1855168404Spjd	/*
1856168404Spjd	 * Unmount any snapshots mounted under .zfs before unmounting the
1857168404Spjd	 * dataset itself.
1858168404Spjd	 */
1859169170Spjd	if (zfsvfs->z_ctldir != NULL) {
1860168404Spjd		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
1861168404Spjd			return (ret);
1862209962Smm		ret = vflush(vfsp, 0, 0, td);
1863168404Spjd		ASSERT(ret == EBUSY);
1864168404Spjd		if (!(fflag & MS_FORCE)) {
1865168404Spjd			if (zfsvfs->z_ctldir->v_count > 1)
1866168404Spjd				return (EBUSY);
1867168404Spjd			ASSERT(zfsvfs->z_ctldir->v_count == 1);
1868168404Spjd		}
1869168404Spjd		zfsctl_destroy(zfsvfs);
1870168404Spjd		ASSERT(zfsvfs->z_ctldir == NULL);
1871168404Spjd	}
1872168404Spjd
1873197459Spjd	if (fflag & MS_FORCE) {
1874197459Spjd		/*
1875197459Spjd		 * Mark file system as unmounted before calling
1876197459Spjd		 * vflush(FORCECLOSE). This way we ensure no future vnops
1877197459Spjd		 * will be called and risk operating on DOOMED vnodes.
1878197459Spjd		 */
1879197459Spjd		rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
1880197459Spjd		zfsvfs->z_unmounted = B_TRUE;
1881197459Spjd		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1882197459Spjd	}
1883197459Spjd
1884168404Spjd	/*
1885168404Spjd	 * Flush all the files.
1886168404Spjd	 */
1887209962Smm	ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
1888168404Spjd	if (ret != 0) {
1889168404Spjd		if (!zfsvfs->z_issnap) {
1890168404Spjd			zfsctl_create(zfsvfs);
1891168404Spjd			ASSERT(zfsvfs->z_ctldir != NULL);
1892168404Spjd		}
1893168404Spjd		return (ret);
1894168404Spjd	}
1895168404Spjd
1896185029Spjd	if (!(fflag & MS_FORCE)) {
1897185029Spjd		/*
1898185029Spjd		 * Check the number of active vnodes in the file system.
1899185029Spjd		 * Our count is maintained in the vfs structure, but the
1900185029Spjd		 * number is off by 1 to indicate a hold on the vfs
1901185029Spjd		 * structure itself.
1902185029Spjd		 *
1903185029Spjd		 * The '.zfs' directory maintains a reference of its
1904185029Spjd		 * own, and any active references underneath are
1905185029Spjd		 * reflected in the vnode count.
1906185029Spjd		 */
1907185029Spjd		if (zfsvfs->z_ctldir == NULL) {
1908185029Spjd			if (vfsp->vfs_count > 1)
1909185029Spjd				return (EBUSY);
1910185029Spjd		} else {
1911185029Spjd			if (vfsp->vfs_count > 2 ||
1912185029Spjd			    zfsvfs->z_ctldir->v_count > 1)
1913185029Spjd				return (EBUSY);
1914185029Spjd		}
1915185029Spjd	} else {
1916168404Spjd		MNT_ILOCK(vfsp);
1917168404Spjd		vfsp->mnt_kern_flag |= MNTK_UNMOUNTF;
1918168404Spjd		MNT_IUNLOCK(vfsp);
1919185029Spjd	}
1920168404Spjd
1921185029Spjd	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
1922185029Spjd	os = zfsvfs->z_os;
1923185029Spjd
1924185029Spjd	/*
1925185029Spjd	 * z_os will be NULL if there was an error in
1926185029Spjd	 * attempting to reopen zfsvfs.
1927185029Spjd	 */
1928185029Spjd	if (os != NULL) {
1929168404Spjd		/*
1930185029Spjd		 * Unset the objset user_ptr.
1931168404Spjd		 */
1932219089Spjd		mutex_enter(&os->os_user_ptr_lock);
1933185029Spjd		dmu_objset_set_user(os, NULL);
1934219089Spjd		mutex_exit(&os->os_user_ptr_lock);
1935185029Spjd
1936185029Spjd		/*
1937185029Spjd		 * Finally release the objset
1938185029Spjd		 */
1939219089Spjd		dmu_objset_disown(os, zfsvfs);
1940168404Spjd	}
1941168404Spjd
1942185029Spjd	/*
1943185029Spjd	 * We can now safely destroy the '.zfs' directory node.
1944185029Spjd	 */
1945185029Spjd	if (zfsvfs->z_ctldir != NULL)
1946185029Spjd		zfsctl_destroy(zfsvfs);
1947185029Spjd	if (zfsvfs->z_issnap) {
1948185029Spjd		vnode_t *svp = vfsp->mnt_vnodecovered;
1949185029Spjd
1950197515Spjd		if (svp->v_count >= 2)
1951192211Skmacy			VN_RELE(svp);
1952185029Spjd	}
1953168404Spjd	zfs_freevfs(vfsp);
1954168404Spjd
1955168404Spjd	return (0);
1956168404Spjd}
1957168404Spjd
1958168404Spjdstatic int
1959168404Spjdzfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
1960168404Spjd{
1961168404Spjd	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
1962168404Spjd	znode_t		*zp;
1963168404Spjd	int 		err;
1964168404Spjd
1965197167Spjd	/*
1966215397Savg	 * zfs_zget() can't operate on virtual entries like .zfs/ or
1967211855Spjd	 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
1968211855Spjd	 * This will make NFS to switch to LOOKUP instead of using VGET.
1969197167Spjd	 */
1970197167Spjd	if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR)
1971197167Spjd		return (EOPNOTSUPP);
1972197167Spjd
1973168404Spjd	ZFS_ENTER(zfsvfs);
1974168404Spjd	err = zfs_zget(zfsvfs, ino, &zp);
1975168404Spjd	if (err == 0 && zp->z_unlinked) {
1976168404Spjd		VN_RELE(ZTOV(zp));
1977168404Spjd		err = EINVAL;
1978168404Spjd	}
1979219089Spjd	if (err == 0)
1980219089Spjd		*vpp = ZTOV(zp);
1981206667Spjd	ZFS_EXIT(zfsvfs);
1982219089Spjd	if (err == 0)
1983219089Spjd		err = zfs_vnode_lock(*vpp, flags);
1984168404Spjd	if (err != 0)
1985168404Spjd		*vpp = NULL;
1986171063Sdfr	return (err);
1987168404Spjd}
1988168404Spjd
1989168404Spjdstatic int
1990196982Spjdzfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
1991196982Spjd    struct ucred **credanonp, int *numsecflavors, int **secflavors)
1992196982Spjd{
1993196982Spjd	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1994196982Spjd
1995196982Spjd	/*
1996196982Spjd	 * If this is regular file system vfsp is the same as
1997196982Spjd	 * zfsvfs->z_parent->z_vfs, but if it is snapshot,
1998196982Spjd	 * zfsvfs->z_parent->z_vfs represents parent file system
1999196982Spjd	 * which we have to use here, because only this file system
2000196982Spjd	 * has mnt_export configured.
2001196982Spjd	 */
2002196982Spjd	return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
2003196982Spjd	    credanonp, numsecflavors, secflavors));
2004196982Spjd}
2005196982Spjd
2006197151SpjdCTASSERT(SHORT_FID_LEN <= sizeof(struct fid));
2007197151SpjdCTASSERT(LONG_FID_LEN <= sizeof(struct fid));
2008196982Spjd
2009196982Spjdstatic int
2010222167Srmacklemzfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
2011168404Spjd{
2012168404Spjd	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
2013168404Spjd	znode_t		*zp;
2014168404Spjd	uint64_t	object = 0;
2015168404Spjd	uint64_t	fid_gen = 0;
2016168404Spjd	uint64_t	gen_mask;
2017168404Spjd	uint64_t	zp_gen;
2018219089Spjd	int 		i, err;
2019168404Spjd
2020168404Spjd	*vpp = NULL;
2021168404Spjd
2022168404Spjd	ZFS_ENTER(zfsvfs);
2023168404Spjd
2024196979Spjd	/*
2025197177Spjd	 * On FreeBSD we can get snapshot's mount point or its parent file
2026197177Spjd	 * system mount point depending if snapshot is already mounted or not.
2027196979Spjd	 */
2028197177Spjd	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
2029168404Spjd		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
2030168404Spjd		uint64_t	objsetid = 0;
2031168404Spjd		uint64_t	setgen = 0;
2032168404Spjd
2033168404Spjd		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
2034168404Spjd			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
2035168404Spjd
2036168404Spjd		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
2037168404Spjd			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
2038168404Spjd
2039168404Spjd		ZFS_EXIT(zfsvfs);
2040168404Spjd
2041168404Spjd		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
2042168404Spjd		if (err)
2043168404Spjd			return (EINVAL);
2044168404Spjd		ZFS_ENTER(zfsvfs);
2045168404Spjd	}
2046168404Spjd
2047168404Spjd	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
2048168404Spjd		zfid_short_t	*zfid = (zfid_short_t *)fidp;
2049168404Spjd
2050168404Spjd		for (i = 0; i < sizeof (zfid->zf_object); i++)
2051168404Spjd			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
2052168404Spjd
2053168404Spjd		for (i = 0; i < sizeof (zfid->zf_gen); i++)
2054168404Spjd			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
2055168404Spjd	} else {
2056168404Spjd		ZFS_EXIT(zfsvfs);
2057168404Spjd		return (EINVAL);
2058168404Spjd	}
2059168404Spjd
2060168404Spjd	/* A zero fid_gen means we are in the .zfs control directories */
2061168404Spjd	if (fid_gen == 0 &&
2062168404Spjd	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
2063168404Spjd		*vpp = zfsvfs->z_ctldir;
2064168404Spjd		ASSERT(*vpp != NULL);
2065168404Spjd		if (object == ZFSCTL_INO_SNAPDIR) {
2066168404Spjd			VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
2067185029Spjd			    0, NULL, NULL, NULL, NULL, NULL) == 0);
2068168404Spjd		} else {
2069168404Spjd			VN_HOLD(*vpp);
2070168404Spjd		}
2071206667Spjd		ZFS_EXIT(zfsvfs);
2072222199Srmacklem		err = zfs_vnode_lock(*vpp, flags | LK_RETRY);
2073219089Spjd		if (err != 0)
2074219089Spjd			*vpp = NULL;
2075219089Spjd		return (err);
2076168404Spjd	}
2077168404Spjd
2078168404Spjd	gen_mask = -1ULL >> (64 - 8 * i);
2079168404Spjd
2080168404Spjd	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
2081168404Spjd	if (err = zfs_zget(zfsvfs, object, &zp)) {
2082168404Spjd		ZFS_EXIT(zfsvfs);
2083168404Spjd		return (err);
2084168404Spjd	}
2085219089Spjd	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
2086219089Spjd	    sizeof (uint64_t));
2087219089Spjd	zp_gen = zp_gen & gen_mask;
2088168404Spjd	if (zp_gen == 0)
2089168404Spjd		zp_gen = 1;
2090168404Spjd	if (zp->z_unlinked || zp_gen != fid_gen) {
2091168404Spjd		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
2092168404Spjd		VN_RELE(ZTOV(zp));
2093168404Spjd		ZFS_EXIT(zfsvfs);
2094168404Spjd		return (EINVAL);
2095168404Spjd	}
2096168404Spjd
2097219089Spjd	*vpp = ZTOV(zp);
2098206667Spjd	ZFS_EXIT(zfsvfs);
2099222199Srmacklem	err = zfs_vnode_lock(*vpp, flags | LK_RETRY);
2100219089Spjd	if (err == 0)
2101219089Spjd		vnode_create_vobject(*vpp, zp->z_size, curthread);
2102219089Spjd	else
2103219089Spjd		*vpp = NULL;
2104219089Spjd	return (err);
2105168404Spjd}
2106168404Spjd
2107185029Spjd/*
2108185029Spjd * Block out VOPs and close zfsvfs_t::z_os
2109185029Spjd *
2110185029Spjd * Note, if successful, then we return with the 'z_teardown_lock' and
2111185029Spjd * 'z_teardown_inactive_lock' write held.
2112185029Spjd */
2113185029Spjdint
2114219089Spjdzfs_suspend_fs(zfsvfs_t *zfsvfs)
2115168404Spjd{
2116185029Spjd	int error;
2117168404Spjd
2118185029Spjd	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
2119185029Spjd		return (error);
2120219089Spjd	dmu_objset_disown(zfsvfs->z_os, zfsvfs);
2121168404Spjd
2122185029Spjd	return (0);
2123185029Spjd}
2124168404Spjd
2125185029Spjd/*
2126185029Spjd * Reopen zfsvfs_t::z_os and release VOPs.
2127185029Spjd */
2128185029Spjdint
2129219089Spjdzfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname)
2130185029Spjd{
2131185029Spjd	int err;
2132168404Spjd
2133185029Spjd	ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
2134185029Spjd	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
2135185029Spjd
2136219089Spjd	err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs,
2137219089Spjd	    &zfsvfs->z_os);
2138185029Spjd	if (err) {
2139185029Spjd		zfsvfs->z_os = NULL;
2140185029Spjd	} else {
2141185029Spjd		znode_t *zp;
2142219089Spjd		uint64_t sa_obj = 0;
2143185029Spjd
2144219089Spjd		/*
2145219089Spjd		 * Make sure version hasn't changed
2146219089Spjd		 */
2147219089Spjd
2148219089Spjd		err = zfs_get_zplprop(zfsvfs->z_os, ZFS_PROP_VERSION,
2149219089Spjd		    &zfsvfs->z_version);
2150219089Spjd
2151219089Spjd		if (err)
2152219089Spjd			goto bail;
2153219089Spjd
2154219089Spjd		err = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
2155219089Spjd		    ZFS_SA_ATTRS, 8, 1, &sa_obj);
2156219089Spjd
2157219089Spjd		if (err && zfsvfs->z_version >= ZPL_VERSION_SA)
2158219089Spjd			goto bail;
2159219089Spjd
2160219089Spjd		if ((err = sa_setup(zfsvfs->z_os, sa_obj,
2161219089Spjd		    zfs_attr_table,  ZPL_END, &zfsvfs->z_attr_table)) != 0)
2162219089Spjd			goto bail;
2163219089Spjd
2164219089Spjd		if (zfsvfs->z_version >= ZPL_VERSION_SA)
2165219089Spjd			sa_register_update_callback(zfsvfs->z_os,
2166219089Spjd			    zfs_sa_upgrade);
2167219089Spjd
2168185029Spjd		VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
2169185029Spjd
2170219089Spjd		zfs_set_fuid_feature(zfsvfs);
2171219089Spjd
2172185029Spjd		/*
2173185029Spjd		 * Attempt to re-establish all the active znodes with
2174185029Spjd		 * their dbufs.  If a zfs_rezget() fails, then we'll let
2175185029Spjd		 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
2176185029Spjd		 * when they try to use their znode.
2177185029Spjd		 */
2178185029Spjd		mutex_enter(&zfsvfs->z_znodes_lock);
2179185029Spjd		for (zp = list_head(&zfsvfs->z_all_znodes); zp;
2180185029Spjd		    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
2181185029Spjd			(void) zfs_rezget(zp);
2182185029Spjd		}
2183185029Spjd		mutex_exit(&zfsvfs->z_znodes_lock);
2184168404Spjd	}
2185168404Spjd
2186219089Spjdbail:
2187185029Spjd	/* release the VOPs */
2188185029Spjd	rw_exit(&zfsvfs->z_teardown_inactive_lock);
2189185029Spjd	rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
2190185029Spjd
2191185029Spjd	if (err) {
2192185029Spjd		/*
2193219089Spjd		 * Since we couldn't reopen zfsvfs::z_os, or
2194219089Spjd		 * setup the sa framework force unmount this file system.
2195185029Spjd		 */
2196185029Spjd		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
2197185029Spjd			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
2198168404Spjd	}
2199185029Spjd	return (err);
2200168404Spjd}
2201168404Spjd
2202168404Spjdstatic void
2203168404Spjdzfs_freevfs(vfs_t *vfsp)
2204168404Spjd{
2205168404Spjd	zfsvfs_t *zfsvfs = vfsp->vfs_data;
2206168404Spjd
2207215260Smm#ifdef sun
2208209962Smm	/*
2209209962Smm	 * If this is a snapshot, we have an extra VFS_HOLD on our parent
2210219089Spjd	 * from zfs_mount().  Release it here.  If we came through
2211219089Spjd	 * zfs_mountroot() instead, we didn't grab an extra hold, so
2212219089Spjd	 * skip the VFS_RELE for rootvfs.
2213209962Smm	 */
2214219089Spjd	if (zfsvfs->z_issnap && (vfsp != rootvfs))
2215209962Smm		VFS_RELE(zfsvfs->z_parent->z_vfs);
2216215260Smm#endif	/* sun */
2217168404Spjd
2218209962Smm	zfsvfs_free(zfsvfs);
2219185029Spjd
2220168404Spjd	atomic_add_32(&zfs_active_fs_count, -1);
2221168404Spjd}
2222168404Spjd
#if defined(__i386__)
/* Value of desiredvnodes before ZFS tuned it; restored on unload. */
static int desiredvnodes_backup;
#endif

/*
 * On i386, lower desiredvnodes to 3/4 of its default, but only if the
 * administrator has not tuned it.  A no-op on other architectures.
 */
static void
zfs_vnodes_adjust(void)
{
#if defined(__i386__)
	int newdesiredvnodes;

	desiredvnodes_backup = desiredvnodes;

	/*
	 * We calculate newdesiredvnodes the same way it is done in
	 * vntblinit(). If it is equal to desiredvnodes, it means that
	 * it wasn't tuned by the administrator and we can tune it down.
	 */
	newdesiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 *
	    vm_kmem_size / (5 * (sizeof(struct vm_object) +
	    sizeof(struct vnode))));
	if (newdesiredvnodes != desiredvnodes)
		return;

	desiredvnodes = (3 * newdesiredvnodes) / 4;
#endif
}
2247172135Spjd
/*
 * Undo zfs_vnodes_adjust(): on i386, restore desiredvnodes to the
 * value saved before it was tuned.  A no-op on other architectures.
 */
static void
zfs_vnodes_adjust_back(void)
{
#if defined(__i386__)
	desiredvnodes = desiredvnodes_backup;
#endif
}
2256172135Spjd
2257168404Spjdvoid
2258168404Spjdzfs_init(void)
2259168404Spjd{
2260168404Spjd
2261202129Sdelphij	printf("ZFS filesystem version " ZPL_VERSION_STRING "\n");
2262168404Spjd
2263168404Spjd	/*
2264219089Spjd	 * Initialize .zfs directory structures
2265168404Spjd	 */
2266219089Spjd	zfsctl_init();
2267168404Spjd
2268168404Spjd	/*
2269219089Spjd	 * Initialize znode cache, vnode ops, etc...
2270168404Spjd	 */
2271219089Spjd	zfs_znode_init();
2272172135Spjd
2273172135Spjd	/*
2274219089Spjd	 * Reduce number of vnodes. Originally number of vnodes is calculated
2275172135Spjd	 * with UFS inode in mind. We reduce it here, because it's too big for
2276172135Spjd	 * ZFS/i386.
2277172135Spjd	 */
2278172135Spjd	zfs_vnodes_adjust();
2279209962Smm
2280209962Smm	dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
2281168404Spjd}
2282168404Spjd
/*
 * Tear down what zfs_init() set up: the .zfs control directory
 * structures, the znode subsystem, and the vnode limit adjustment.
 */
void
zfs_fini(void)
{

	zfsctl_fini();
	zfs_znode_fini();
	zfs_vnodes_adjust_back();
}
2290168404Spjd
2291168404Spjdint
2292168404Spjdzfs_busy(void)
2293168404Spjd{
2294168404Spjd	return (zfs_active_fs_count != 0);
2295168404Spjd}
2296185029Spjd
2297185029Spjdint
2298209962Smmzfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
2299185029Spjd{
2300185029Spjd	int error;
2301209962Smm	objset_t *os = zfsvfs->z_os;
2302185029Spjd	dmu_tx_t *tx;
2303185029Spjd
2304185029Spjd	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
2305185029Spjd		return (EINVAL);
2306185029Spjd
2307209962Smm	if (newvers < zfsvfs->z_version)
2308209962Smm		return (EINVAL);
2309185029Spjd
2310219089Spjd	if (zfs_spa_version_map(newvers) >
2311219089Spjd	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
2312219089Spjd		return (ENOTSUP);
2313219089Spjd
2314185029Spjd	tx = dmu_tx_create(os);
2315209962Smm	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
2316219089Spjd	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2317219089Spjd		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
2318219089Spjd		    ZFS_SA_ATTRS);
2319219089Spjd		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2320219089Spjd	}
2321185029Spjd	error = dmu_tx_assign(tx, TXG_WAIT);
2322185029Spjd	if (error) {
2323185029Spjd		dmu_tx_abort(tx);
2324209962Smm		return (error);
2325185029Spjd	}
2326219089Spjd
2327209962Smm	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2328209962Smm	    8, 1, &newvers, tx);
2329185029Spjd
2330209962Smm	if (error) {
2331209962Smm		dmu_tx_commit(tx);
2332209962Smm		return (error);
2333209962Smm	}
2334209962Smm
2335219089Spjd	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2336219089Spjd		uint64_t sa_obj;
2337219089Spjd
2338219089Spjd		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2339219089Spjd		    SPA_VERSION_SA);
2340219089Spjd		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2341219089Spjd		    DMU_OT_NONE, 0, tx);
2342219089Spjd
2343219089Spjd		error = zap_add(os, MASTER_NODE_OBJ,
2344219089Spjd		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2345219089Spjd		ASSERT3U(error, ==, 0);
2346219089Spjd
2347219089Spjd		VERIFY(0 == sa_set_sa_object(os, sa_obj));
2348219089Spjd		sa_register_update_callback(os, zfs_sa_upgrade);
2349219089Spjd	}
2350219089Spjd
2351219089Spjd	spa_history_log_internal(LOG_DS_UPGRADE,
2352219089Spjd	    dmu_objset_spa(os), tx, "oldver=%llu newver=%llu dataset = %llu",
2353209962Smm	    zfsvfs->z_version, newvers, dmu_objset_id(os));
2354209962Smm
2355185029Spjd	dmu_tx_commit(tx);
2356185029Spjd
2357209962Smm	zfsvfs->z_version = newvers;
2358209962Smm
2359219089Spjd	zfs_set_fuid_feature(zfsvfs);
2360209962Smm
2361209962Smm	return (0);
2362185029Spjd}
2363219089Spjd
2364185029Spjd/*
2365185029Spjd * Read a property stored within the master node.
2366185029Spjd */
2367185029Spjdint
2368185029Spjdzfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
2369185029Spjd{
2370185029Spjd	const char *pname;
2371185029Spjd	int error = ENOENT;
2372185029Spjd
2373185029Spjd	/*
2374185029Spjd	 * Look up the file system's value for the property.  For the
2375185029Spjd	 * version property, we look up a slightly different string.
2376185029Spjd	 */
2377185029Spjd	if (prop == ZFS_PROP_VERSION)
2378185029Spjd		pname = ZPL_VERSION_STR;
2379185029Spjd	else
2380185029Spjd		pname = zfs_prop_to_name(prop);
2381185029Spjd
2382185029Spjd	if (os != NULL)
2383185029Spjd		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
2384185029Spjd
2385185029Spjd	if (error == ENOENT) {
2386185029Spjd		/* No value set, use the default value */
2387185029Spjd		switch (prop) {
2388185029Spjd		case ZFS_PROP_VERSION:
2389185029Spjd			*value = ZPL_VERSION;
2390185029Spjd			break;
2391185029Spjd		case ZFS_PROP_NORMALIZE:
2392185029Spjd		case ZFS_PROP_UTF8ONLY:
2393185029Spjd			*value = 0;
2394185029Spjd			break;
2395185029Spjd		case ZFS_PROP_CASE:
2396185029Spjd			*value = ZFS_CASE_SENSITIVE;
2397185029Spjd			break;
2398185029Spjd		default:
2399185029Spjd			return (error);
2400185029Spjd		}
2401185029Spjd		error = 0;
2402185029Spjd	}
2403185029Spjd	return (error);
2404185029Spjd}
2405