zfs_vfsops.c revision 206667
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
2551852Sbp
2651852Sbp#include <sys/types.h>
2751852Sbp#include <sys/param.h>
2851852Sbp#include <sys/systm.h>
2951852Sbp#include <sys/kernel.h>
3051852Sbp#include <sys/sysmacros.h>
3151852Sbp#include <sys/kmem.h>
3251852Sbp#include <sys/acl.h>
3351852Sbp#include <sys/vnode.h>
3451852Sbp#include <sys/vfs.h>
3551852Sbp#include <sys/mntent.h>
3651852Sbp#include <sys/mount.h>
3751852Sbp#include <sys/cmn_err.h>
3851852Sbp#include <sys/zfs_znode.h>
3951852Sbp#include <sys/zfs_dir.h>
4051852Sbp#include <sys/zil.h>
4151852Sbp#include <sys/fs/zfs.h>
4251852Sbp#include <sys/dmu.h>
4351852Sbp#include <sys/dsl_prop.h>
4451852Sbp#include <sys/dsl_dataset.h>
4551852Sbp#include <sys/dsl_deleg.h>
4651852Sbp#include <sys/spa.h>
4751852Sbp#include <sys/zap.h>
4860041Sphk#include <sys/varargs.h>
4951852Sbp#include <sys/policy.h>
5051852Sbp#include <sys/atomic.h>
5151852Sbp#include <sys/zfs_ioctl.h>
5251852Sbp#include <sys/zfs_ctldir.h>
5351852Sbp#include <sys/zfs_fuid.h>
5451852Sbp#include <sys/sunddi.h>
5551852Sbp#include <sys/dnlc.h>
5651852Sbp#include <sys/dmu_objset.h>
5777223Sru#include <sys/spa_boot.h>
5877223Sru#include <sys/vdev_impl.h>	/* VDEV_BOOT_VERSION */
5977223Sru
6051852Sbpstruct mtx zfs_debug_mtx;
6151852SbpMTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
6251852Sbp
6351852SbpSYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
6451852Sbp
6551852Sbpint zfs_super_owner = 0;
6696755StrhodesSYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
6751852Sbp    "File system owner can perform privileged operation on his file systems");
6851852Sbp
6951852Sbpint zfs_debug_level = 0;
7059755SpeterTUNABLE_INT("vfs.zfs.debug", &zfs_debug_level);
7174637SbpSYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0,
7259755Speter    "Debug level");
73116271Sphk
74116271SphkSYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
75116271Sphkstatic int zfs_version_acl = ZFS_ACL_VERSION;
76116271SphkSYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
77116271Sphk    "ZFS_ACL_VERSION");
78116271Sphkstatic int zfs_version_dmu_backup_header = DMU_BACKUP_HEADER_VERSION;
79116271SphkSYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_header, CTLFLAG_RD,
80116271Sphk    &zfs_version_dmu_backup_header, 0, "DMU_BACKUP_HEADER_VERSION");
8151852Sbpstatic int zfs_version_dmu_backup_stream = DMU_BACKUP_STREAM_VERSION;
8251852SbpSYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_stream, CTLFLAG_RD,
83116271Sphk    &zfs_version_dmu_backup_stream, 0, "DMU_BACKUP_STREAM_VERSION");
84116271Sphkstatic int zfs_version_spa = SPA_VERSION;
85116271SphkSYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
86116271Sphk    "SPA_VERSION");
87116271Sphkstatic int zfs_version_vdev_boot = VDEV_BOOT_VERSION;
88116271SphkSYSCTL_INT(_vfs_zfs_version, OID_AUTO, vdev_boot, CTLFLAG_RD,
89116271Sphk    &zfs_version_vdev_boot, 0, "VDEV_BOOT_VERSION");
90116271Sphkstatic int zfs_version_zpl = ZPL_VERSION;
91116271SphkSYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
9251852Sbp    "ZPL_VERSION");
9351852Sbp
9451852Sbpstatic int zfs_mount(vfs_t *vfsp);
9551852Sbpstatic int zfs_umount(vfs_t *vfsp, int fflag);
9651852Sbpstatic int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
9751852Sbpstatic int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
9851852Sbpstatic int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
9951852Sbpstatic int zfs_sync(vfs_t *vfsp, int waitfor);
10051852Sbpstatic int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
10151852Sbp    struct ucred **credanonp, int *numsecflavors, int **secflavors);
10251852Sbpstatic int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp);
10351852Sbpstatic void zfs_objset_close(zfsvfs_t *zfsvfs);
10451852Sbpstatic void zfs_freevfs(vfs_t *vfsp);
10551852Sbp
10651852Sbpstatic struct vfsops zfs_vfsops = {
10751852Sbp	.vfs_mount =		zfs_mount,
10851852Sbp	.vfs_unmount =		zfs_umount,
10951852Sbp	.vfs_root =		zfs_root,
11051852Sbp	.vfs_statfs =		zfs_statfs,
11151852Sbp	.vfs_vget =		zfs_vget,
11251852Sbp	.vfs_sync =		zfs_sync,
11351852Sbp	.vfs_checkexp =		zfs_checkexp,
11451852Sbp	.vfs_fhtovp =		zfs_fhtovp,
11552814Sarchie};
11652814Sarchie
11751852SbpVFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
11851852Sbp
11951852Sbp/*
12051852Sbp * We need to keep a count of active fs's.
121111119Simp * This is necessary to prevent our module
12251852Sbp * from being unloaded after a umount -f
12351852Sbp */
12452814Sarchiestatic uint32_t	zfs_active_fs_count = 0;
12552814Sarchie
12651852Sbp/*ARGSUSED*/
12751852Sbpstatic int
12851852Sbpzfs_sync(vfs_t *vfsp, int waitfor)
12951852Sbp{
13051852Sbp
13151852Sbp	/*
13251852Sbp	 * Data integrity is job one.  We don't want a compromised kernel
13351852Sbp	 * writing to the storage pool, so we never sync during panic.
13451852Sbp	 */
13551852Sbp	if (panicstr)
13651852Sbp		return (0);
13751852Sbp
13851852Sbp	if (vfsp != NULL) {
13951852Sbp		/*
14083366Sjulian		 * Sync a specific filesystem.
14151852Sbp		 */
14251852Sbp		zfsvfs_t *zfsvfs = vfsp->vfs_data;
14351852Sbp		int error;
14451852Sbp
14551852Sbp		error = vfs_stdsync(vfsp, waitfor);
14651852Sbp		if (error != 0)
14751852Sbp			return (error);
14851852Sbp
14951852Sbp		ZFS_ENTER(zfsvfs);
15051852Sbp		if (zfsvfs->z_log != NULL)
15151852Sbp			zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
15251852Sbp		else
15351852Sbp			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
15451852Sbp		ZFS_EXIT(zfsvfs);
15551852Sbp	} else {
15651852Sbp		/*
15751852Sbp		 * Sync all ZFS filesystems.  This is what happens when you
15851852Sbp		 * run sync(1M).  Unlike other filesystems, ZFS honors the
15951852Sbp		 * request by waiting for all pools to commit all dirty data.
16051852Sbp		 */
16151852Sbp		spa_sync_allpools();
16251852Sbp	}
16351852Sbp
16451852Sbp	return (0);
16591406Sjhb}
16651852Sbp
16751852Sbpstatic void
16851852Sbpatime_changed_cb(void *arg, uint64_t newval)
16951852Sbp{
17051852Sbp	zfsvfs_t *zfsvfs = arg;
17151852Sbp
17251852Sbp	if (newval == TRUE) {
17351852Sbp		zfsvfs->z_atime = TRUE;
17451852Sbp		zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
17583366Sjulian		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
17651852Sbp		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
17751852Sbp	} else {
178112916Stjr		zfsvfs->z_atime = FALSE;
179112916Stjr		zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
18051852Sbp		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
18151852Sbp		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
18251852Sbp	}
18351852Sbp}
18451852Sbp
18551852Sbpstatic void
18651852Sbpxattr_changed_cb(void *arg, uint64_t newval)
18751852Sbp{
18851852Sbp	zfsvfs_t *zfsvfs = arg;
18951852Sbp
19051852Sbp	if (newval == TRUE) {
19151852Sbp		/* XXX locking on vfs_flag? */
19251852Sbp#ifdef TODO
19351852Sbp		zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
19451852Sbp#endif
19551852Sbp		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
19651852Sbp		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
19751852Sbp	} else {
19851852Sbp		/* XXX locking on vfs_flag? */
19951852Sbp#ifdef TODO
20051852Sbp		zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
20151852Sbp#endif
20251852Sbp		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
20351852Sbp		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
20451852Sbp	}
20551852Sbp}
20651852Sbp
20751852Sbpstatic void
20851852Sbpblksz_changed_cb(void *arg, uint64_t newval)
20951852Sbp{
21051852Sbp	zfsvfs_t *zfsvfs = arg;
211132023Salfred
21251852Sbp	if (newval < SPA_MINBLOCKSIZE ||
21351852Sbp	    newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
21451852Sbp		newval = SPA_MAXBLOCKSIZE;
21551852Sbp
21651852Sbp	zfsvfs->z_max_blksz = newval;
21783366Sjulian	zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
218103936Sjeff}
21951852Sbp
22051852Sbpstatic void
22151852Sbpreadonly_changed_cb(void *arg, uint64_t newval)
22251852Sbp{
22351852Sbp	zfsvfs_t *zfsvfs = arg;
22451852Sbp
22551852Sbp	if (newval) {
22651852Sbp		/* XXX locking on vfs_flag? */
22751852Sbp		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
22851852Sbp		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
22951852Sbp		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
23083366Sjulian	} else {
23151852Sbp		/* XXX locking on vfs_flag? */
23251852Sbp		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
23351852Sbp		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
23451852Sbp		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
23551852Sbp	}
23651852Sbp}
23751852Sbp
23851852Sbpstatic void
23951852Sbpsetuid_changed_cb(void *arg, uint64_t newval)
24076688Siedowse{
241132023Salfred	zfsvfs_t *zfsvfs = arg;
24276688Siedowse
24351852Sbp	if (newval == FALSE) {
24451852Sbp		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
24551852Sbp		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
24691406Sjhb		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
24774062Sbp	} else {
24883366Sjulian		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
24951852Sbp		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
25051852Sbp		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
25151852Sbp	}
25252814Sarchie}
25351852Sbp
25451852Sbpstatic void
25551852Sbpexec_changed_cb(void *arg, uint64_t newval)
25651852Sbp{
25751852Sbp	zfsvfs_t *zfsvfs = arg;
25851852Sbp
25951852Sbp	if (newval == FALSE) {
260132023Salfred		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
26151852Sbp		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
26251852Sbp		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
26351852Sbp	} else {
26451852Sbp		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
26551852Sbp		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
26691406Sjhb		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
26751852Sbp	}
26851852Sbp}
26951852Sbp
27051852Sbp/*
27151852Sbp * The nbmand mount option can be changed at mount time.
27251852Sbp * We can't allow it to be toggled on live file systems or incorrect
27351852Sbp * behavior may be seen from cifs clients
27483366Sjulian *
27566540Sbp * This property isn't registered via dsl_prop_register(), but this callback
27651852Sbp * will be called when a file system is first mounted
27751852Sbp */
27851852Sbpstatic void
27983366Sjuliannbmand_changed_cb(void *arg, uint64_t newval)
28051852Sbp{
28151852Sbp	zfsvfs_t *zfsvfs = arg;
28251852Sbp	if (newval == FALSE) {
28383366Sjulian		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
28451852Sbp		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
28551852Sbp	} else {
28651852Sbp		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
28751852Sbp		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
28851852Sbp	}
28951852Sbp}
29051852Sbp
29151852Sbpstatic void
29251852Sbpsnapdir_changed_cb(void *arg, uint64_t newval)
29351852Sbp{
29451852Sbp	zfsvfs_t *zfsvfs = arg;
29551852Sbp
29651852Sbp	zfsvfs->z_show_ctldir = newval;
29751852Sbp}
29851852Sbp
29951852Sbpstatic void
30051852Sbpvscan_changed_cb(void *arg, uint64_t newval)
30151852Sbp{
30251852Sbp	zfsvfs_t *zfsvfs = arg;
30351852Sbp
30451852Sbp	zfsvfs->z_vscan = newval;
30551852Sbp}
30651852Sbp
30751852Sbpstatic void
30851852Sbpacl_mode_changed_cb(void *arg, uint64_t newval)
30983366Sjulian{
31051852Sbp	zfsvfs_t *zfsvfs = arg;
31151852Sbp
31251852Sbp	zfsvfs->z_acl_mode = newval;
31351852Sbp}
31451852Sbp
31551852Sbpstatic void
31651852Sbpacl_inherit_changed_cb(void *arg, uint64_t newval)
31783366Sjulian{
31851852Sbp	zfsvfs_t *zfsvfs = arg;
31951852Sbp
32051852Sbp	zfsvfs->z_acl_inherit = newval;
32151852Sbp}
32251852Sbp
32351852Sbpstatic int
32451852Sbpzfs_register_callbacks(vfs_t *vfsp)
32583366Sjulian{
32651852Sbp	struct dsl_dataset *ds = NULL;
32751852Sbp	objset_t *os = NULL;
32851852Sbp	zfsvfs_t *zfsvfs = NULL;
32951852Sbp	uint64_t nbmand;
33074064Sbp	int readonly, do_readonly = FALSE;
33151852Sbp	int setuid, do_setuid = FALSE;
33251852Sbp	int exec, do_exec = FALSE;
33351852Sbp	int xattr, do_xattr = FALSE;
33451852Sbp	int atime, do_atime = FALSE;
33551852Sbp	int error = 0;
336101308Sjeff
33751852Sbp	ASSERT(vfsp);
33851852Sbp	zfsvfs = vfsp->vfs_data;
33951852Sbp	ASSERT(zfsvfs);
34051852Sbp	os = zfsvfs->z_os;
34183366Sjulian
34251852Sbp	/*
34351852Sbp	 * This function can be called for a snapshot when we update snapshot's
34451852Sbp	 * mount point, which isn't really supported.
34551852Sbp	 */
34651852Sbp	if (dmu_objset_is_snapshot(os))
34751852Sbp		return (EOPNOTSUPP);
34851852Sbp
34951852Sbp	/*
35051852Sbp	 * The act of registering our callbacks will destroy any mount
35151852Sbp	 * options we may have.  In order to enable temporary overrides
35251852Sbp	 * of mount options, we stash away the current values and
35351852Sbp	 * restore them after we register the callbacks.
35451852Sbp	 */
35551852Sbp	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
35683366Sjulian		readonly = B_TRUE;
35751852Sbp		do_readonly = B_TRUE;
35851852Sbp	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
35983366Sjulian		readonly = B_FALSE;
36051852Sbp		do_readonly = B_TRUE;
36151852Sbp	}
36251852Sbp	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
36351852Sbp		setuid = B_FALSE;
36451852Sbp		do_setuid = B_TRUE;
36551852Sbp	} else {
36651852Sbp		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
36751852Sbp			setuid = B_FALSE;
36851852Sbp			do_setuid = B_TRUE;
36951852Sbp		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
37083366Sjulian			setuid = B_TRUE;
37151852Sbp			do_setuid = B_TRUE;
37251852Sbp		}
37351852Sbp	}
37451852Sbp	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
37583366Sjulian		exec = B_FALSE;
37651852Sbp		do_exec = B_TRUE;
37751852Sbp	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
37851852Sbp		exec = B_TRUE;
37951852Sbp		do_exec = B_TRUE;
38051852Sbp	}
38151852Sbp	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
38251852Sbp		xattr = B_FALSE;
38351852Sbp		do_xattr = B_TRUE;
38451852Sbp	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
38551852Sbp		xattr = B_TRUE;
38651852Sbp		do_xattr = B_TRUE;
38751852Sbp	}
38851852Sbp	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
38951852Sbp		atime = B_FALSE;
39051852Sbp		do_atime = B_TRUE;
39151852Sbp	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
39251852Sbp		atime = B_TRUE;
39351852Sbp		do_atime = B_TRUE;
39451852Sbp	}
39551852Sbp
39651852Sbp	/*
39751852Sbp	 * nbmand is a special property.  It can only be changed at
39851852Sbp	 * mount time.
39951852Sbp	 *
40051852Sbp	 * This is weird, but it is documented to only be changeable
40151852Sbp	 * at mount time.
40251852Sbp	 */
40351852Sbp	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
40451852Sbp		nbmand = B_FALSE;
40583366Sjulian	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
40651852Sbp		nbmand = B_TRUE;
40766539Sbp	} else {
40883366Sjulian		char osname[MAXNAMELEN];
40951852Sbp
41051852Sbp		dmu_objset_name(os, osname);
41151852Sbp		if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
41251852Sbp		    NULL)) {
41351852Sbp			return (error);
41451852Sbp		}
41551852Sbp	}
41683366Sjulian
41791406Sjhb	/*
41851852Sbp	 * Register property callbacks.
41951852Sbp	 *
42096755Strhodes	 * It would probably be fine to just check for i/o error from
42151852Sbp	 * the first prop_register(), but I guess I like to go
42251852Sbp	 * overboard...
42351852Sbp	 */
42496755Strhodes	ds = dmu_objset_ds(os);
42551852Sbp	error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
42651852Sbp	error = error ? error : dsl_prop_register(ds,
42751852Sbp	    "xattr", xattr_changed_cb, zfsvfs);
42851852Sbp	error = error ? error : dsl_prop_register(ds,
42951852Sbp	    "recordsize", blksz_changed_cb, zfsvfs);
43096755Strhodes	error = error ? error : dsl_prop_register(ds,
43151852Sbp	    "readonly", readonly_changed_cb, zfsvfs);
43251852Sbp	error = error ? error : dsl_prop_register(ds,
43351852Sbp	    "setuid", setuid_changed_cb, zfsvfs);
43451852Sbp	error = error ? error : dsl_prop_register(ds,
43551852Sbp	    "exec", exec_changed_cb, zfsvfs);
43696755Strhodes	error = error ? error : dsl_prop_register(ds,
43751852Sbp	    "snapdir", snapdir_changed_cb, zfsvfs);
43851852Sbp	error = error ? error : dsl_prop_register(ds,
43951852Sbp	    "aclmode", acl_mode_changed_cb, zfsvfs);
44051852Sbp	error = error ? error : dsl_prop_register(ds,
44151852Sbp	    "aclinherit", acl_inherit_changed_cb, zfsvfs);
44251852Sbp	error = error ? error : dsl_prop_register(ds,
44351852Sbp	    "vscan", vscan_changed_cb, zfsvfs);
44451852Sbp	if (error)
445		goto unregister;
446
447	/*
448	 * Invoke our callbacks to restore temporary mount options.
449	 */
450	if (do_readonly)
451		readonly_changed_cb(zfsvfs, readonly);
452	if (do_setuid)
453		setuid_changed_cb(zfsvfs, setuid);
454	if (do_exec)
455		exec_changed_cb(zfsvfs, exec);
456	if (do_xattr)
457		xattr_changed_cb(zfsvfs, xattr);
458	if (do_atime)
459		atime_changed_cb(zfsvfs, atime);
460
461	nbmand_changed_cb(zfsvfs, nbmand);
462
463	return (0);
464
465unregister:
466	/*
467	 * We may attempt to unregister some callbacks that are not
468	 * registered, but this is OK; it will simply return ENOMSG,
469	 * which we will ignore.
470	 */
471	(void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
472	(void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
473	(void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
474	(void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
475	(void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
476	(void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
477	(void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
478	(void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
479	(void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
480	    zfsvfs);
481	(void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
482	return (error);
483
484}
485
486static int
487zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
488{
489	int error;
490
491	error = zfs_register_callbacks(zfsvfs->z_vfs);
492	if (error)
493		return (error);
494
495	/*
496	 * Set the objset user_ptr to track its zfsvfs.
497	 */
498	mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
499	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
500	mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
501
502	/*
503	 * If we are not mounting (ie: online recv), then we don't
504	 * have to worry about replaying the log as we blocked all
505	 * operations out since we closed the ZIL.
506	 */
507	if (mounting) {
508		boolean_t readonly;
509
510		/*
511		 * During replay we remove the read only flag to
512		 * allow replays to succeed.
513		 */
514		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
515		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
516
517		/*
518		 * Parse and replay the intent log.
519		 */
520		zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
521		    zfs_replay_vector, zfs_unlinked_drain);
522
523		zfs_unlinked_drain(zfsvfs);
524		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
525	}
526
527	if (!zil_disable)
528		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
529
530	return (0);
531}
532
533static void
534zfs_freezfsvfs(zfsvfs_t *zfsvfs)
535{
536	mutex_destroy(&zfsvfs->z_znodes_lock);
537	mutex_destroy(&zfsvfs->z_online_recv_lock);
538	list_destroy(&zfsvfs->z_all_znodes);
539	rrw_destroy(&zfsvfs->z_teardown_lock);
540	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
541	rw_destroy(&zfsvfs->z_fuid_lock);
542	kmem_free(zfsvfs, sizeof (zfsvfs_t));
543}
544
545static int
546zfs_domount(vfs_t *vfsp, char *osname)
547{
548	uint64_t recordsize, readonly;
549	int error = 0;
550	int mode;
551	zfsvfs_t *zfsvfs;
552	znode_t *zp = NULL;
553
554	ASSERT(vfsp);
555	ASSERT(osname);
556
557	/*
558	 * Initialize the zfs-specific filesystem structure.
559	 * Should probably make this a kmem cache, shuffle fields,
560	 * and just bzero up to z_hold_mtx[].
561	 */
562	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
563	zfsvfs->z_vfs = vfsp;
564	zfsvfs->z_parent = zfsvfs;
565	zfsvfs->z_assign = TXG_NOWAIT;
566	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
567	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
568
569	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
570	mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL);
571	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
572	    offsetof(znode_t, z_link_node));
573	rrw_init(&zfsvfs->z_teardown_lock);
574	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
575	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
576
577	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
578	    NULL))
579		goto out;
580	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
581	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
582
583	vfsp->vfs_data = zfsvfs;
584	vfsp->mnt_flag |= MNT_LOCAL;
585	vfsp->mnt_kern_flag |= MNTK_MPSAFE;
586	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
587	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
588
589	if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
590		goto out;
591
592	mode = DS_MODE_OWNER;
593	if (readonly)
594		mode |= DS_MODE_READONLY;
595
596	error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
597	if (error == EROFS) {
598		mode = DS_MODE_OWNER | DS_MODE_READONLY;
599		error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
600		    &zfsvfs->z_os);
601	}
602
603	if (error)
604		goto out;
605
606	if (error = zfs_init_fs(zfsvfs, &zp))
607		goto out;
608
609	/*
610	 * Set features for file system.
611	 */
612	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
613	if (zfsvfs->z_use_fuids) {
614		vfs_set_feature(vfsp, VFSFT_XVATTR);
615		vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS);
616		vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS);
617		vfs_set_feature(vfsp, VFSFT_ACLONCREATE);
618	}
619	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
620		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
621		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
622		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
623	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
624		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
625		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
626	}
627
628	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
629		uint64_t pval;
630
631		ASSERT(mode & DS_MODE_READONLY);
632		atime_changed_cb(zfsvfs, B_FALSE);
633		readonly_changed_cb(zfsvfs, B_TRUE);
634		if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
635			goto out;
636		xattr_changed_cb(zfsvfs, pval);
637		zfsvfs->z_issnap = B_TRUE;
638	} else {
639		error = zfsvfs_setup(zfsvfs, B_TRUE);
640	}
641
642	vfs_mountedfrom(vfsp, osname);
643
644	if (!zfsvfs->z_issnap)
645		zfsctl_create(zfsvfs);
646out:
647	if (error) {
648		if (zfsvfs->z_os)
649			dmu_objset_close(zfsvfs->z_os);
650		zfs_freezfsvfs(zfsvfs);
651	} else {
652		atomic_add_32(&zfs_active_fs_count, 1);
653	}
654
655	return (error);
656}
657
658void
659zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
660{
661	objset_t *os = zfsvfs->z_os;
662	struct dsl_dataset *ds;
663
664	/*
665	 * Unregister properties.
666	 */
667	if (!dmu_objset_is_snapshot(os)) {
668		ds = dmu_objset_ds(os);
669		VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
670		    zfsvfs) == 0);
671
672		VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
673		    zfsvfs) == 0);
674
675		VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
676		    zfsvfs) == 0);
677
678		VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
679		    zfsvfs) == 0);
680
681		VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
682		    zfsvfs) == 0);
683
684		VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
685		    zfsvfs) == 0);
686
687		VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
688		    zfsvfs) == 0);
689
690		VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
691		    zfsvfs) == 0);
692
693		VERIFY(dsl_prop_unregister(ds, "aclinherit",
694		    acl_inherit_changed_cb, zfsvfs) == 0);
695
696		VERIFY(dsl_prop_unregister(ds, "vscan",
697		    vscan_changed_cb, zfsvfs) == 0);
698	}
699}
700
701/*ARGSUSED*/
702static int
703zfs_mount(vfs_t *vfsp)
704{
705	kthread_t	*td = curthread;
706	vnode_t		*mvp = vfsp->mnt_vnodecovered;
707	cred_t		*cr = td->td_ucred;
708	char		*osname;
709	int		error = 0;
710	int		canwrite;
711
712	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
713		return (EINVAL);
714
715	/*
716	 * If full-owner-access is enabled and delegated administration is
717	 * turned on, we must set nosuid.
718	 */
719	if (zfs_super_owner &&
720	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
721		secpolicy_fs_mount_clearopts(cr, vfsp);
722	}
723
724	/*
725	 * Check for mount privilege?
726	 *
727	 * If we don't have privilege then see if
728	 * we have local permission to allow it
729	 */
730	error = secpolicy_fs_mount(cr, mvp, vfsp);
731	if (error) {
732		error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr);
733		if (error != 0)
734			goto out;
735
736		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
737			vattr_t		vattr;
738
739			/*
740			 * Make sure user is the owner of the mount point
741			 * or has sufficient privileges.
742			 */
743
744			vattr.va_mask = AT_UID;
745
746			vn_lock(mvp, LK_SHARED | LK_RETRY);
747			if (error = VOP_GETATTR(mvp, &vattr, cr)) {
748				VOP_UNLOCK(mvp, 0);
749				goto out;
750			}
751
752#if 0 /* CHECK THIS! Is probably needed for zfs_suser. */
753			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
754			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
755				error = EPERM;
756				goto out;
757			}
758#else
759			if (error = secpolicy_vnode_owner(mvp, cr, vattr.va_uid)) {
760				VOP_UNLOCK(mvp, 0);
761				goto out;
762			}
763
764			if (error = VOP_ACCESS(mvp, VWRITE, cr, td)) {
765				VOP_UNLOCK(mvp, 0);
766				goto out;
767			}
768			VOP_UNLOCK(mvp, 0);
769#endif
770		}
771
772		secpolicy_fs_mount_clearopts(cr, vfsp);
773	}
774
775	/*
776	 * Refuse to mount a filesystem if we are in a local zone and the
777	 * dataset is not visible.
778	 */
779	if (!INGLOBALZONE(curthread) &&
780	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
781		error = EPERM;
782		goto out;
783	}
784
785	/*
786	 * When doing a remount, we simply refresh our temporary properties
787	 * according to those options set in the current VFS options.
788	 */
789	if (vfsp->vfs_flag & MS_REMOUNT) {
790		/* refresh mount options */
791		zfs_unregister_callbacks(vfsp->vfs_data);
792		error = zfs_register_callbacks(vfsp);
793		goto out;
794	}
795
796	DROP_GIANT();
797	error = zfs_domount(vfsp, osname);
798	PICKUP_GIANT();
799out:
800	return (error);
801}
802
803static int
804zfs_statfs(vfs_t *vfsp, struct statfs *statp)
805{
806	zfsvfs_t *zfsvfs = vfsp->vfs_data;
807	uint64_t refdbytes, availbytes, usedobjs, availobjs;
808
809	statp->f_version = STATFS_VERSION;
810
811	ZFS_ENTER(zfsvfs);
812
813	dmu_objset_space(zfsvfs->z_os,
814	    &refdbytes, &availbytes, &usedobjs, &availobjs);
815
816	/*
817	 * The underlying storage pool actually uses multiple block sizes.
818	 * We report the fragsize as the smallest block size we support,
819	 * and we report our blocksize as the filesystem's maximum blocksize.
820	 */
821	statp->f_bsize = SPA_MINBLOCKSIZE;
822	statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
823
824	/*
825	 * The following report "total" blocks of various kinds in the
826	 * file system, but reported in terms of f_frsize - the
827	 * "fragment" size.
828	 */
829
830	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
831	statp->f_bfree = availbytes / statp->f_bsize;
832	statp->f_bavail = statp->f_bfree; /* no root reservation */
833
834	/*
835	 * statvfs() should really be called statufs(), because it assumes
836	 * static metadata.  ZFS doesn't preallocate files, so the best
837	 * we can do is report the max that could possibly fit in f_files,
838	 * and that minus the number actually used in f_ffree.
839	 * For f_ffree, report the smaller of the number of object available
840	 * and the number of blocks (each object will take at least a block).
841	 */
842	statp->f_ffree = MIN(availobjs, statp->f_bfree);
843	statp->f_files = statp->f_ffree + usedobjs;
844
845	/*
846	 * We're a zfs filesystem.
847	 */
848	(void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));
849
850	strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
851	    sizeof(statp->f_mntfromname));
852	strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
853	    sizeof(statp->f_mntonname));
854
855	statp->f_namemax = ZFS_MAXNAMELEN;
856
857	ZFS_EXIT(zfsvfs);
858	return (0);
859}
860
861static int
862zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
863{
864	zfsvfs_t *zfsvfs = vfsp->vfs_data;
865	znode_t *rootzp;
866	int error;
867
868	ZFS_ENTER_NOERROR(zfsvfs);
869
870	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
871
872	ZFS_EXIT(zfsvfs);
873
874	if (error == 0) {
875		*vpp = ZTOV(rootzp);
876		error = vn_lock(*vpp, flags);
877		(*vpp)->v_vflag |= VV_ROOT;
878	}
879
880	return (error);
881}
882
883/*
884 * Teardown the zfsvfs::z_os.
885 *
886 * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock'
887 * and 'z_teardown_inactive_lock' held.
888 */
889static int
890zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
891{
892	znode_t	*zp;
893
894	rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
895
896	if (!unmounting) {
897		/*
898		 * We purge the parent filesystem's vfsp as the parent
899		 * filesystem and all of its snapshots have their vnode's
900		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
901		 * 'z_parent' is self referential for non-snapshots.
902		 */
903		(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
904#ifdef FREEBSD_NAMECACHE
905		cache_purgevfs(zfsvfs->z_parent->z_vfs);
906#endif
907	}
908
909	/*
910	 * Close the zil. NB: Can't close the zil while zfs_inactive
911	 * threads are blocked as zil_close can call zfs_inactive.
912	 */
913	if (zfsvfs->z_log) {
914		zil_close(zfsvfs->z_log);
915		zfsvfs->z_log = NULL;
916	}
917
918	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
919
920	/*
921	 * If we are not unmounting (ie: online recv) and someone already
922	 * unmounted this file system while we were doing the switcheroo,
923	 * or a reopen of z_os failed then just bail out now.
924	 */
925	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
926		rw_exit(&zfsvfs->z_teardown_inactive_lock);
927		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
928		return (EIO);
929	}
930
931	/*
932	 * At this point there are no vops active, and any new vops will
933	 * fail with EIO since we have z_teardown_lock for writer (only
934	 * relavent for forced unmount).
935	 *
936	 * Release all holds on dbufs.
937	 */
938	mutex_enter(&zfsvfs->z_znodes_lock);
939	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
940	    zp = list_next(&zfsvfs->z_all_znodes, zp))
941		if (zp->z_dbuf) {
942			ASSERT(ZTOV(zp)->v_count >= 0);
943			zfs_znode_dmu_fini(zp);
944		}
945	mutex_exit(&zfsvfs->z_znodes_lock);
946
947	/*
948	 * If we are unmounting, set the unmounted flag and let new vops
949	 * unblock.  zfs_inactive will have the unmounted behavior, and all
950	 * other vops will fail with EIO.
951	 */
952	if (unmounting) {
953		zfsvfs->z_unmounted = B_TRUE;
954		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
955		rw_exit(&zfsvfs->z_teardown_inactive_lock);
956
957#ifdef __FreeBSD__
958		/*
959		 * Some znodes might not be fully reclaimed, wait for them.
960		 */
961		mutex_enter(&zfsvfs->z_znodes_lock);
962		while (list_head(&zfsvfs->z_all_znodes) != NULL) {
963			msleep(zfsvfs, &zfsvfs->z_znodes_lock, 0,
964			    "zteardown", 0);
965		}
966		mutex_exit(&zfsvfs->z_znodes_lock);
967#endif
968	}
969
970	/*
971	 * z_os will be NULL if there was an error in attempting to reopen
972	 * zfsvfs, so just return as the properties had already been
973	 * unregistered and cached data had been evicted before.
974	 */
975	if (zfsvfs->z_os == NULL)
976		return (0);
977
978	/*
979	 * Unregister properties.
980	 */
981	zfs_unregister_callbacks(zfsvfs);
982
983	/*
984	 * Evict cached data
985	 */
986	if (dmu_objset_evict_dbufs(zfsvfs->z_os)) {
987		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
988		(void) dmu_objset_evict_dbufs(zfsvfs->z_os);
989	}
990
991	return (0);
992}
993
994/*ARGSUSED*/
995static int
996zfs_umount(vfs_t *vfsp, int fflag)
997{
998	zfsvfs_t *zfsvfs = vfsp->vfs_data;
999	objset_t *os;
1000	cred_t *cr = curthread->td_ucred;
1001	int ret;
1002
1003	ret = secpolicy_fs_unmount(cr, vfsp);
1004	if (ret) {
1005		ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
1006		    ZFS_DELEG_PERM_MOUNT, cr);
1007		if (ret)
1008			return (ret);
1009	}
1010	/*
1011	 * We purge the parent filesystem's vfsp as the parent filesystem
1012	 * and all of its snapshots have their vnode's v_vfsp set to the
1013	 * parent's filesystem's vfsp.  Note, 'z_parent' is self
1014	 * referential for non-snapshots.
1015	 */
1016	(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1017
1018	/*
1019	 * Unmount any snapshots mounted under .zfs before unmounting the
1020	 * dataset itself.
1021	 */
1022	if (zfsvfs->z_ctldir != NULL) {
1023		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
1024			return (ret);
1025		ret = vflush(vfsp, 0, 0, curthread);
1026		ASSERT(ret == EBUSY);
1027		if (!(fflag & MS_FORCE)) {
1028			if (zfsvfs->z_ctldir->v_count > 1)
1029				return (EBUSY);
1030			ASSERT(zfsvfs->z_ctldir->v_count == 1);
1031		}
1032		zfsctl_destroy(zfsvfs);
1033		ASSERT(zfsvfs->z_ctldir == NULL);
1034	}
1035
1036	if (fflag & MS_FORCE) {
1037		/*
1038		 * Mark file system as unmounted before calling
1039		 * vflush(FORCECLOSE). This way we ensure no future vnops
1040		 * will be called and risk operating on DOOMED vnodes.
1041		 */
1042		rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
1043		zfsvfs->z_unmounted = B_TRUE;
1044		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1045	}
1046
1047	/*
1048	 * Flush all the files.
1049	 */
1050	ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, curthread);
1051	if (ret != 0) {
1052		if (!zfsvfs->z_issnap) {
1053			zfsctl_create(zfsvfs);
1054			ASSERT(zfsvfs->z_ctldir != NULL);
1055		}
1056		return (ret);
1057	}
1058
1059	if (!(fflag & MS_FORCE)) {
1060		/*
1061		 * Check the number of active vnodes in the file system.
1062		 * Our count is maintained in the vfs structure, but the
1063		 * number is off by 1 to indicate a hold on the vfs
1064		 * structure itself.
1065		 *
1066		 * The '.zfs' directory maintains a reference of its
1067		 * own, and any active references underneath are
1068		 * reflected in the vnode count.
1069		 */
1070		if (zfsvfs->z_ctldir == NULL) {
1071			if (vfsp->vfs_count > 1)
1072				return (EBUSY);
1073		} else {
1074			if (vfsp->vfs_count > 2 ||
1075			    zfsvfs->z_ctldir->v_count > 1)
1076				return (EBUSY);
1077		}
1078	} else {
1079		MNT_ILOCK(vfsp);
1080		vfsp->mnt_kern_flag |= MNTK_UNMOUNTF;
1081		MNT_IUNLOCK(vfsp);
1082	}
1083
1084	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
1085	os = zfsvfs->z_os;
1086
1087	/*
1088	 * z_os will be NULL if there was an error in
1089	 * attempting to reopen zfsvfs.
1090	 */
1091	if (os != NULL) {
1092		/*
1093		 * Unset the objset user_ptr.
1094		 */
1095		mutex_enter(&os->os->os_user_ptr_lock);
1096		dmu_objset_set_user(os, NULL);
1097		mutex_exit(&os->os->os_user_ptr_lock);
1098
1099		/*
1100		 * Finally release the objset
1101		 */
1102		dmu_objset_close(os);
1103	}
1104
1105	/*
1106	 * We can now safely destroy the '.zfs' directory node.
1107	 */
1108	if (zfsvfs->z_ctldir != NULL)
1109		zfsctl_destroy(zfsvfs);
1110	if (zfsvfs->z_issnap) {
1111		vnode_t *svp = vfsp->mnt_vnodecovered;
1112
1113		if (svp->v_count >= 2)
1114			VN_RELE(svp);
1115	}
1116	zfs_freevfs(vfsp);
1117
1118	return (0);
1119}
1120
1121static int
1122zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
1123{
1124	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
1125	znode_t		*zp;
1126	int 		err;
1127
1128	/*
1129	 * XXXPJD: zfs_zget() can't operate on virtual entires like .zfs/ or
1130	 * .zfs/snapshot/ directories, so for now just return EOPNOTSUPP.
1131	 * This will make NFS to fall back to using READDIR instead of
1132	 * READDIRPLUS.
1133	 * Also snapshots are stored in AVL tree, but based on their names,
1134	 * not inode numbers, so it will be very inefficient to iterate
1135	 * over all snapshots to find the right one.
1136	 * Note that OpenSolaris READDIRPLUS implementation does LOOKUP on
1137	 * d_name, and not VGET on d_fileno as we do.
1138	 */
1139	if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR)
1140		return (EOPNOTSUPP);
1141
1142	ZFS_ENTER(zfsvfs);
1143	err = zfs_zget(zfsvfs, ino, &zp);
1144	if (err == 0 && zp->z_unlinked) {
1145		VN_RELE(ZTOV(zp));
1146		err = EINVAL;
1147	}
1148	ZFS_EXIT(zfsvfs);
1149	if (err != 0)
1150		*vpp = NULL;
1151	else {
1152		*vpp = ZTOV(zp);
1153		vn_lock(*vpp, flags);
1154	}
1155	return (err);
1156}
1157
1158static int
1159zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
1160    struct ucred **credanonp, int *numsecflavors, int **secflavors)
1161{
1162	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1163
1164	/*
1165	 * If this is regular file system vfsp is the same as
1166	 * zfsvfs->z_parent->z_vfs, but if it is snapshot,
1167	 * zfsvfs->z_parent->z_vfs represents parent file system
1168	 * which we have to use here, because only this file system
1169	 * has mnt_export configured.
1170	 */
1171	vfsp = zfsvfs->z_parent->z_vfs;
1172
1173	return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
1174	    credanonp, numsecflavors, secflavors));
1175}
1176
1177CTASSERT(SHORT_FID_LEN <= sizeof(struct fid));
1178CTASSERT(LONG_FID_LEN <= sizeof(struct fid));
1179
1180static int
1181zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
1182{
1183	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
1184	znode_t		*zp;
1185	uint64_t	object = 0;
1186	uint64_t	fid_gen = 0;
1187	uint64_t	gen_mask;
1188	uint64_t	zp_gen;
1189	int		i, err;
1190
1191	*vpp = NULL;
1192
1193	ZFS_ENTER(zfsvfs);
1194
1195	/*
1196	 * On FreeBSD we can get snapshot's mount point or its parent file
1197	 * system mount point depending if snapshot is already mounted or not.
1198	 */
1199	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
1200		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
1201		uint64_t	objsetid = 0;
1202		uint64_t	setgen = 0;
1203
1204		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
1205			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
1206
1207		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
1208			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
1209
1210		ZFS_EXIT(zfsvfs);
1211
1212		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
1213		if (err)
1214			return (EINVAL);
1215		ZFS_ENTER(zfsvfs);
1216	}
1217
1218	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
1219		zfid_short_t	*zfid = (zfid_short_t *)fidp;
1220
1221		for (i = 0; i < sizeof (zfid->zf_object); i++)
1222			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
1223
1224		for (i = 0; i < sizeof (zfid->zf_gen); i++)
1225			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
1226	} else {
1227		ZFS_EXIT(zfsvfs);
1228		return (EINVAL);
1229	}
1230
1231	/* A zero fid_gen means we are in the .zfs control directories */
1232	if (fid_gen == 0 &&
1233	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
1234		*vpp = zfsvfs->z_ctldir;
1235		ASSERT(*vpp != NULL);
1236		if (object == ZFSCTL_INO_SNAPDIR) {
1237			VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
1238			    0, NULL, NULL, NULL, NULL, NULL) == 0);
1239		} else {
1240			VN_HOLD(*vpp);
1241		}
1242		ZFS_EXIT(zfsvfs);
1243		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
1244		return (0);
1245	}
1246
1247	gen_mask = -1ULL >> (64 - 8 * i);
1248
1249	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
1250	if (err = zfs_zget(zfsvfs, object, &zp)) {
1251		ZFS_EXIT(zfsvfs);
1252		return (err);
1253	}
1254	zp_gen = zp->z_phys->zp_gen & gen_mask;
1255	if (zp_gen == 0)
1256		zp_gen = 1;
1257	if (zp->z_unlinked || zp_gen != fid_gen) {
1258		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
1259		VN_RELE(ZTOV(zp));
1260		ZFS_EXIT(zfsvfs);
1261		return (EINVAL);
1262	}
1263
1264	ZFS_EXIT(zfsvfs);
1265
1266	*vpp = ZTOV(zp);
1267	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
1268	vnode_create_vobject(*vpp, zp->z_phys->zp_size, curthread);
1269	return (0);
1270}
1271
1272/*
1273 * Block out VOPs and close zfsvfs_t::z_os
1274 *
1275 * Note, if successful, then we return with the 'z_teardown_lock' and
1276 * 'z_teardown_inactive_lock' write held.
1277 */
1278int
1279zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode)
1280{
1281	int error;
1282
1283	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
1284		return (error);
1285
1286	*mode = zfsvfs->z_os->os_mode;
1287	dmu_objset_name(zfsvfs->z_os, name);
1288	dmu_objset_close(zfsvfs->z_os);
1289
1290	return (0);
1291}
1292
1293/*
1294 * Reopen zfsvfs_t::z_os and release VOPs.
1295 */
1296int
1297zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode)
1298{
1299	int err;
1300
1301	ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
1302	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
1303
1304	err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
1305	if (err) {
1306		zfsvfs->z_os = NULL;
1307	} else {
1308		znode_t *zp;
1309
1310		VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
1311
1312		/*
1313		 * Attempt to re-establish all the active znodes with
1314		 * their dbufs.  If a zfs_rezget() fails, then we'll let
1315		 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
1316		 * when they try to use their znode.
1317		 */
1318		mutex_enter(&zfsvfs->z_znodes_lock);
1319		for (zp = list_head(&zfsvfs->z_all_znodes); zp;
1320		    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
1321			(void) zfs_rezget(zp);
1322		}
1323		mutex_exit(&zfsvfs->z_znodes_lock);
1324
1325	}
1326
1327	/* release the VOPs */
1328	rw_exit(&zfsvfs->z_teardown_inactive_lock);
1329	rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1330
1331	if (err) {
1332		/*
1333		 * Since we couldn't reopen zfsvfs::z_os, force
1334		 * unmount this file system.
1335		 */
1336		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
1337			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
1338	}
1339	return (err);
1340}
1341
1342static void
1343zfs_freevfs(vfs_t *vfsp)
1344{
1345	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1346	int i;
1347
1348	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1349		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1350
1351	zfs_fuid_destroy(zfsvfs);
1352	zfs_freezfsvfs(zfsvfs);
1353
1354	atomic_add_32(&zfs_active_fs_count, -1);
1355}
1356
#ifdef __i386__
/* desiredvnodes as it was when zfs_vnodes_adjust() ran. */
static int desiredvnodes_backup;
#endif

/*
 * On i386, remember the current desiredvnodes and, if it still has the
 * value vntblinit() would have computed, reduce it to three quarters of
 * that value.  Does nothing on other architectures.
 */
static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
	int untuned;

	desiredvnodes_backup = desiredvnodes;

	/*
	 * Recompute the value the same way vntblinit() does.  If it matches
	 * desiredvnodes, the administrator has not tuned it and we are free
	 * to tune it down.
	 */
	untuned = min(maxproc + cnt.v_page_count / 4, 2 *
	    vm_kmem_size / (5 * (sizeof(struct vm_object) +
	    sizeof(struct vnode))));
	if (desiredvnodes == untuned)
		desiredvnodes = (3 * untuned) / 4;
#endif
}
1381
/*
 * On i386, put desiredvnodes back to the value saved in
 * desiredvnodes_backup.  Does nothing on other architectures.
 */
static void
zfs_vnodes_adjust_back(void)
{
#ifdef __i386__
	desiredvnodes = desiredvnodes_backup;
#endif
}
1390
1391void
1392zfs_init(void)
1393{
1394
1395	printf("ZFS filesystem version " ZPL_VERSION_STRING "\n");
1396
1397	/*
1398	 * Initialize znode cache, vnode ops, etc...
1399	 */
1400	zfs_znode_init();
1401
1402	/*
1403	 * Initialize .zfs directory structures
1404	 */
1405	zfsctl_init();
1406
1407	/*
1408	 * Reduce number of vnode. Originally number of vnodes is calculated
1409	 * with UFS inode in mind. We reduce it here, because it's too big for
1410	 * ZFS/i386.
1411	 */
1412	zfs_vnodes_adjust();
1413}
1414
/*
 * Module teardown: shut down the .zfs control directory support and the
 * znode layer, then restore the vnode limit.
 */
void
zfs_fini(void)
{
	/* .zfs directory structures. */
	zfsctl_fini();

	/* Znode cache, vnode ops, etc. */
	zfs_znode_fini();

	/* Restore the vnode limit saved by zfs_vnodes_adjust(). */
	zfs_vnodes_adjust_back();
}
1422
1423int
1424zfs_busy(void)
1425{
1426	return (zfs_active_fs_count != 0);
1427}
1428
1429int
1430zfs_set_version(const char *name, uint64_t newvers)
1431{
1432	int error;
1433	objset_t *os;
1434	dmu_tx_t *tx;
1435	uint64_t curvers;
1436
1437	/*
1438	 * XXX for now, require that the filesystem be unmounted.  Would
1439	 * be nice to find the zfsvfs_t and just update that if
1440	 * possible.
1441	 */
1442
1443	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
1444		return (EINVAL);
1445
1446	error = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_OWNER, &os);
1447	if (error)
1448		return (error);
1449
1450	error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
1451	    8, 1, &curvers);
1452	if (error)
1453		goto out;
1454	if (newvers < curvers) {
1455		error = EINVAL;
1456		goto out;
1457	}
1458
1459	tx = dmu_tx_create(os);
1460	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 0, ZPL_VERSION_STR);
1461	error = dmu_tx_assign(tx, TXG_WAIT);
1462	if (error) {
1463		dmu_tx_abort(tx);
1464		goto out;
1465	}
1466	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1,
1467	    &newvers, tx);
1468
1469	spa_history_internal_log(LOG_DS_UPGRADE,
1470	    dmu_objset_spa(os), tx, CRED(),
1471	    "oldver=%llu newver=%llu dataset = %llu", curvers, newvers,
1472	    dmu_objset_id(os));
1473	dmu_tx_commit(tx);
1474
1475out:
1476	dmu_objset_close(os);
1477	return (error);
1478}
1479/*
1480 * Read a property stored within the master node.
1481 */
1482int
1483zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
1484{
1485	const char *pname;
1486	int error = ENOENT;
1487
1488	/*
1489	 * Look up the file system's value for the property.  For the
1490	 * version property, we look up a slightly different string.
1491	 */
1492	if (prop == ZFS_PROP_VERSION)
1493		pname = ZPL_VERSION_STR;
1494	else
1495		pname = zfs_prop_to_name(prop);
1496
1497	if (os != NULL)
1498		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
1499
1500	if (error == ENOENT) {
1501		/* No value set, use the default value */
1502		switch (prop) {
1503		case ZFS_PROP_VERSION:
1504			*value = ZPL_VERSION;
1505			break;
1506		case ZFS_PROP_NORMALIZE:
1507		case ZFS_PROP_UTF8ONLY:
1508			*value = 0;
1509			break;
1510		case ZFS_PROP_CASE:
1511			*value = ZFS_CASE_SENSITIVE;
1512			break;
1513		default:
1514			return (error);
1515		}
1516		error = 0;
1517	}
1518	return (error);
1519}
1520