zfs_vfsops.c revision 209230
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <sys/types.h>
27#include <sys/param.h>
28#include <sys/systm.h>
29#include <sys/kernel.h>
30#include <sys/sysmacros.h>
31#include <sys/kmem.h>
32#include <sys/acl.h>
33#include <sys/vnode.h>
34#include <sys/vfs.h>
35#include <sys/mntent.h>
36#include <sys/mount.h>
37#include <sys/cmn_err.h>
38#include <sys/zfs_znode.h>
39#include <sys/zfs_dir.h>
40#include <sys/zil.h>
41#include <sys/fs/zfs.h>
42#include <sys/dmu.h>
43#include <sys/dsl_prop.h>
44#include <sys/dsl_dataset.h>
45#include <sys/dsl_deleg.h>
46#include <sys/spa.h>
47#include <sys/zap.h>
48#include <sys/varargs.h>
49#include <sys/policy.h>
50#include <sys/atomic.h>
51#include <sys/zfs_ioctl.h>
52#include <sys/zfs_ctldir.h>
53#include <sys/zfs_fuid.h>
54#include <sys/sunddi.h>
55#include <sys/dnlc.h>
56#include <sys/dmu_objset.h>
57#include <sys/spa_boot.h>
58#include <sys/vdev_impl.h>	/* VDEV_BOOT_VERSION */
59
60struct mtx zfs_debug_mtx;
61MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
62
63SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
64
65int zfs_super_owner = 0;
66SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
67    "File system owner can perform privileged operation on his file systems");
68
69int zfs_debug_level = 0;
70TUNABLE_INT("vfs.zfs.debug", &zfs_debug_level);
71SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0,
72    "Debug level");
73
74SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
75static int zfs_version_acl = ZFS_ACL_VERSION;
76SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
77    "ZFS_ACL_VERSION");
78static int zfs_version_dmu_backup_header = DMU_BACKUP_HEADER_VERSION;
79SYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_header, CTLFLAG_RD,
80    &zfs_version_dmu_backup_header, 0, "DMU_BACKUP_HEADER_VERSION");
81static int zfs_version_dmu_backup_stream = DMU_BACKUP_STREAM_VERSION;
82SYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_stream, CTLFLAG_RD,
83    &zfs_version_dmu_backup_stream, 0, "DMU_BACKUP_STREAM_VERSION");
84static int zfs_version_spa = SPA_VERSION;
85SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
86    "SPA_VERSION");
87static int zfs_version_vdev_boot = VDEV_BOOT_VERSION;
88SYSCTL_INT(_vfs_zfs_version, OID_AUTO, vdev_boot, CTLFLAG_RD,
89    &zfs_version_vdev_boot, 0, "VDEV_BOOT_VERSION");
90static int zfs_version_zpl = ZPL_VERSION;
91SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
92    "ZPL_VERSION");
93
94static int zfs_mount(vfs_t *vfsp);
95static int zfs_umount(vfs_t *vfsp, int fflag);
96static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
97static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
98static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
99static int zfs_sync(vfs_t *vfsp, int waitfor);
100static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
101    struct ucred **credanonp, int *numsecflavors, int **secflavors);
102static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp);
103static void zfs_objset_close(zfsvfs_t *zfsvfs);
104static void zfs_freevfs(vfs_t *vfsp);
105
106static struct vfsops zfs_vfsops = {
107	.vfs_mount =		zfs_mount,
108	.vfs_unmount =		zfs_umount,
109	.vfs_root =		zfs_root,
110	.vfs_statfs =		zfs_statfs,
111	.vfs_vget =		zfs_vget,
112	.vfs_sync =		zfs_sync,
113	.vfs_checkexp =		zfs_checkexp,
114	.vfs_fhtovp =		zfs_fhtovp,
115};
116
117VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
118
119/*
120 * We need to keep a count of active fs's.
121 * This is necessary to prevent our module
122 * from being unloaded after a umount -f
123 */
124static uint32_t	zfs_active_fs_count = 0;
125
126/*ARGSUSED*/
127static int
128zfs_sync(vfs_t *vfsp, int waitfor)
129{
130
131	/*
132	 * Data integrity is job one.  We don't want a compromised kernel
133	 * writing to the storage pool, so we never sync during panic.
134	 */
135	if (panicstr)
136		return (0);
137
138	if (vfsp != NULL) {
139		/*
140		 * Sync a specific filesystem.
141		 */
142		zfsvfs_t *zfsvfs = vfsp->vfs_data;
143		int error;
144
145		error = vfs_stdsync(vfsp, waitfor);
146		if (error != 0)
147			return (error);
148
149		ZFS_ENTER(zfsvfs);
150		if (zfsvfs->z_log != NULL)
151			zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
152		else
153			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
154		ZFS_EXIT(zfsvfs);
155	} else {
156		/*
157		 * Sync all ZFS filesystems.  This is what happens when you
158		 * run sync(1M).  Unlike other filesystems, ZFS honors the
159		 * request by waiting for all pools to commit all dirty data.
160		 */
161		spa_sync_allpools();
162	}
163
164	return (0);
165}
166
167static void
168atime_changed_cb(void *arg, uint64_t newval)
169{
170	zfsvfs_t *zfsvfs = arg;
171
172	if (newval == TRUE) {
173		zfsvfs->z_atime = TRUE;
174		zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
175		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
176		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
177	} else {
178		zfsvfs->z_atime = FALSE;
179		zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
180		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
181		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
182	}
183}
184
185static void
186xattr_changed_cb(void *arg, uint64_t newval)
187{
188	zfsvfs_t *zfsvfs = arg;
189
190	if (newval == TRUE) {
191		/* XXX locking on vfs_flag? */
192#ifdef TODO
193		zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
194#endif
195		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
196		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
197	} else {
198		/* XXX locking on vfs_flag? */
199#ifdef TODO
200		zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
201#endif
202		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
203		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
204	}
205}
206
207static void
208blksz_changed_cb(void *arg, uint64_t newval)
209{
210	zfsvfs_t *zfsvfs = arg;
211
212	if (newval < SPA_MINBLOCKSIZE ||
213	    newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
214		newval = SPA_MAXBLOCKSIZE;
215
216	zfsvfs->z_max_blksz = newval;
217	zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
218}
219
220static void
221readonly_changed_cb(void *arg, uint64_t newval)
222{
223	zfsvfs_t *zfsvfs = arg;
224
225	if (newval) {
226		/* XXX locking on vfs_flag? */
227		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
228		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
229		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
230	} else {
231		/* XXX locking on vfs_flag? */
232		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
233		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
234		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
235	}
236}
237
238static void
239setuid_changed_cb(void *arg, uint64_t newval)
240{
241	zfsvfs_t *zfsvfs = arg;
242
243	if (newval == FALSE) {
244		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
245		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
246		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
247	} else {
248		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
249		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
250		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
251	}
252}
253
254static void
255exec_changed_cb(void *arg, uint64_t newval)
256{
257	zfsvfs_t *zfsvfs = arg;
258
259	if (newval == FALSE) {
260		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
261		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
262		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
263	} else {
264		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
265		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
266		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
267	}
268}
269
270/*
271 * The nbmand mount option can be changed at mount time.
272 * We can't allow it to be toggled on live file systems or incorrect
273 * behavior may be seen from cifs clients
274 *
275 * This property isn't registered via dsl_prop_register(), but this callback
276 * will be called when a file system is first mounted
277 */
278static void
279nbmand_changed_cb(void *arg, uint64_t newval)
280{
281	zfsvfs_t *zfsvfs = arg;
282	if (newval == FALSE) {
283		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
284		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
285	} else {
286		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
287		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
288	}
289}
290
291static void
292snapdir_changed_cb(void *arg, uint64_t newval)
293{
294	zfsvfs_t *zfsvfs = arg;
295
296	zfsvfs->z_show_ctldir = newval;
297}
298
299static void
300vscan_changed_cb(void *arg, uint64_t newval)
301{
302	zfsvfs_t *zfsvfs = arg;
303
304	zfsvfs->z_vscan = newval;
305}
306
307static void
308acl_mode_changed_cb(void *arg, uint64_t newval)
309{
310	zfsvfs_t *zfsvfs = arg;
311
312	zfsvfs->z_acl_mode = newval;
313}
314
315static void
316acl_inherit_changed_cb(void *arg, uint64_t newval)
317{
318	zfsvfs_t *zfsvfs = arg;
319
320	zfsvfs->z_acl_inherit = newval;
321}
322
323static int
324zfs_register_callbacks(vfs_t *vfsp)
325{
326	struct dsl_dataset *ds = NULL;
327	objset_t *os = NULL;
328	zfsvfs_t *zfsvfs = NULL;
329	uint64_t nbmand;
330	int readonly, do_readonly = FALSE;
331	int setuid, do_setuid = FALSE;
332	int exec, do_exec = FALSE;
333	int xattr, do_xattr = FALSE;
334	int atime, do_atime = FALSE;
335	int error = 0;
336
337	ASSERT(vfsp);
338	zfsvfs = vfsp->vfs_data;
339	ASSERT(zfsvfs);
340	os = zfsvfs->z_os;
341
342	/*
343	 * This function can be called for a snapshot when we update snapshot's
344	 * mount point, which isn't really supported.
345	 */
346	if (dmu_objset_is_snapshot(os))
347		return (EOPNOTSUPP);
348
349	/*
350	 * The act of registering our callbacks will destroy any mount
351	 * options we may have.  In order to enable temporary overrides
352	 * of mount options, we stash away the current values and
353	 * restore them after we register the callbacks.
354	 */
355	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
356		readonly = B_TRUE;
357		do_readonly = B_TRUE;
358	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
359		readonly = B_FALSE;
360		do_readonly = B_TRUE;
361	}
362	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
363		setuid = B_FALSE;
364		do_setuid = B_TRUE;
365	} else {
366		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
367			setuid = B_FALSE;
368			do_setuid = B_TRUE;
369		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
370			setuid = B_TRUE;
371			do_setuid = B_TRUE;
372		}
373	}
374	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
375		exec = B_FALSE;
376		do_exec = B_TRUE;
377	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
378		exec = B_TRUE;
379		do_exec = B_TRUE;
380	}
381	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
382		xattr = B_FALSE;
383		do_xattr = B_TRUE;
384	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
385		xattr = B_TRUE;
386		do_xattr = B_TRUE;
387	}
388	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
389		atime = B_FALSE;
390		do_atime = B_TRUE;
391	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
392		atime = B_TRUE;
393		do_atime = B_TRUE;
394	}
395
396	/*
397	 * nbmand is a special property.  It can only be changed at
398	 * mount time.
399	 *
400	 * This is weird, but it is documented to only be changeable
401	 * at mount time.
402	 */
403	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
404		nbmand = B_FALSE;
405	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
406		nbmand = B_TRUE;
407	} else {
408		char osname[MAXNAMELEN];
409
410		dmu_objset_name(os, osname);
411		if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
412		    NULL)) {
413			return (error);
414		}
415	}
416
417	/*
418	 * Register property callbacks.
419	 *
420	 * It would probably be fine to just check for i/o error from
421	 * the first prop_register(), but I guess I like to go
422	 * overboard...
423	 */
424	ds = dmu_objset_ds(os);
425	error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
426	error = error ? error : dsl_prop_register(ds,
427	    "xattr", xattr_changed_cb, zfsvfs);
428	error = error ? error : dsl_prop_register(ds,
429	    "recordsize", blksz_changed_cb, zfsvfs);
430	error = error ? error : dsl_prop_register(ds,
431	    "readonly", readonly_changed_cb, zfsvfs);
432	error = error ? error : dsl_prop_register(ds,
433	    "setuid", setuid_changed_cb, zfsvfs);
434	error = error ? error : dsl_prop_register(ds,
435	    "exec", exec_changed_cb, zfsvfs);
436	error = error ? error : dsl_prop_register(ds,
437	    "snapdir", snapdir_changed_cb, zfsvfs);
438	error = error ? error : dsl_prop_register(ds,
439	    "aclmode", acl_mode_changed_cb, zfsvfs);
440	error = error ? error : dsl_prop_register(ds,
441	    "aclinherit", acl_inherit_changed_cb, zfsvfs);
442	error = error ? error : dsl_prop_register(ds,
443	    "vscan", vscan_changed_cb, zfsvfs);
444	if (error)
445		goto unregister;
446
447	/*
448	 * Invoke our callbacks to restore temporary mount options.
449	 */
450	if (do_readonly)
451		readonly_changed_cb(zfsvfs, readonly);
452	if (do_setuid)
453		setuid_changed_cb(zfsvfs, setuid);
454	if (do_exec)
455		exec_changed_cb(zfsvfs, exec);
456	if (do_xattr)
457		xattr_changed_cb(zfsvfs, xattr);
458	if (do_atime)
459		atime_changed_cb(zfsvfs, atime);
460
461	nbmand_changed_cb(zfsvfs, nbmand);
462
463	return (0);
464
465unregister:
466	/*
467	 * We may attempt to unregister some callbacks that are not
468	 * registered, but this is OK; it will simply return ENOMSG,
469	 * which we will ignore.
470	 */
471	(void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
472	(void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
473	(void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
474	(void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
475	(void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
476	(void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
477	(void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
478	(void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
479	(void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
480	    zfsvfs);
481	(void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
482	return (error);
483
484}
485
486static int
487zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
488{
489	int error;
490
491	error = zfs_register_callbacks(zfsvfs->z_vfs);
492	if (error)
493		return (error);
494
495	/*
496	 * Set the objset user_ptr to track its zfsvfs.
497	 */
498	mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
499	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
500	mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
501
502	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
503	if (zil_disable) {
504		zil_destroy(zfsvfs->z_log, B_FALSE);
505		zfsvfs->z_log = NULL;
506	}
507
508	/*
509	 * If we are not mounting (ie: online recv), then we don't
510	 * have to worry about replaying the log as we blocked all
511	 * operations out since we closed the ZIL.
512	 */
513	if (mounting) {
514		boolean_t readonly;
515
516		/*
517		 * During replay we remove the read only flag to
518		 * allow replays to succeed.
519		 */
520		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
521		if (readonly != 0)
522			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
523		else
524			zfs_unlinked_drain(zfsvfs);
525
526		if (zfsvfs->z_log) {
527			/*
528			 * Parse and replay the intent log.
529			 *
530			 * Because of ziltest, this must be done after
531			 * zfs_unlinked_drain().  (Further note: ziltest
532			 * doesn't use readonly mounts, where
533			 * zfs_unlinked_drain() isn't called.)  This is because
534			 * ziltest causes spa_sync() to think it's committed,
535			 * but actually it is not, so the intent log contains
536			 * many txg's worth of changes.
537			 *
538			 * In particular, if object N is in the unlinked set in
539			 * the last txg to actually sync, then it could be
540			 * actually freed in a later txg and then reallocated
541			 * in a yet later txg.  This would write a "create
542			 * object N" record to the intent log.  Normally, this
543			 * would be fine because the spa_sync() would have
544			 * written out the fact that object N is free, before
545			 * we could write the "create object N" intent log
546			 * record.
547			 *
548			 * But when we are in ziltest mode, we advance the "open
549			 * txg" without actually spa_sync()-ing the changes to
550			 * disk.  So we would see that object N is still
551			 * allocated and in the unlinked set, and there is an
552			 * intent log record saying to allocate it.
553			 */
554			zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
555			    zfs_replay_vector, zfs_unlinked_drain);
556		}
557		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
558	}
559
560	return (0);
561}
562
563static void
564zfs_freezfsvfs(zfsvfs_t *zfsvfs)
565{
566	mutex_destroy(&zfsvfs->z_znodes_lock);
567	mutex_destroy(&zfsvfs->z_online_recv_lock);
568	list_destroy(&zfsvfs->z_all_znodes);
569	rrw_destroy(&zfsvfs->z_teardown_lock);
570	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
571	rw_destroy(&zfsvfs->z_fuid_lock);
572	kmem_free(zfsvfs, sizeof (zfsvfs_t));
573}
574
575static int
576zfs_domount(vfs_t *vfsp, char *osname)
577{
578	uint64_t recordsize, readonly;
579	int error = 0;
580	int mode;
581	zfsvfs_t *zfsvfs;
582	znode_t *zp = NULL;
583
584	ASSERT(vfsp);
585	ASSERT(osname);
586
587	/*
588	 * Initialize the zfs-specific filesystem structure.
589	 * Should probably make this a kmem cache, shuffle fields,
590	 * and just bzero up to z_hold_mtx[].
591	 */
592	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
593	zfsvfs->z_vfs = vfsp;
594	zfsvfs->z_parent = zfsvfs;
595	zfsvfs->z_assign = TXG_NOWAIT;
596	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
597	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
598
599	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
600	mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL);
601	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
602	    offsetof(znode_t, z_link_node));
603	rrw_init(&zfsvfs->z_teardown_lock);
604	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
605	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
606
607	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
608	    NULL))
609		goto out;
610	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
611	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
612
613	vfsp->vfs_data = zfsvfs;
614	vfsp->mnt_flag |= MNT_LOCAL;
615	vfsp->mnt_kern_flag |= MNTK_MPSAFE;
616	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
617	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
618
619	if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
620		goto out;
621
622	mode = DS_MODE_OWNER;
623	if (readonly)
624		mode |= DS_MODE_READONLY;
625
626	error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
627	if (error == EROFS) {
628		mode = DS_MODE_OWNER | DS_MODE_READONLY;
629		error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
630		    &zfsvfs->z_os);
631	}
632
633	if (error)
634		goto out;
635
636	if (error = zfs_init_fs(zfsvfs, &zp))
637		goto out;
638
639	/*
640	 * Set features for file system.
641	 */
642	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
643	if (zfsvfs->z_use_fuids) {
644		vfs_set_feature(vfsp, VFSFT_XVATTR);
645		vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS);
646		vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS);
647		vfs_set_feature(vfsp, VFSFT_ACLONCREATE);
648	}
649	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
650		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
651		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
652		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
653	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
654		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
655		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
656	}
657
658	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
659		uint64_t pval;
660
661		ASSERT(mode & DS_MODE_READONLY);
662		atime_changed_cb(zfsvfs, B_FALSE);
663		readonly_changed_cb(zfsvfs, B_TRUE);
664		if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
665			goto out;
666		xattr_changed_cb(zfsvfs, pval);
667		zfsvfs->z_issnap = B_TRUE;
668	} else {
669		error = zfsvfs_setup(zfsvfs, B_TRUE);
670	}
671
672	vfs_mountedfrom(vfsp, osname);
673
674	if (!zfsvfs->z_issnap)
675		zfsctl_create(zfsvfs);
676out:
677	if (error) {
678		if (zfsvfs->z_os)
679			dmu_objset_close(zfsvfs->z_os);
680		zfs_freezfsvfs(zfsvfs);
681	} else {
682		atomic_add_32(&zfs_active_fs_count, 1);
683	}
684
685	return (error);
686}
687
688void
689zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
690{
691	objset_t *os = zfsvfs->z_os;
692	struct dsl_dataset *ds;
693
694	/*
695	 * Unregister properties.
696	 */
697	if (!dmu_objset_is_snapshot(os)) {
698		ds = dmu_objset_ds(os);
699		VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
700		    zfsvfs) == 0);
701
702		VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
703		    zfsvfs) == 0);
704
705		VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
706		    zfsvfs) == 0);
707
708		VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
709		    zfsvfs) == 0);
710
711		VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
712		    zfsvfs) == 0);
713
714		VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
715		    zfsvfs) == 0);
716
717		VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
718		    zfsvfs) == 0);
719
720		VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
721		    zfsvfs) == 0);
722
723		VERIFY(dsl_prop_unregister(ds, "aclinherit",
724		    acl_inherit_changed_cb, zfsvfs) == 0);
725
726		VERIFY(dsl_prop_unregister(ds, "vscan",
727		    vscan_changed_cb, zfsvfs) == 0);
728	}
729}
730
731/*ARGSUSED*/
732static int
733zfs_mount(vfs_t *vfsp)
734{
735	kthread_t	*td = curthread;
736	vnode_t		*mvp = vfsp->mnt_vnodecovered;
737	cred_t		*cr = td->td_ucred;
738	char		*osname;
739	int		error = 0;
740	int		canwrite;
741
742	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
743		return (EINVAL);
744
745	/*
746	 * If full-owner-access is enabled and delegated administration is
747	 * turned on, we must set nosuid.
748	 */
749	if (zfs_super_owner &&
750	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
751		secpolicy_fs_mount_clearopts(cr, vfsp);
752	}
753
754	/*
755	 * Check for mount privilege?
756	 *
757	 * If we don't have privilege then see if
758	 * we have local permission to allow it
759	 */
760	error = secpolicy_fs_mount(cr, mvp, vfsp);
761	if (error) {
762		error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr);
763		if (error != 0)
764			goto out;
765
766		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
767			vattr_t		vattr;
768
769			/*
770			 * Make sure user is the owner of the mount point
771			 * or has sufficient privileges.
772			 */
773
774			vattr.va_mask = AT_UID;
775
776			vn_lock(mvp, LK_SHARED | LK_RETRY);
777			if (error = VOP_GETATTR(mvp, &vattr, cr)) {
778				VOP_UNLOCK(mvp, 0);
779				goto out;
780			}
781
782#if 0 /* CHECK THIS! Is probably needed for zfs_suser. */
783			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
784			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
785				error = EPERM;
786				goto out;
787			}
788#else
789			if (error = secpolicy_vnode_owner(mvp, cr, vattr.va_uid)) {
790				VOP_UNLOCK(mvp, 0);
791				goto out;
792			}
793
794			if (error = VOP_ACCESS(mvp, VWRITE, cr, td)) {
795				VOP_UNLOCK(mvp, 0);
796				goto out;
797			}
798			VOP_UNLOCK(mvp, 0);
799#endif
800		}
801
802		secpolicy_fs_mount_clearopts(cr, vfsp);
803	}
804
805	/*
806	 * Refuse to mount a filesystem if we are in a local zone and the
807	 * dataset is not visible.
808	 */
809	if (!INGLOBALZONE(curthread) &&
810	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
811		error = EPERM;
812		goto out;
813	}
814
815	/*
816	 * When doing a remount, we simply refresh our temporary properties
817	 * according to those options set in the current VFS options.
818	 */
819	if (vfsp->vfs_flag & MS_REMOUNT) {
820		/* refresh mount options */
821		zfs_unregister_callbacks(vfsp->vfs_data);
822		error = zfs_register_callbacks(vfsp);
823		goto out;
824	}
825
826	DROP_GIANT();
827	error = zfs_domount(vfsp, osname);
828	PICKUP_GIANT();
829out:
830	return (error);
831}
832
833static int
834zfs_statfs(vfs_t *vfsp, struct statfs *statp)
835{
836	zfsvfs_t *zfsvfs = vfsp->vfs_data;
837	uint64_t refdbytes, availbytes, usedobjs, availobjs;
838
839	statp->f_version = STATFS_VERSION;
840
841	ZFS_ENTER(zfsvfs);
842
843	dmu_objset_space(zfsvfs->z_os,
844	    &refdbytes, &availbytes, &usedobjs, &availobjs);
845
846	/*
847	 * The underlying storage pool actually uses multiple block sizes.
848	 * We report the fragsize as the smallest block size we support,
849	 * and we report our blocksize as the filesystem's maximum blocksize.
850	 */
851	statp->f_bsize = SPA_MINBLOCKSIZE;
852	statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
853
854	/*
855	 * The following report "total" blocks of various kinds in the
856	 * file system, but reported in terms of f_frsize - the
857	 * "fragment" size.
858	 */
859
860	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
861	statp->f_bfree = availbytes / statp->f_bsize;
862	statp->f_bavail = statp->f_bfree; /* no root reservation */
863
864	/*
865	 * statvfs() should really be called statufs(), because it assumes
866	 * static metadata.  ZFS doesn't preallocate files, so the best
867	 * we can do is report the max that could possibly fit in f_files,
868	 * and that minus the number actually used in f_ffree.
869	 * For f_ffree, report the smaller of the number of object available
870	 * and the number of blocks (each object will take at least a block).
871	 */
872	statp->f_ffree = MIN(availobjs, statp->f_bfree);
873	statp->f_files = statp->f_ffree + usedobjs;
874
875	/*
876	 * We're a zfs filesystem.
877	 */
878	(void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));
879
880	strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
881	    sizeof(statp->f_mntfromname));
882	strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
883	    sizeof(statp->f_mntonname));
884
885	statp->f_namemax = ZFS_MAXNAMELEN;
886
887	ZFS_EXIT(zfsvfs);
888	return (0);
889}
890
891static int
892zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
893{
894	zfsvfs_t *zfsvfs = vfsp->vfs_data;
895	znode_t *rootzp;
896	int error;
897
898	ZFS_ENTER_NOERROR(zfsvfs);
899
900	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
901
902	ZFS_EXIT(zfsvfs);
903
904	if (error == 0) {
905		*vpp = ZTOV(rootzp);
906		error = vn_lock(*vpp, flags);
907		(*vpp)->v_vflag |= VV_ROOT;
908	}
909
910	return (error);
911}
912
913/*
914 * Teardown the zfsvfs::z_os.
915 *
916 * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock'
917 * and 'z_teardown_inactive_lock' held.
918 */
919static int
920zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
921{
922	znode_t	*zp;
923
924	rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
925
926	if (!unmounting) {
927		/*
928		 * We purge the parent filesystem's vfsp as the parent
929		 * filesystem and all of its snapshots have their vnode's
930		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
931		 * 'z_parent' is self referential for non-snapshots.
932		 */
933		(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
934#ifdef FREEBSD_NAMECACHE
935		cache_purgevfs(zfsvfs->z_parent->z_vfs);
936#endif
937	}
938
939	/*
940	 * Close the zil. NB: Can't close the zil while zfs_inactive
941	 * threads are blocked as zil_close can call zfs_inactive.
942	 */
943	if (zfsvfs->z_log) {
944		zil_close(zfsvfs->z_log);
945		zfsvfs->z_log = NULL;
946	}
947
948	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
949
950	/*
951	 * If we are not unmounting (ie: online recv) and someone already
952	 * unmounted this file system while we were doing the switcheroo,
953	 * or a reopen of z_os failed then just bail out now.
954	 */
955	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
956		rw_exit(&zfsvfs->z_teardown_inactive_lock);
957		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
958		return (EIO);
959	}
960
961	/*
962	 * At this point there are no vops active, and any new vops will
963	 * fail with EIO since we have z_teardown_lock for writer (only
964	 * relavent for forced unmount).
965	 *
966	 * Release all holds on dbufs.
967	 */
968	mutex_enter(&zfsvfs->z_znodes_lock);
969	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
970	    zp = list_next(&zfsvfs->z_all_znodes, zp))
971		if (zp->z_dbuf) {
972			ASSERT(ZTOV(zp)->v_count >= 0);
973			zfs_znode_dmu_fini(zp);
974		}
975	mutex_exit(&zfsvfs->z_znodes_lock);
976
977	/*
978	 * If we are unmounting, set the unmounted flag and let new vops
979	 * unblock.  zfs_inactive will have the unmounted behavior, and all
980	 * other vops will fail with EIO.
981	 */
982	if (unmounting) {
983		zfsvfs->z_unmounted = B_TRUE;
984		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
985		rw_exit(&zfsvfs->z_teardown_inactive_lock);
986
987#ifdef __FreeBSD__
988		/*
989		 * Some znodes might not be fully reclaimed, wait for them.
990		 */
991		mutex_enter(&zfsvfs->z_znodes_lock);
992		while (list_head(&zfsvfs->z_all_znodes) != NULL) {
993			msleep(zfsvfs, &zfsvfs->z_znodes_lock, 0,
994			    "zteardown", 0);
995		}
996		mutex_exit(&zfsvfs->z_znodes_lock);
997#endif
998	}
999
1000	/*
1001	 * z_os will be NULL if there was an error in attempting to reopen
1002	 * zfsvfs, so just return as the properties had already been
1003	 * unregistered and cached data had been evicted before.
1004	 */
1005	if (zfsvfs->z_os == NULL)
1006		return (0);
1007
1008	/*
1009	 * Unregister properties.
1010	 */
1011	zfs_unregister_callbacks(zfsvfs);
1012
1013	/*
1014	 * Evict cached data
1015	 */
1016	if (dmu_objset_evict_dbufs(zfsvfs->z_os)) {
1017		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
1018		(void) dmu_objset_evict_dbufs(zfsvfs->z_os);
1019	}
1020
1021	return (0);
1022}
1023
1024/*ARGSUSED*/
1025static int
1026zfs_umount(vfs_t *vfsp, int fflag)
1027{
1028	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1029	objset_t *os;
1030	cred_t *cr = curthread->td_ucred;
1031	int ret;
1032
1033	ret = secpolicy_fs_unmount(cr, vfsp);
1034	if (ret) {
1035		ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
1036		    ZFS_DELEG_PERM_MOUNT, cr);
1037		if (ret)
1038			return (ret);
1039	}
1040	/*
1041	 * We purge the parent filesystem's vfsp as the parent filesystem
1042	 * and all of its snapshots have their vnode's v_vfsp set to the
1043	 * parent's filesystem's vfsp.  Note, 'z_parent' is self
1044	 * referential for non-snapshots.
1045	 */
1046	(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1047
1048	/*
1049	 * Unmount any snapshots mounted under .zfs before unmounting the
1050	 * dataset itself.
1051	 */
1052	if (zfsvfs->z_ctldir != NULL) {
1053		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
1054			return (ret);
1055		ret = vflush(vfsp, 0, 0, curthread);
1056		ASSERT(ret == EBUSY);
1057		if (!(fflag & MS_FORCE)) {
1058			if (zfsvfs->z_ctldir->v_count > 1)
1059				return (EBUSY);
1060			ASSERT(zfsvfs->z_ctldir->v_count == 1);
1061		}
1062		zfsctl_destroy(zfsvfs);
1063		ASSERT(zfsvfs->z_ctldir == NULL);
1064	}
1065
1066	if (fflag & MS_FORCE) {
1067		/*
1068		 * Mark file system as unmounted before calling
1069		 * vflush(FORCECLOSE). This way we ensure no future vnops
1070		 * will be called and risk operating on DOOMED vnodes.
1071		 */
1072		rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
1073		zfsvfs->z_unmounted = B_TRUE;
1074		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1075	}
1076
1077	/*
1078	 * Flush all the files.
1079	 */
1080	ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, curthread);
1081	if (ret != 0) {
1082		if (!zfsvfs->z_issnap) {
1083			zfsctl_create(zfsvfs);
1084			ASSERT(zfsvfs->z_ctldir != NULL);
1085		}
1086		return (ret);
1087	}
1088
1089	if (!(fflag & MS_FORCE)) {
1090		/*
1091		 * Check the number of active vnodes in the file system.
1092		 * Our count is maintained in the vfs structure, but the
1093		 * number is off by 1 to indicate a hold on the vfs
1094		 * structure itself.
1095		 *
1096		 * The '.zfs' directory maintains a reference of its
1097		 * own, and any active references underneath are
1098		 * reflected in the vnode count.
1099		 */
1100		if (zfsvfs->z_ctldir == NULL) {
1101			if (vfsp->vfs_count > 1)
1102				return (EBUSY);
1103		} else {
1104			if (vfsp->vfs_count > 2 ||
1105			    zfsvfs->z_ctldir->v_count > 1)
1106				return (EBUSY);
1107		}
1108	} else {
1109		MNT_ILOCK(vfsp);
1110		vfsp->mnt_kern_flag |= MNTK_UNMOUNTF;
1111		MNT_IUNLOCK(vfsp);
1112	}
1113
1114	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
1115	os = zfsvfs->z_os;
1116
1117	/*
1118	 * z_os will be NULL if there was an error in
1119	 * attempting to reopen zfsvfs.
1120	 */
1121	if (os != NULL) {
1122		/*
1123		 * Unset the objset user_ptr.
1124		 */
1125		mutex_enter(&os->os->os_user_ptr_lock);
1126		dmu_objset_set_user(os, NULL);
1127		mutex_exit(&os->os->os_user_ptr_lock);
1128
1129		/*
1130		 * Finally release the objset
1131		 */
1132		dmu_objset_close(os);
1133	}
1134
1135	/*
1136	 * We can now safely destroy the '.zfs' directory node.
1137	 */
1138	if (zfsvfs->z_ctldir != NULL)
1139		zfsctl_destroy(zfsvfs);
1140	if (zfsvfs->z_issnap) {
1141		vnode_t *svp = vfsp->mnt_vnodecovered;
1142
1143		if (svp->v_count >= 2)
1144			VN_RELE(svp);
1145	}
1146	zfs_freevfs(vfsp);
1147
1148	return (0);
1149}
1150
1151static int
1152zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
1153{
1154	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
1155	znode_t		*zp;
1156	int 		err;
1157
1158	/*
1159	 * XXXPJD: zfs_zget() can't operate on virtual entires like .zfs/ or
1160	 * .zfs/snapshot/ directories, so for now just return EOPNOTSUPP.
1161	 * This will make NFS to fall back to using READDIR instead of
1162	 * READDIRPLUS.
1163	 * Also snapshots are stored in AVL tree, but based on their names,
1164	 * not inode numbers, so it will be very inefficient to iterate
1165	 * over all snapshots to find the right one.
1166	 * Note that OpenSolaris READDIRPLUS implementation does LOOKUP on
1167	 * d_name, and not VGET on d_fileno as we do.
1168	 */
1169	if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR)
1170		return (EOPNOTSUPP);
1171
1172	ZFS_ENTER(zfsvfs);
1173	err = zfs_zget(zfsvfs, ino, &zp);
1174	if (err == 0 && zp->z_unlinked) {
1175		VN_RELE(ZTOV(zp));
1176		err = EINVAL;
1177	}
1178	ZFS_EXIT(zfsvfs);
1179	if (err != 0)
1180		*vpp = NULL;
1181	else {
1182		*vpp = ZTOV(zp);
1183		vn_lock(*vpp, flags);
1184	}
1185	return (err);
1186}
1187
1188static int
1189zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
1190    struct ucred **credanonp, int *numsecflavors, int **secflavors)
1191{
1192	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1193
1194	/*
1195	 * If this is regular file system vfsp is the same as
1196	 * zfsvfs->z_parent->z_vfs, but if it is snapshot,
1197	 * zfsvfs->z_parent->z_vfs represents parent file system
1198	 * which we have to use here, because only this file system
1199	 * has mnt_export configured.
1200	 */
1201	return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
1202	    credanonp, numsecflavors, secflavors));
1203}
1204
1205CTASSERT(SHORT_FID_LEN <= sizeof(struct fid));
1206CTASSERT(LONG_FID_LEN <= sizeof(struct fid));
1207
1208static int
1209zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
1210{
1211	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
1212	znode_t		*zp;
1213	uint64_t	object = 0;
1214	uint64_t	fid_gen = 0;
1215	uint64_t	gen_mask;
1216	uint64_t	zp_gen;
1217	int		i, err;
1218
1219	*vpp = NULL;
1220
1221	ZFS_ENTER(zfsvfs);
1222
1223	/*
1224	 * On FreeBSD we can get snapshot's mount point or its parent file
1225	 * system mount point depending if snapshot is already mounted or not.
1226	 */
1227	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
1228		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
1229		uint64_t	objsetid = 0;
1230		uint64_t	setgen = 0;
1231
1232		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
1233			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
1234
1235		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
1236			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
1237
1238		ZFS_EXIT(zfsvfs);
1239
1240		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
1241		if (err)
1242			return (EINVAL);
1243		ZFS_ENTER(zfsvfs);
1244	}
1245
1246	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
1247		zfid_short_t	*zfid = (zfid_short_t *)fidp;
1248
1249		for (i = 0; i < sizeof (zfid->zf_object); i++)
1250			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
1251
1252		for (i = 0; i < sizeof (zfid->zf_gen); i++)
1253			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
1254	} else {
1255		ZFS_EXIT(zfsvfs);
1256		return (EINVAL);
1257	}
1258
1259	/* A zero fid_gen means we are in the .zfs control directories */
1260	if (fid_gen == 0 &&
1261	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
1262		*vpp = zfsvfs->z_ctldir;
1263		ASSERT(*vpp != NULL);
1264		if (object == ZFSCTL_INO_SNAPDIR) {
1265			VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
1266			    0, NULL, NULL, NULL, NULL, NULL) == 0);
1267		} else {
1268			VN_HOLD(*vpp);
1269		}
1270		ZFS_EXIT(zfsvfs);
1271		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
1272		return (0);
1273	}
1274
1275	gen_mask = -1ULL >> (64 - 8 * i);
1276
1277	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
1278	if (err = zfs_zget(zfsvfs, object, &zp)) {
1279		ZFS_EXIT(zfsvfs);
1280		return (err);
1281	}
1282	zp_gen = zp->z_phys->zp_gen & gen_mask;
1283	if (zp_gen == 0)
1284		zp_gen = 1;
1285	if (zp->z_unlinked || zp_gen != fid_gen) {
1286		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
1287		VN_RELE(ZTOV(zp));
1288		ZFS_EXIT(zfsvfs);
1289		return (EINVAL);
1290	}
1291
1292	ZFS_EXIT(zfsvfs);
1293
1294	*vpp = ZTOV(zp);
1295	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
1296	vnode_create_vobject(*vpp, zp->z_phys->zp_size, curthread);
1297	return (0);
1298}
1299
1300/*
1301 * Block out VOPs and close zfsvfs_t::z_os
1302 *
1303 * Note, if successful, then we return with the 'z_teardown_lock' and
1304 * 'z_teardown_inactive_lock' write held.
1305 */
1306int
1307zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode)
1308{
1309	int error;
1310
1311	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
1312		return (error);
1313
1314	*mode = zfsvfs->z_os->os_mode;
1315	dmu_objset_name(zfsvfs->z_os, name);
1316	dmu_objset_close(zfsvfs->z_os);
1317
1318	return (0);
1319}
1320
1321/*
1322 * Reopen zfsvfs_t::z_os and release VOPs.
1323 */
1324int
1325zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode)
1326{
1327	int err;
1328
1329	ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
1330	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
1331
1332	err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
1333	if (err) {
1334		zfsvfs->z_os = NULL;
1335	} else {
1336		znode_t *zp;
1337
1338		VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
1339
1340		/*
1341		 * Attempt to re-establish all the active znodes with
1342		 * their dbufs.  If a zfs_rezget() fails, then we'll let
1343		 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
1344		 * when they try to use their znode.
1345		 */
1346		mutex_enter(&zfsvfs->z_znodes_lock);
1347		for (zp = list_head(&zfsvfs->z_all_znodes); zp;
1348		    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
1349			(void) zfs_rezget(zp);
1350		}
1351		mutex_exit(&zfsvfs->z_znodes_lock);
1352
1353	}
1354
1355	/* release the VOPs */
1356	rw_exit(&zfsvfs->z_teardown_inactive_lock);
1357	rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1358
1359	if (err) {
1360		/*
1361		 * Since we couldn't reopen zfsvfs::z_os, force
1362		 * unmount this file system.
1363		 */
1364		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
1365			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
1366	}
1367	return (err);
1368}
1369
1370static void
1371zfs_freevfs(vfs_t *vfsp)
1372{
1373	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1374	int i;
1375
1376	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1377		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1378
1379	zfs_fuid_destroy(zfsvfs);
1380	zfs_freezfsvfs(zfsvfs);
1381
1382	atomic_add_32(&zfs_active_fs_count, -1);
1383}
1384
#ifdef __i386__
/* Value of desiredvnodes saved by zfs_vnodes_adjust(). */
static int desiredvnodes_backup;
#endif

/*
 * On i386, save the current desiredvnodes and, if it still holds the
 * value the system would compute by default, lower it to three quarters
 * of that value.  Does nothing on other architectures.
 */
static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
	int defaultvnodes;

	desiredvnodes_backup = desiredvnodes;

	/*
	 * Recompute the default the same way vntblinit() does.  When it
	 * matches desiredvnodes, the administrator has not tuned the
	 * value and it is safe for us to tune it down.
	 */
	defaultvnodes = min(maxproc + cnt.v_page_count / 4, 2 *
	    vm_kmem_size / (5 * (sizeof(struct vm_object) +
	    sizeof(struct vnode))));
	if (desiredvnodes == defaultvnodes)
		desiredvnodes = (3 * defaultvnodes) / 4;
#endif
}
1409
/*
 * On i386, put desiredvnodes back to the value that was saved in
 * desiredvnodes_backup.  Does nothing on other architectures.
 */
static void
zfs_vnodes_adjust_back(void)
{
#ifdef __i386__
	desiredvnodes = desiredvnodes_backup;
#endif
}
1418
1419void
1420zfs_init(void)
1421{
1422
1423	printf("ZFS filesystem version " ZPL_VERSION_STRING "\n");
1424
1425	/*
1426	 * Initialize znode cache, vnode ops, etc...
1427	 */
1428	zfs_znode_init();
1429
1430	/*
1431	 * Initialize .zfs directory structures
1432	 */
1433	zfsctl_init();
1434
1435	/*
1436	 * Reduce number of vnode. Originally number of vnodes is calculated
1437	 * with UFS inode in mind. We reduce it here, because it's too big for
1438	 * ZFS/i386.
1439	 */
1440	zfs_vnodes_adjust();
1441}
1442
/*
 * Undo zfs_init(): shut down the .zfs control directory support and the
 * znode layer, then restore the vnode limit.
 */
void
zfs_fini(void)
{
	zfsctl_fini();
	zfs_znode_fini();

	/* Give the system its original desiredvnodes back. */
	zfs_vnodes_adjust_back();
}
1450
1451int
1452zfs_busy(void)
1453{
1454	return (zfs_active_fs_count != 0);
1455}
1456
1457int
1458zfs_set_version(const char *name, uint64_t newvers)
1459{
1460	int error;
1461	objset_t *os;
1462	dmu_tx_t *tx;
1463	uint64_t curvers;
1464
1465	/*
1466	 * XXX for now, require that the filesystem be unmounted.  Would
1467	 * be nice to find the zfsvfs_t and just update that if
1468	 * possible.
1469	 */
1470
1471	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
1472		return (EINVAL);
1473
1474	error = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_OWNER, &os);
1475	if (error)
1476		return (error);
1477
1478	error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
1479	    8, 1, &curvers);
1480	if (error)
1481		goto out;
1482	if (newvers < curvers) {
1483		error = EINVAL;
1484		goto out;
1485	}
1486
1487	tx = dmu_tx_create(os);
1488	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 0, ZPL_VERSION_STR);
1489	error = dmu_tx_assign(tx, TXG_WAIT);
1490	if (error) {
1491		dmu_tx_abort(tx);
1492		goto out;
1493	}
1494	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1,
1495	    &newvers, tx);
1496
1497	spa_history_internal_log(LOG_DS_UPGRADE,
1498	    dmu_objset_spa(os), tx, CRED(),
1499	    "oldver=%llu newver=%llu dataset = %llu", curvers, newvers,
1500	    dmu_objset_id(os));
1501	dmu_tx_commit(tx);
1502
1503out:
1504	dmu_objset_close(os);
1505	return (error);
1506}
1507/*
1508 * Read a property stored within the master node.
1509 */
1510int
1511zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
1512{
1513	const char *pname;
1514	int error = ENOENT;
1515
1516	/*
1517	 * Look up the file system's value for the property.  For the
1518	 * version property, we look up a slightly different string.
1519	 */
1520	if (prop == ZFS_PROP_VERSION)
1521		pname = ZPL_VERSION_STR;
1522	else
1523		pname = zfs_prop_to_name(prop);
1524
1525	if (os != NULL)
1526		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
1527
1528	if (error == ENOENT) {
1529		/* No value set, use the default value */
1530		switch (prop) {
1531		case ZFS_PROP_VERSION:
1532			*value = ZPL_VERSION;
1533			break;
1534		case ZFS_PROP_NORMALIZE:
1535		case ZFS_PROP_UTF8ONLY:
1536			*value = 0;
1537			break;
1538		case ZFS_PROP_CASE:
1539			*value = ZFS_CASE_SENSITIVE;
1540			break;
1541		default:
1542			return (error);
1543		}
1544		error = 0;
1545	}
1546	return (error);
1547}
1548