/* zfs_vfsops.c, revision 196944 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/acl.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/mntent.h>
#include <sys/mount.h>
#include <sys/cmn_err.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_dir.h>
#include <sys/zil.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_deleg.h>
#include <sys/spa.h>
#include <sys/zap.h>
#include <sys/varargs.h>
#include <sys/policy.h>
#include <sys/atomic.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/sunddi.h>
#include <sys/dnlc.h>
#include <sys/dmu_objset.h>
#include <sys/spa_boot.h>
#include <sys/vdev_impl.h>	/* VDEV_BOOT_VERSION */

struct mtx zfs_debug_mtx;
MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);

SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");

int zfs_super_owner = 0;
SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
    "File system owner can perform privileged operation on his file systems");

int zfs_debug_level = 0;
TUNABLE_INT("vfs.zfs.debug", &zfs_debug_level);
SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0,
    "Debug level");

SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
static int zfs_version_acl = ZFS_ACL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
    "ZFS_ACL_VERSION");
static int zfs_version_dmu_backup_header = DMU_BACKUP_HEADER_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_header, CTLFLAG_RD,
    &zfs_version_dmu_backup_header, 0, "DMU_BACKUP_HEADER_VERSION");
static int zfs_version_dmu_backup_stream = DMU_BACKUP_STREAM_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, dmu_backup_stream, CTLFLAG_RD,
    &zfs_version_dmu_backup_stream, 0, "DMU_BACKUP_STREAM_VERSION");
static int zfs_version_spa = SPA_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
    "SPA_VERSION");
static int zfs_version_vdev_boot = VDEV_BOOT_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, vdev_boot, CTLFLAG_RD,
    &zfs_version_vdev_boot, 0, "VDEV_BOOT_VERSION");
static int zfs_version_zpl = ZPL_VERSION;
SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
    "ZPL_VERSION");

static int zfs_mount(vfs_t *vfsp);
static int zfs_umount(vfs_t *vfsp, int fflag);
static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
static int zfs_sync(vfs_t *vfsp, int waitfor);
static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp);
static void zfs_objset_close(zfsvfs_t *zfsvfs);
static void zfs_freevfs(vfs_t *vfsp);

static struct vfsops zfs_vfsops = {
	.vfs_mount =		zfs_mount,
	.vfs_unmount =		zfs_umount,
	.vfs_root =		zfs_root,
	.vfs_statfs =		zfs_statfs,
	.vfs_vget =		zfs_vget,
	.vfs_sync =		zfs_sync,
	.vfs_fhtovp =		zfs_fhtovp,
};

VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);

/*
 * We need to keep a count of active fs's.
 * This is necessary to prevent our module
 * from being unloaded after a umount -f
 */
static uint32_t	zfs_active_fs_count = 0;

/*ARGSUSED*/
static int
zfs_sync(vfs_t *vfsp, int waitfor)
{

	/*
	 * Data integrity is job one.  We don't want a compromised kernel
	 * writing to the storage pool, so we never sync during panic.
	 */
	if (panicstr)
		return (0);

	if (vfsp != NULL) {
		/*
		 * Sync a specific filesystem.
		 */
		zfsvfs_t *zfsvfs = vfsp->vfs_data;
		int error;

		error = vfs_stdsync(vfsp, waitfor);
		if (error != 0)
			return (error);

		ZFS_ENTER(zfsvfs);
		if (zfsvfs->z_log != NULL)
			zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
		else
			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
		ZFS_EXIT(zfsvfs);
	} else {
		/*
		 * Sync all ZFS filesystems.  This is what happens when you
		 * run sync(1M).  Unlike other filesystems, ZFS honors the
		 * request by waiting for all pools to commit all dirty data.
		 */
		spa_sync_allpools();
	}

	return (0);
}

static void
atime_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval == TRUE) {
		zfsvfs->z_atime = TRUE;
		zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
	} else {
		zfsvfs->z_atime = FALSE;
		zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
	}
}

static void
xattr_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval == TRUE) {
		/* XXX locking on vfs_flag? */
#ifdef TODO
		zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
#endif
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
	} else {
		/* XXX locking on vfs_flag? */
#ifdef TODO
		zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
#endif
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
	}
}

static void
blksz_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval < SPA_MINBLOCKSIZE ||
	    newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
		newval = SPA_MAXBLOCKSIZE;

	zfsvfs->z_max_blksz = newval;
	zfsvfs->z_vfs->vfs_bsize = newval;
}

static void
readonly_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval) {
		/* XXX locking on vfs_flag? */
		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
	} else {
		/* XXX locking on vfs_flag? */
		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
	}
}

static void
setuid_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval == FALSE) {
		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
	} else {
		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
	}
}

static void
exec_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	if (newval == FALSE) {
		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
	} else {
		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
	}
}

/*
 * The nbmand mount option can be changed at mount time.
 * We can't allow it to be toggled on live file systems or incorrect
 * behavior may be seen from cifs clients
 *
 * This property isn't registered via dsl_prop_register(), but this callback
 * will be called when a file system is first mounted
 */
static void
nbmand_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;
	if (newval == FALSE) {
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
	} else {
		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
	}
}

static void
snapdir_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	zfsvfs->z_show_ctldir = newval;
}

static void
vscan_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	zfsvfs->z_vscan = newval;
}

static void
acl_mode_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	zfsvfs->z_acl_mode = newval;
}

static void
acl_inherit_changed_cb(void *arg, uint64_t newval)
{
	zfsvfs_t *zfsvfs = arg;

	zfsvfs->z_acl_inherit = newval;
}

static int
zfs_register_callbacks(vfs_t *vfsp)
{
	struct dsl_dataset *ds = NULL;
	objset_t *os = NULL;
	zfsvfs_t *zfsvfs = NULL;
	uint64_t nbmand;
	int readonly, do_readonly = FALSE;
	int setuid, do_setuid = FALSE;
	int exec, do_exec = FALSE;
	int xattr, do_xattr = FALSE;
	int atime, do_atime = FALSE;
	int error = 0;

	ASSERT(vfsp);
	zfsvfs = vfsp->vfs_data;
	ASSERT(zfsvfs);
	os = zfsvfs->z_os;

	/*
	 * The act of registering our callbacks will destroy any mount
	 * options we may have.  In order to enable temporary overrides
	 * of mount options, we stash away the current values and
	 * restore them after we register the callbacks.
	 */
	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
		readonly = B_TRUE;
		do_readonly = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
		readonly = B_FALSE;
		do_readonly = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
		setuid = B_FALSE;
		do_setuid = B_TRUE;
	} else {
		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
			setuid = B_FALSE;
			do_setuid = B_TRUE;
		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
			setuid = B_TRUE;
			do_setuid = B_TRUE;
		}
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
		exec = B_FALSE;
		do_exec = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
		exec = B_TRUE;
		do_exec = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
		xattr = B_FALSE;
		do_xattr = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
		xattr = B_TRUE;
		do_xattr = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
		atime = B_FALSE;
		do_atime = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
		atime = B_TRUE;
		do_atime = B_TRUE;
	}

	/*
	 * nbmand is a special property.  It can only be changed at
	 * mount time.
	 *
	 * This is weird, but it is documented to only be changeable
	 * at mount time.
	 */
	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
		nbmand = B_FALSE;
	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
		nbmand = B_TRUE;
	} else {
		char osname[MAXNAMELEN];

		dmu_objset_name(os, osname);
		if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
		    NULL)) {
			return (error);
		}
	}

	/*
	 * Register property callbacks.
	 *
	 * It would probably be fine to just check for i/o error from
	 * the first prop_register(), but I guess I like to go
	 * overboard...
	 */
	ds = dmu_objset_ds(os);
	error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "xattr", xattr_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "recordsize", blksz_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "readonly", readonly_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "setuid", setuid_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "exec", exec_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "snapdir", snapdir_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "aclmode", acl_mode_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "aclinherit", acl_inherit_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "vscan", vscan_changed_cb, zfsvfs);
	if (error)
		goto unregister;

	/*
	 * Invoke our callbacks to restore temporary mount options.
	 */
	if (do_readonly)
		readonly_changed_cb(zfsvfs, readonly);
	if (do_setuid)
		setuid_changed_cb(zfsvfs, setuid);
	if (do_exec)
		exec_changed_cb(zfsvfs, exec);
	if (do_xattr)
		xattr_changed_cb(zfsvfs, xattr);
	if (do_atime)
		atime_changed_cb(zfsvfs, atime);

	nbmand_changed_cb(zfsvfs, nbmand);

	return (0);

unregister:
	/*
	 * We may attempt to unregister some callbacks that are not
	 * registered, but this is OK; it will simply return ENOMSG,
	 * which we will ignore.
	 */
	(void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
	    zfsvfs);
	(void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
	return (error);

}

static int
zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
{
	int error;

	error = zfs_register_callbacks(zfsvfs->z_vfs);
	if (error)
		return (error);

	/*
	 * Set the objset user_ptr to track its zfsvfs.
	 */
	mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
	mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);

	/*
	 * If we are not mounting (ie: online recv), then we don't
	 * have to worry about replaying the log as we blocked all
	 * operations out since we closed the ZIL.
	 */
	if (mounting) {
		boolean_t readonly;

		/*
		 * During replay we remove the read only flag to
		 * allow replays to succeed.
		 */
		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;

		/*
		 * Parse and replay the intent log.
		 */
		zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
		    zfs_replay_vector, zfs_unlinked_drain);

		zfs_unlinked_drain(zfsvfs);
		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
	}

	if (!zil_disable)
		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);

	return (0);
}

static void
zfs_freezfsvfs(zfsvfs_t *zfsvfs)
{
	mutex_destroy(&zfsvfs->z_znodes_lock);
	mutex_destroy(&zfsvfs->z_online_recv_lock);
	list_destroy(&zfsvfs->z_all_znodes);
	rrw_destroy(&zfsvfs->z_teardown_lock);
	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
	rw_destroy(&zfsvfs->z_fuid_lock);
	kmem_free(zfsvfs, sizeof (zfsvfs_t));
}

static int
zfs_domount(vfs_t *vfsp, char *osname)
{
	uint64_t recordsize, readonly;
	int error = 0;
	int mode;
	zfsvfs_t *zfsvfs;
	znode_t *zp = NULL;

	ASSERT(vfsp);
	ASSERT(osname);

	/*
	 * Initialize the zfs-specific filesystem structure.
	 * Should probably make this a kmem cache, shuffle fields,
	 * and just bzero up to z_hold_mtx[].
	 */
	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
	zfsvfs->z_vfs = vfsp;
	zfsvfs->z_parent = zfsvfs;
	zfsvfs->z_assign = TXG_NOWAIT;
	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;

	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));
	rrw_init(&zfsvfs->z_teardown_lock);
	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);

	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
	    NULL))
		goto out;
	zfsvfs->z_vfs->vfs_bsize = recordsize;

	vfsp->vfs_data = zfsvfs;
	vfsp->mnt_flag |= MNT_LOCAL;
	vfsp->mnt_kern_flag |= MNTK_MPSAFE;
	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;

	if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
		goto out;

	mode = DS_MODE_OWNER;
	if (readonly)
		mode |= DS_MODE_READONLY;

	error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
	if (error == EROFS) {
		mode = DS_MODE_OWNER | DS_MODE_READONLY;
		error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
		    &zfsvfs->z_os);
	}

	if (error)
		goto out;

	if (error = zfs_init_fs(zfsvfs, &zp))
		goto out;

	/*
	 * Set features for file system.
	 */
	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
	if (zfsvfs->z_use_fuids) {
		vfs_set_feature(vfsp, VFSFT_XVATTR);
		vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS);
		vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS);
		vfs_set_feature(vfsp, VFSFT_ACLONCREATE);
	}
	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
	}

	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
		uint64_t pval;

		ASSERT(mode & DS_MODE_READONLY);
		atime_changed_cb(zfsvfs, B_FALSE);
		readonly_changed_cb(zfsvfs, B_TRUE);
		if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
			goto out;
		xattr_changed_cb(zfsvfs, pval);
		zfsvfs->z_issnap = B_TRUE;
	} else {
		error = zfsvfs_setup(zfsvfs, B_TRUE);
	}

	vfs_mountedfrom(vfsp, osname);

	if (!zfsvfs->z_issnap)
		zfsctl_create(zfsvfs);
out:
	if (error) {
		if (zfsvfs->z_os)
			dmu_objset_close(zfsvfs->z_os);
		zfs_freezfsvfs(zfsvfs);
	} else {
		atomic_add_32(&zfs_active_fs_count, 1);
	}

	return (error);
}

void
zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
{
	objset_t *os = zfsvfs->z_os;
	struct dsl_dataset *ds;

	/*
	 * Unregister properties.
	 */
	if (!dmu_objset_is_snapshot(os)) {
		ds = dmu_objset_ds(os);
		VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
		    zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "aclinherit",
		    acl_inherit_changed_cb, zfsvfs) == 0);

		VERIFY(dsl_prop_unregister(ds, "vscan",
		    vscan_changed_cb, zfsvfs) == 0);
	}
}

/*ARGSUSED*/
static int
zfs_mount(vfs_t *vfsp)
{
	kthread_t	*td = curthread;
	vnode_t		*mvp = vfsp->mnt_vnodecovered;
	cred_t		*cr = td->td_ucred;
	char		*osname;
	int		error = 0;
	int		canwrite;

	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
		return (EINVAL);

	/*
	 * If full-owner-access is enabled and delegated administration is
	 * turned on, we must set nosuid.
	 */
	if (zfs_super_owner &&
	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	/*
	 * Check for mount privilege?
	 *
	 * If we don't have privilege then see if
	 * we have local permission to allow it
	 */
	error = secpolicy_fs_mount(cr, mvp, vfsp);
	if (error) {
		error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr);
		if (error != 0)
			goto out;

		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
			vattr_t		vattr;

			/*
			 * Make sure user is the owner of the mount point
			 * or has sufficient privileges.
			 */

			vattr.va_mask = AT_UID;

			vn_lock(mvp, LK_SHARED | LK_RETRY);
			if (error = VOP_GETATTR(mvp, &vattr, cr)) {
				VOP_UNLOCK(mvp, 0);
				goto out;
			}

#if 0 /* CHECK THIS! Is probably needed for zfs_suser. */
			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
				error = EPERM;
				goto out;
			}
#else
			if (error = secpolicy_vnode_owner(mvp, cr, vattr.va_uid)) {
				VOP_UNLOCK(mvp, 0);
				goto out;
			}

			if (error = VOP_ACCESS(mvp, VWRITE, cr, td)) {
				VOP_UNLOCK(mvp, 0);
				goto out;
			}
			VOP_UNLOCK(mvp, 0);
#endif
		}

		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	/*
	 * Refuse to mount a filesystem if we are in a local zone and the
	 * dataset is not visible.
	 */
	if (!INGLOBALZONE(curthread) &&
	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
		error = EPERM;
		goto out;
	}

	/*
	 * When doing a remount, we simply refresh our temporary properties
	 * according to those options set in the current VFS options.
	 */
	if (vfsp->vfs_flag & MS_REMOUNT) {
		/* refresh mount options */
		zfs_unregister_callbacks(vfsp->vfs_data);
		error = zfs_register_callbacks(vfsp);
		goto out;
	}

	DROP_GIANT();
	error = zfs_domount(vfsp, osname);
	PICKUP_GIANT();
out:
	return (error);
}

static int
zfs_statfs(vfs_t *vfsp, struct statfs *statp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	uint64_t refdbytes, availbytes, usedobjs, availobjs;

	statp->f_version = STATFS_VERSION;

	ZFS_ENTER(zfsvfs);

	dmu_objset_space(zfsvfs->z_os,
	    &refdbytes, &availbytes, &usedobjs, &availobjs);

	/*
	 * The underlying storage pool actually uses multiple block sizes.
	 * We report the fragsize as the smallest block size we support,
	 * and we report our blocksize as the filesystem's maximum blocksize.
	 */
	statp->f_bsize = zfsvfs->z_vfs->vfs_bsize;
	statp->f_iosize = zfsvfs->z_vfs->vfs_bsize;

	/*
	 * The following report "total" blocks of various kinds in the
	 * file system, but reported in terms of f_frsize - the
	 * "fragment" size.
	 */
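
	/*
	 * For example, with the default 128K record size, a file system
	 * with 1 GiB of referenced plus available space reports
	 * f_blocks = 1073741824 / 131072 = 8192.
	 */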
	statp->f_blocks = (refdbytes + availbytes) / statp->f_bsize;
	statp->f_bfree = availbytes / statp->f_bsize;
	statp->f_bavail = statp->f_bfree; /* no root reservation */

	/*
	 * statvfs() should really be called statufs(), because it assumes
	 * static metadata.  ZFS doesn't preallocate files, so the best
	 * we can do is report the max that could possibly fit in f_files,
	 * and that minus the number actually used in f_ffree.
	 * For f_ffree, report the smaller of the number of objects available
	 * and the number of blocks (each object will take at least a block).
	 */
	statp->f_ffree = MIN(availobjs, statp->f_bfree);
	statp->f_files = statp->f_ffree + usedobjs;

	/*
	 * We're a zfs filesystem.
	 */
	(void) strlcpy(statp->f_fstypename, "zfs", sizeof(statp->f_fstypename));

	strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
	    sizeof(statp->f_mntfromname));
	strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
	    sizeof(statp->f_mntonname));

	statp->f_namemax = ZFS_MAXNAMELEN;

	ZFS_EXIT(zfsvfs);
	return (0);
}

static int
zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	znode_t *rootzp;
	int error;

	ZFS_ENTER(zfsvfs);

	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
	if (error == 0) {
		*vpp = ZTOV(rootzp);
		error = vn_lock(*vpp, flags);
		(*vpp)->v_vflag |= VV_ROOT;
	}

	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Teardown the zfsvfs::z_os.
 *
 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
 * and 'z_teardown_inactive_lock' held.
 */
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
	znode_t	*zp;

	rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);

	if (!unmounting) {
		/*
		 * We purge the parent filesystem's vfsp as the parent
		 * filesystem and all of its snapshots have their vnode's
		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
		 * 'z_parent' is self referential for non-snapshots.
		 */
		(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
	}

	/*
	 * Close the zil. NB: Can't close the zil while zfs_inactive
	 * threads are blocked as zil_close can call zfs_inactive.
	 */
	if (zfsvfs->z_log) {
		zil_close(zfsvfs->z_log);
		zfsvfs->z_log = NULL;
	}

	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);

	/*
	 * If we are not unmounting (ie: online recv) and someone already
	 * unmounted this file system while we were doing the switcheroo,
	 * or a reopen of z_os failed then just bail out now.
	 */
	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
		return (EIO);
	}

	/*
	 * At this point there are no vops active, and any new vops will
	 * fail with EIO since we have z_teardown_lock for writer (only
	 * relevant for forced unmount).
	 *
	 * Release all holds on dbufs.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
	    zp = list_next(&zfsvfs->z_all_znodes, zp))
		if (zp->z_dbuf) {
			ASSERT(ZTOV(zp)->v_count >= 0);
			zfs_znode_dmu_fini(zp);
		}
	mutex_exit(&zfsvfs->z_znodes_lock);

	/*
	 * If we are unmounting, set the unmounted flag and let new vops
	 * unblock.  zfs_inactive will have the unmounted behavior, and all
	 * other vops will fail with EIO.
	 */
	if (unmounting) {
		zfsvfs->z_unmounted = B_TRUE;
		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
	}

	/*
	 * z_os will be NULL if there was an error in attempting to reopen
	 * zfsvfs, so just return as the properties had already been
	 * unregistered and cached data had been evicted before.
	 */
	if (zfsvfs->z_os == NULL)
		return (0);

	/*
	 * Unregister properties.
	 */
	zfs_unregister_callbacks(zfsvfs);

	/*
	 * Evict cached data
	 */
	if (dmu_objset_evict_dbufs(zfsvfs->z_os)) {
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
		(void) dmu_objset_evict_dbufs(zfsvfs->z_os);
	}

	return (0);
}

/*ARGSUSED*/
static int
zfs_umount(vfs_t *vfsp, int fflag)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	objset_t *os;
	cred_t *cr = curthread->td_ucred;
	int ret;

	if (fflag & MS_FORCE) {
		/* TODO: Force unmount is not well implemented yet, so deny it. */
		ZFS_LOG(0, "Force unmount is experimental - report any problems.");
	}

	ret = secpolicy_fs_unmount(cr, vfsp);
	if (ret) {
		ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
		    ZFS_DELEG_PERM_MOUNT, cr);
		if (ret)
			return (ret);
	}
	/*
	 * We purge the parent filesystem's vfsp as the parent filesystem
	 * and all of its snapshots have their vnode's v_vfsp set to the
	 * parent's filesystem's vfsp.  Note, 'z_parent' is self
	 * referential for non-snapshots.
	 */
	(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);

	/*
	 * Unmount any snapshots mounted under .zfs before unmounting the
	 * dataset itself.
	 */
	if (zfsvfs->z_ctldir != NULL) {
		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
			return (ret);
		ret = vflush(vfsp, 0, 0, curthread);
		ASSERT(ret == EBUSY);
		if (!(fflag & MS_FORCE)) {
			if (zfsvfs->z_ctldir->v_count > 1)
				return (EBUSY);
			ASSERT(zfsvfs->z_ctldir->v_count == 1);
		}
		zfsctl_destroy(zfsvfs);
		ASSERT(zfsvfs->z_ctldir == NULL);
	}

	/*
	 * Flush all the files.
	 */
	ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, curthread);
	if (ret != 0) {
		if (!zfsvfs->z_issnap) {
			zfsctl_create(zfsvfs);
			ASSERT(zfsvfs->z_ctldir != NULL);
		}
		return (ret);
	}

	if (!(fflag & MS_FORCE)) {
		/*
		 * Check the number of active vnodes in the file system.
		 * Our count is maintained in the vfs structure, but the
		 * number is off by 1 to indicate a hold on the vfs
		 * structure itself.
		 *
		 * The '.zfs' directory maintains a reference of its
		 * own, and any active references underneath are
		 * reflected in the vnode count.
		 */
		if (zfsvfs->z_ctldir == NULL) {
			if (vfsp->vfs_count > 1)
				return (EBUSY);
		} else {
			if (vfsp->vfs_count > 2 ||
			    zfsvfs->z_ctldir->v_count > 1)
				return (EBUSY);
		}
	} else {
		MNT_ILOCK(vfsp);
		vfsp->mnt_kern_flag |= MNTK_UNMOUNTF;
		MNT_IUNLOCK(vfsp);
	}

	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
	os = zfsvfs->z_os;

	/*
	 * z_os will be NULL if there was an error in
	 * attempting to reopen zfsvfs.
	 */
	if (os != NULL) {
		/*
		 * Unset the objset user_ptr.
		 */
		mutex_enter(&os->os->os_user_ptr_lock);
		dmu_objset_set_user(os, NULL);
		mutex_exit(&os->os->os_user_ptr_lock);

		/*
		 * Finally release the objset
		 */
		dmu_objset_close(os);
	}

	/*
	 * We can now safely destroy the '.zfs' directory node.
	 */
	if (zfsvfs->z_ctldir != NULL)
		zfsctl_destroy(zfsvfs);
	if (zfsvfs->z_issnap) {
		vnode_t *svp = vfsp->mnt_vnodecovered;

		ASSERT(svp->v_count == 2 || svp->v_count == 1);
		if (svp->v_count == 2)
			VN_RELE(svp);
	}
	zfs_freevfs(vfsp);

	return (0);
}

static int
zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
{
	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
	znode_t		*zp;
	int 		err;

	ZFS_ENTER(zfsvfs);
	err = zfs_zget(zfsvfs, ino, &zp);
	if (err == 0 && zp->z_unlinked) {
		VN_RELE(ZTOV(zp));
		err = EINVAL;
	}
	if (err != 0)
		*vpp = NULL;
	else {
		*vpp = ZTOV(zp);
		vn_lock(*vpp, flags);
	}
	ZFS_EXIT(zfsvfs);
	return (err);
}
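
/*
 * Translate a file handle (e.g. one handed out to the NFS server) back
 * into a vnode.  A short fid identifies an object and generation within
 * this file system; a long fid additionally carries an objset id so that
 * snapshots under .zfs can be resolved via zfsctl_lookup_objset().
 */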
static int
zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, vnode_t **vpp)
{
	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
	znode_t		*zp;
	uint64_t	object = 0;
	uint64_t	fid_gen = 0;
	uint64_t	gen_mask;
	uint64_t	zp_gen;
	int		i, err;

	*vpp = NULL;

	ZFS_ENTER(zfsvfs);

	if (fidp->fid_len == LONG_FID_LEN) {
		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
		uint64_t	objsetid = 0;
		uint64_t	setgen = 0;

		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);

		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);

		ZFS_EXIT(zfsvfs);

		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
		if (err)
			return (EINVAL);
		ZFS_ENTER(zfsvfs);
	}

	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
		zfid_short_t	*zfid = (zfid_short_t *)fidp;

		for (i = 0; i < sizeof (zfid->zf_object); i++)
			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);

		for (i = 0; i < sizeof (zfid->zf_gen); i++)
			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
	} else {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/* A zero fid_gen means we are in the .zfs control directories */
	if (fid_gen == 0 &&
	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
		*vpp = zfsvfs->z_ctldir;
		ASSERT(*vpp != NULL);
		if (object == ZFSCTL_INO_SNAPDIR) {
			VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
			    0, NULL, NULL, NULL, NULL, NULL) == 0);
		} else {
			VN_HOLD(*vpp);
		}
		ZFS_EXIT(zfsvfs);
		/* XXX: LK_RETRY? */
		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
		return (0);
	}

	gen_mask = -1ULL >> (64 - 8 * i);

	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
	if (err = zfs_zget(zfsvfs, object, &zp)) {
		ZFS_EXIT(zfsvfs);
		return (err);
	}
	zp_gen = zp->z_phys->zp_gen & gen_mask;
	if (zp_gen == 0)
		zp_gen = 1;
	if (zp->z_unlinked || zp_gen != fid_gen) {
		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
		VN_RELE(ZTOV(zp));
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	*vpp = ZTOV(zp);
	/* XXX: LK_RETRY? */
	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
	vnode_create_vobject(*vpp, zp->z_phys->zp_size, curthread);
	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Block out VOPs and close zfsvfs_t::z_os
 *
 * Note, if successful, then we return with the 'z_teardown_lock' and
 * 'z_teardown_inactive_lock' write held.
 */
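/*
 * A caller would typically pair this with zfs_resume_fs(), roughly as in
 * the following illustrative sketch:
 *
 *	char osname[MAXNAMELEN];
 *	int mode;
 *
 *	error = zfs_suspend_fs(zfsvfs, osname, &mode);
 *	if (error == 0) {
 *		... operate on the underlying objset ...
 *		error = zfs_resume_fs(zfsvfs, osname, mode);
 *	}
 */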
int
zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode)
{
	int error;

	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
		return (error);

	*mode = zfsvfs->z_os->os_mode;
	dmu_objset_name(zfsvfs->z_os, name);
	dmu_objset_close(zfsvfs->z_os);

	return (0);
}

/*
 * Reopen zfsvfs_t::z_os and release VOPs.
 */
int
zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode)
{
	int err;

	ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));

	err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
	if (err) {
		zfsvfs->z_os = NULL;
	} else {
		znode_t *zp;

		VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);

		/*
		 * Attempt to re-establish all the active znodes with
		 * their dbufs.  If a zfs_rezget() fails, then we'll let
		 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
		 * when they try to use their znode.
		 */
		mutex_enter(&zfsvfs->z_znodes_lock);
		for (zp = list_head(&zfsvfs->z_all_znodes); zp;
		    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
			(void) zfs_rezget(zp);
		}
		mutex_exit(&zfsvfs->z_znodes_lock);

	}

	/* release the VOPs */
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
	rrw_exit(&zfsvfs->z_teardown_lock, FTAG);

	if (err) {
		/*
		 * Since we couldn't reopen zfsvfs::z_os, force
		 * unmount this file system.
		 */
		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
	}
	return (err);
}

static void
zfs_freevfs(vfs_t *vfsp)
{
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	int i;

	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
		mutex_destroy(&zfsvfs->z_hold_mtx[i]);

	zfs_fuid_destroy(zfsvfs);
	zfs_freezfsvfs(zfsvfs);

	atomic_add_32(&zfs_active_fs_count, -1);
}

#ifdef __i386__
static int desiredvnodes_backup;
#endif

static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
	int newdesiredvnodes;

	desiredvnodes_backup = desiredvnodes;

	/*
	 * We calculate newdesiredvnodes the same way it is done in
	 * vntblinit(). If it is equal to desiredvnodes, it means that
	 * it wasn't tuned by the administrator and we can tune it down.
	 */
	newdesiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 *
	    vm_kmem_size / (5 * (sizeof(struct vm_object) +
	    sizeof(struct vnode))));
	if (newdesiredvnodes == desiredvnodes)
		desiredvnodes = (3 * newdesiredvnodes) / 4;
#endif
}

static void
zfs_vnodes_adjust_back(void)
{

#ifdef __i386__
	desiredvnodes = desiredvnodes_backup;
#endif
}

void
zfs_init(void)
{

	printf("ZFS filesystem version " SPA_VERSION_STRING "\n");

	/*
	 * Initialize znode cache, vnode ops, etc...
	 */
	zfs_znode_init();

	/*
	 * Initialize .zfs directory structures
	 */
	zfsctl_init();

	/*
	 * Reduce the number of vnodes. Originally the number of vnodes was
	 * calculated with UFS inodes in mind. We reduce it here because it
	 * is too big for ZFS/i386.
	 */
	zfs_vnodes_adjust();
}

void
zfs_fini(void)
{
	zfsctl_fini();
	zfs_znode_fini();
	zfs_vnodes_adjust_back();
}

int
zfs_busy(void)
{
	return (zfs_active_fs_count != 0);
}

int
zfs_set_version(const char *name, uint64_t newvers)
{
	int error;
	objset_t *os;
	dmu_tx_t *tx;
	uint64_t curvers;

	/*
	 * XXX for now, require that the filesystem be unmounted.  Would
	 * be nice to find the zfsvfs_t and just update that if
	 * possible.
	 */

	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
		return (EINVAL);

	error = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_OWNER, &os);
	if (error)
		return (error);

	error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
	    8, 1, &curvers);
	if (error)
		goto out;
	if (newvers < curvers) {
		error = EINVAL;
		goto out;
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 0, ZPL_VERSION_STR);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		goto out;
	}
	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1,
	    &newvers, tx);

	spa_history_internal_log(LOG_DS_UPGRADE,
	    dmu_objset_spa(os), tx, CRED(),
	    "oldver=%llu newver=%llu dataset = %llu", curvers, newvers,
	    dmu_objset_id(os));
	dmu_tx_commit(tx);

out:
	dmu_objset_close(os);
	return (error);
}
/*
 * Read a property stored within the master node.
 */
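/*
 * For example, a caller that wants the effective ZPL version of an objset
 * might do (illustrative sketch):
 *
 *	uint64_t zplver;
 *
 *	if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplver) == 0)
 *		... use zplver ...
 */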
int
zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
{
	const char *pname;
	int error = ENOENT;

	/*
	 * Look up the file system's value for the property.  For the
	 * version property, we look up a slightly different string.
	 */
	if (prop == ZFS_PROP_VERSION)
		pname = ZPL_VERSION_STR;
	else
		pname = zfs_prop_to_name(prop);

	if (os != NULL)
		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);

	if (error == ENOENT) {
		/* No value set, use the default value */
		switch (prop) {
		case ZFS_PROP_VERSION:
			*value = ZPL_VERSION;
			break;
		case ZFS_PROP_NORMALIZE:
		case ZFS_PROP_UTF8ONLY:
			*value = 0;
			break;
		case ZFS_PROP_CASE:
			*value = ZFS_CASE_SENSITIVE;
			break;
		default:
			return (error);
		}
		error = 0;
	}
	return (error);
}