zfs_vfsops.c revision 7656:2621e50fdf4a
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <sys/types.h>
27#include <sys/param.h>
28#include <sys/systm.h>
29#include <sys/sysmacros.h>
30#include <sys/kmem.h>
31#include <sys/pathname.h>
32#include <sys/vnode.h>
33#include <sys/vfs.h>
34#include <sys/vfs_opreg.h>
35#include <sys/mntent.h>
36#include <sys/mount.h>
37#include <sys/cmn_err.h>
38#include "fs/fs_subr.h"
39#include <sys/zfs_znode.h>
40#include <sys/zfs_dir.h>
41#include <sys/zil.h>
42#include <sys/fs/zfs.h>
43#include <sys/dmu.h>
44#include <sys/dsl_prop.h>
45#include <sys/dsl_dataset.h>
46#include <sys/dsl_deleg.h>
47#include <sys/spa.h>
48#include <sys/zap.h>
49#include <sys/varargs.h>
50#include <sys/policy.h>
51#include <sys/atomic.h>
52#include <sys/mkdev.h>
53#include <sys/modctl.h>
54#include <sys/refstr.h>
55#include <sys/zfs_ioctl.h>
56#include <sys/zfs_ctldir.h>
57#include <sys/zfs_fuid.h>
58#include <sys/bootconf.h>
59#include <sys/sunddi.h>
60#include <sys/dnlc.h>
61#include <sys/dmu_objset.h>
62#include <sys/spa_boot.h>
63
/* Filesystem type index stored into vfs_fstype of every vfs mounted here. */
int zfsfstype;
/* ZFS vfs operations vector; NULL until it is set (the setter is not in view). */
vfsops_t *zfs_vfsops = NULL;
/*
 * Major/minor pair used to generate mount device numbers in
 * zfs_create_unique_device(); updates there are made under zfs_dev_mtx.
 */
static major_t zfs_major;
static minor_t zfs_minor;
static kmutex_t	zfs_dev_mtx;
69
/*
 * Forward declarations of the static VFS entry points referenced by
 * zfs_vfsops_template and zfs_vfsops_eio_template below.
 */
static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
static int zfs_mountroot(vfs_t *vfsp, enum whymountroot);
static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
static void zfs_freevfs(vfs_t *vfsp);
77
/*
 * Table pairing each VFS operation name with the ZFS function that
 * implements it.  The table is terminated by a NULL/NULL entry.
 */
static const fs_operation_def_t zfs_vfsops_template[] = {
	VFSNAME_MOUNT,		{ .vfs_mount = zfs_mount },
	VFSNAME_MOUNTROOT,	{ .vfs_mountroot = zfs_mountroot },
	VFSNAME_UNMOUNT,	{ .vfs_unmount = zfs_umount },
	VFSNAME_ROOT,		{ .vfs_root = zfs_root },
	VFSNAME_STATVFS,	{ .vfs_statvfs = zfs_statvfs },
	VFSNAME_SYNC,		{ .vfs_sync = zfs_sync },
	VFSNAME_VGET,		{ .vfs_vget = zfs_vget },
	VFSNAME_FREEVFS,	{ .vfs_freevfs = zfs_freevfs },
	NULL,			NULL
};
89
/*
 * Reduced operation table that supplies only the freevfs operation.
 * NOTE(review): presumably installed on a vfs whose other operations should
 * fail (the name suggests EIO) -- confirm where this table is used.
 */
static const fs_operation_def_t zfs_vfsops_eio_template[] = {
	VFSNAME_FREEVFS,	{ .vfs_freevfs =  zfs_freevfs },
	NULL,			NULL
};
94
/*
 * Count of active ZFS filesystems.  It is incremented (atomically) each
 * time zfs_domount() succeeds.  Keeping this count is necessary to prevent
 * our module from being unloaded after a umount -f.
 */
static uint32_t	zfs_active_fs_count = 0;
101
/*
 * NULL-terminated lists naming, for each option below, the opposite option;
 * they are used as the cancel lists in the mntopts[] table.
 */
static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
106
/*
 * Table of the ZFS-specific mount options (xattr/noxattr, atime/noatime),
 * each paired with the list of options it cancels.
 *
 * MO_DEFAULT is not used since the default value is determined
 * by the equivalent property.
 */
static mntopt_t mntopts[] = {
	{ MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL },
	{ MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL },
	{ MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL },
	{ MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
};
117
/* Mount option descriptor: the entry count of mntopts[] and the table itself. */
static mntopts_t zfs_mntopts = {
	sizeof (mntopts) / sizeof (mntopt_t),
	mntopts
};
122
123/*ARGSUSED*/
124int
125zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
126{
127	/*
128	 * Data integrity is job one.  We don't want a compromised kernel
129	 * writing to the storage pool, so we never sync during panic.
130	 */
131	if (panicstr)
132		return (0);
133
134	/*
135	 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
136	 * to sync metadata, which they would otherwise cache indefinitely.
137	 * Semantically, the only requirement is that the sync be initiated.
138	 * The DMU syncs out txgs frequently, so there's nothing to do.
139	 */
140	if (flag & SYNC_ATTR)
141		return (0);
142
143	if (vfsp != NULL) {
144		/*
145		 * Sync a specific filesystem.
146		 */
147		zfsvfs_t *zfsvfs = vfsp->vfs_data;
148
149		ZFS_ENTER(zfsvfs);
150		if (zfsvfs->z_log != NULL)
151			zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
152		else
153			txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
154		ZFS_EXIT(zfsvfs);
155	} else {
156		/*
157		 * Sync all ZFS filesystems.  This is what happens when you
158		 * run sync(1M).  Unlike other filesystems, ZFS honors the
159		 * request by waiting for all pools to commit all dirty data.
160		 */
161		spa_sync_allpools();
162	}
163
164	return (0);
165}
166
/*
 * Pick a device number for a new mount that no mounted filesystem is
 * currently using, and store it in *dev.
 *
 * Minor numbers of the current zfs_major are tried in turn; if the search
 * wraps all the way around to where it started, a fresh major number is
 * obtained with getudev() and the search restarts.
 *
 * Returns 0 on success, or -1 if getudev() cannot supply a new major.
 */
static int
zfs_create_unique_device(dev_t *dev)
{
	major_t new_major;

	do {
		ASSERT3U(zfs_minor, <=, MAXMIN32);
		/* Remember where this pass began so a full wrap can be detected. */
		minor_t start = zfs_minor;
		do {
			mutex_enter(&zfs_dev_mtx);
			if (zfs_minor >= MAXMIN32) {
				/*
				 * If we're still using the real major
				 * keep out of /dev/zfs and /dev/zvol minor
				 * number space.  If we're using a getudev()'ed
				 * major number, we can use all of its minors.
				 */
				if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
					zfs_minor = ZFS_MIN_MINOR;
				else
					zfs_minor = 0;
			} else {
				zfs_minor++;
			}
			*dev = makedevice(zfs_major, zfs_minor);
			mutex_exit(&zfs_dev_mtx);
			/*
			 * NOTE(review): zfs_minor is read below (and in the
			 * ASSERT/start capture above) without zfs_dev_mtx
			 * held -- confirm callers are otherwise serialized.
			 */
		} while (vfs_devismounted(*dev) && zfs_minor != start);
		if (zfs_minor == start) {
			/*
			 * We are using all ~262,000 minor numbers for the
			 * current major number.  Create a new major number.
			 */
			if ((new_major = getudev()) == (major_t)-1) {
				cmn_err(CE_WARN,
				    "zfs_mount: Can't get unique major "
				    "device number.");
				return (-1);
			}
			mutex_enter(&zfs_dev_mtx);
			zfs_major = new_major;
			zfs_minor = 0;

			mutex_exit(&zfs_dev_mtx);
		} else {
			/* Found an unused device number. */
			break;
		}
		/* CONSTANTCONDITION */
	} while (1);

	return (0);
}
218
219static void
220atime_changed_cb(void *arg, uint64_t newval)
221{
222	zfsvfs_t *zfsvfs = arg;
223
224	if (newval == TRUE) {
225		zfsvfs->z_atime = TRUE;
226		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
227		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
228	} else {
229		zfsvfs->z_atime = FALSE;
230		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
231		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
232	}
233}
234
235static void
236xattr_changed_cb(void *arg, uint64_t newval)
237{
238	zfsvfs_t *zfsvfs = arg;
239
240	if (newval == TRUE) {
241		/* XXX locking on vfs_flag? */
242		zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
243		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
244		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
245	} else {
246		/* XXX locking on vfs_flag? */
247		zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
248		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
249		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
250	}
251}
252
253static void
254blksz_changed_cb(void *arg, uint64_t newval)
255{
256	zfsvfs_t *zfsvfs = arg;
257
258	if (newval < SPA_MINBLOCKSIZE ||
259	    newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
260		newval = SPA_MAXBLOCKSIZE;
261
262	zfsvfs->z_max_blksz = newval;
263	zfsvfs->z_vfs->vfs_bsize = newval;
264}
265
266static void
267readonly_changed_cb(void *arg, uint64_t newval)
268{
269	zfsvfs_t *zfsvfs = arg;
270
271	if (newval) {
272		/* XXX locking on vfs_flag? */
273		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
274		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
275		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
276	} else {
277		/* XXX locking on vfs_flag? */
278		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
279		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
280		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
281	}
282}
283
284static void
285devices_changed_cb(void *arg, uint64_t newval)
286{
287	zfsvfs_t *zfsvfs = arg;
288
289	if (newval == FALSE) {
290		zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
291		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
292		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
293	} else {
294		zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
295		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
296		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
297	}
298}
299
300static void
301setuid_changed_cb(void *arg, uint64_t newval)
302{
303	zfsvfs_t *zfsvfs = arg;
304
305	if (newval == FALSE) {
306		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
307		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
308		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
309	} else {
310		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
311		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
312		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
313	}
314}
315
316static void
317exec_changed_cb(void *arg, uint64_t newval)
318{
319	zfsvfs_t *zfsvfs = arg;
320
321	if (newval == FALSE) {
322		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
323		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
324		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
325	} else {
326		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
327		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
328		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
329	}
330}
331
332/*
333 * The nbmand mount option can be changed at mount time.
334 * We can't allow it to be toggled on live file systems or incorrect
335 * behavior may be seen from cifs clients
336 *
337 * This property isn't registered via dsl_prop_register(), but this callback
338 * will be called when a file system is first mounted
339 */
340static void
341nbmand_changed_cb(void *arg, uint64_t newval)
342{
343	zfsvfs_t *zfsvfs = arg;
344	if (newval == FALSE) {
345		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
346		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
347	} else {
348		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
349		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
350	}
351}
352
353static void
354snapdir_changed_cb(void *arg, uint64_t newval)
355{
356	zfsvfs_t *zfsvfs = arg;
357
358	zfsvfs->z_show_ctldir = newval;
359}
360
361static void
362vscan_changed_cb(void *arg, uint64_t newval)
363{
364	zfsvfs_t *zfsvfs = arg;
365
366	zfsvfs->z_vscan = newval;
367}
368
369static void
370acl_mode_changed_cb(void *arg, uint64_t newval)
371{
372	zfsvfs_t *zfsvfs = arg;
373
374	zfsvfs->z_acl_mode = newval;
375}
376
377static void
378acl_inherit_changed_cb(void *arg, uint64_t newval)
379{
380	zfsvfs_t *zfsvfs = arg;
381
382	zfsvfs->z_acl_inherit = newval;
383}
384
/*
 * Register the dataset property callbacks for the filesystem mounted on
 * vfsp, while preserving any mount options explicitly set on the vfs.
 *
 * The options currently set on the vfs are captured first, the callbacks
 * are registered, and the captured values are then re-applied by calling
 * the callbacks directly.  The nbmand setting is taken from the mount
 * options if present, otherwise from the dataset's "nbmand" property.
 *
 * Returns 0 on success.  Returns the error from the "nbmand" property
 * lookup (before anything is registered), or the first error from
 * dsl_prop_register() after unregistering all of the callbacks.
 */
static int
zfs_register_callbacks(vfs_t *vfsp)
{
	struct dsl_dataset *ds = NULL;
	objset_t *os = NULL;
	zfsvfs_t *zfsvfs = NULL;
	uint64_t nbmand;
	/*
	 * For each option, the value variable is only assigned (and only
	 * read) when the matching do_* flag is set.
	 */
	int readonly, do_readonly = B_FALSE;
	int setuid, do_setuid = B_FALSE;
	int exec, do_exec = B_FALSE;
	int devices, do_devices = B_FALSE;
	int xattr, do_xattr = B_FALSE;
	int atime, do_atime = B_FALSE;
	int error = 0;

	ASSERT(vfsp);
	zfsvfs = vfsp->vfs_data;
	ASSERT(zfsvfs);
	os = zfsvfs->z_os;

	/*
	 * The act of registering our callbacks will destroy any mount
	 * options we may have.  In order to enable temporary overrides
	 * of mount options, we stash away the current values and
	 * restore them after we register the callbacks.
	 */
	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
		readonly = B_TRUE;
		do_readonly = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
		readonly = B_FALSE;
		do_readonly = B_TRUE;
	}
	/* "nosuid" turns off both devices and setuid. */
	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
		devices = B_FALSE;
		setuid = B_FALSE;
		do_devices = B_TRUE;
		do_setuid = B_TRUE;
	} else {
		if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
			devices = B_FALSE;
			do_devices = B_TRUE;
		} else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) {
			devices = B_TRUE;
			do_devices = B_TRUE;
		}

		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
			setuid = B_FALSE;
			do_setuid = B_TRUE;
		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
			setuid = B_TRUE;
			do_setuid = B_TRUE;
		}
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
		exec = B_FALSE;
		do_exec = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
		exec = B_TRUE;
		do_exec = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
		xattr = B_FALSE;
		do_xattr = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
		xattr = B_TRUE;
		do_xattr = B_TRUE;
	}
	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
		atime = B_FALSE;
		do_atime = B_TRUE;
	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
		atime = B_TRUE;
		do_atime = B_TRUE;
	}

	/*
	 * nbmand is a special property.  It can only be changed at
	 * mount time.
	 *
	 * This is weird, but it is documented to only be changeable
	 * at mount time.
	 */
	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
		nbmand = B_FALSE;
	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
		nbmand = B_TRUE;
	} else {
		/* No mount option given: fall back to the dataset property. */
		char osname[MAXNAMELEN];

		dmu_objset_name(os, osname);
		if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
		    NULL)) {
			return (error);
		}
	}

	/*
	 * Register property callbacks.
	 *
	 * It would probably be fine to just check for i/o error from
	 * the first prop_register(), but I guess I like to go
	 * overboard...
	 *
	 * Each registration below is skipped once an earlier one has
	 * failed, so 'error' holds the first failure.
	 */
	ds = dmu_objset_ds(os);
	error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "xattr", xattr_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "recordsize", blksz_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "readonly", readonly_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "devices", devices_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "setuid", setuid_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "exec", exec_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "snapdir", snapdir_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "aclmode", acl_mode_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "aclinherit", acl_inherit_changed_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds,
	    "vscan", vscan_changed_cb, zfsvfs);
	if (error)
		goto unregister;

	/*
	 * Invoke our callbacks to restore temporary mount options.
	 */
	if (do_readonly)
		readonly_changed_cb(zfsvfs, readonly);
	if (do_setuid)
		setuid_changed_cb(zfsvfs, setuid);
	if (do_exec)
		exec_changed_cb(zfsvfs, exec);
	if (do_devices)
		devices_changed_cb(zfsvfs, devices);
	if (do_xattr)
		xattr_changed_cb(zfsvfs, xattr);
	if (do_atime)
		atime_changed_cb(zfsvfs, atime);

	/* nbmand has no registered callback, so it is always applied here. */
	nbmand_changed_cb(zfsvfs, nbmand);

	return (0);

unregister:
	/*
	 * We may attempt to unregister some callbacks that are not
	 * registered, but this is OK; it will simply return ENOMSG,
	 * which we will ignore.
	 */
	(void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
	(void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
	    zfsvfs);
	(void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
	return (error);

}
556
557static int
558zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
559{
560	int error;
561
562	error = zfs_register_callbacks(zfsvfs->z_vfs);
563	if (error)
564		return (error);
565
566	/*
567	 * Set the objset user_ptr to track its zfsvfs.
568	 */
569	mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
570	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
571	mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
572
573	/*
574	 * If we are not mounting (ie: online recv), then we don't
575	 * have to worry about replaying the log as we blocked all
576	 * operations out since we closed the ZIL.
577	 */
578	if (mounting) {
579		boolean_t readonly;
580
581		/*
582		 * During replay we remove the read only flag to
583		 * allow replays to succeed.
584		 */
585		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
586		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
587
588		/*
589		 * Parse and replay the intent log.
590		 */
591		zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
592		    zfs_replay_vector, zfs_unlinked_drain);
593
594		zfs_unlinked_drain(zfsvfs);
595		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
596	}
597
598	if (!zil_disable)
599		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
600
601	return (0);
602}
603
604static void
605zfs_freezfsvfs(zfsvfs_t *zfsvfs)
606{
607	mutex_destroy(&zfsvfs->z_znodes_lock);
608	mutex_destroy(&zfsvfs->z_online_recv_lock);
609	list_destroy(&zfsvfs->z_all_znodes);
610	rrw_destroy(&zfsvfs->z_teardown_lock);
611	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
612	rw_destroy(&zfsvfs->z_fuid_lock);
613	kmem_free(zfsvfs, sizeof (zfsvfs_t));
614}
615
616static int
617zfs_domount(vfs_t *vfsp, char *osname)
618{
619	dev_t mount_dev;
620	uint64_t recordsize, readonly;
621	int error = 0;
622	int mode;
623	zfsvfs_t *zfsvfs;
624	znode_t *zp = NULL;
625
626	ASSERT(vfsp);
627	ASSERT(osname);
628
629	/*
630	 * Initialize the zfs-specific filesystem structure.
631	 * Should probably make this a kmem cache, shuffle fields,
632	 * and just bzero up to z_hold_mtx[].
633	 */
634	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
635	zfsvfs->z_vfs = vfsp;
636	zfsvfs->z_parent = zfsvfs;
637	zfsvfs->z_assign = TXG_NOWAIT;
638	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
639	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
640
641	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
642	mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL);
643	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
644	    offsetof(znode_t, z_link_node));
645	rrw_init(&zfsvfs->z_teardown_lock);
646	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
647	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
648
649	/* Initialize the generic filesystem structure. */
650	vfsp->vfs_bcount = 0;
651	vfsp->vfs_data = NULL;
652
653	if (zfs_create_unique_device(&mount_dev) == -1) {
654		error = ENODEV;
655		goto out;
656	}
657	ASSERT(vfs_devismounted(mount_dev) == 0);
658
659	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
660	    NULL))
661		goto out;
662
663	vfsp->vfs_dev = mount_dev;
664	vfsp->vfs_fstype = zfsfstype;
665	vfsp->vfs_bsize = recordsize;
666	vfsp->vfs_flag |= VFS_NOTRUNC;
667	vfsp->vfs_data = zfsvfs;
668
669	if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
670		goto out;
671
672	mode = DS_MODE_OWNER;
673	if (readonly)
674		mode |= DS_MODE_READONLY;
675
676	error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
677	if (error == EROFS) {
678		mode = DS_MODE_OWNER | DS_MODE_READONLY;
679		error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
680		    &zfsvfs->z_os);
681	}
682
683	if (error)
684		goto out;
685
686	if (error = zfs_init_fs(zfsvfs, &zp))
687		goto out;
688
689	/* The call to zfs_init_fs leaves the vnode held, release it here. */
690	VN_RELE(ZTOV(zp));
691
692	/*
693	 * Set features for file system.
694	 */
695	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
696	if (zfsvfs->z_use_fuids) {
697		vfs_set_feature(vfsp, VFSFT_XVATTR);
698		vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS);
699		vfs_set_feature(vfsp, VFSFT_ACLONCREATE);
700	}
701	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
702		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
703		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
704		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
705	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
706		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
707		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
708	}
709
710	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
711		uint64_t pval;
712
713		ASSERT(mode & DS_MODE_READONLY);
714		atime_changed_cb(zfsvfs, B_FALSE);
715		readonly_changed_cb(zfsvfs, B_TRUE);
716		if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
717			goto out;
718		xattr_changed_cb(zfsvfs, pval);
719		zfsvfs->z_issnap = B_TRUE;
720	} else {
721		error = zfsvfs_setup(zfsvfs, B_TRUE);
722	}
723
724	if (!zfsvfs->z_issnap)
725		zfsctl_create(zfsvfs);
726out:
727	if (error) {
728		if (zfsvfs->z_os)
729			dmu_objset_close(zfsvfs->z_os);
730		zfs_freezfsvfs(zfsvfs);
731	} else {
732		atomic_add_32(&zfs_active_fs_count, 1);
733	}
734
735	return (error);
736}
737
738void
739zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
740{
741	objset_t *os = zfsvfs->z_os;
742	struct dsl_dataset *ds;
743
744	/*
745	 * Unregister properties.
746	 */
747	if (!dmu_objset_is_snapshot(os)) {
748		ds = dmu_objset_ds(os);
749		VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
750		    zfsvfs) == 0);
751
752		VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
753		    zfsvfs) == 0);
754
755		VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
756		    zfsvfs) == 0);
757
758		VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
759		    zfsvfs) == 0);
760
761		VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
762		    zfsvfs) == 0);
763
764		VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
765		    zfsvfs) == 0);
766
767		VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
768		    zfsvfs) == 0);
769
770		VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
771		    zfsvfs) == 0);
772
773		VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
774		    zfsvfs) == 0);
775
776		VERIFY(dsl_prop_unregister(ds, "aclinherit",
777		    acl_inherit_changed_cb, zfsvfs) == 0);
778
779		VERIFY(dsl_prop_unregister(ds, "vscan",
780		    vscan_changed_cb, zfsvfs) == 0);
781	}
782}
783
/*
 * Convert a decimal digit string to a uint64_t integer.
 *
 * On success, stores the value in *objnum and returns 0.  An empty string
 * converts to 0.  Returns EINVAL, leaving *objnum untouched, if the string
 * contains a character other than '0'-'9' or if the value does not fit in
 * a uint64_t.
 */
static int
str_to_uint64(char *str, uint64_t *objnum)
{
	uint64_t num = 0;

	for (; *str != '\0'; str++) {
		uint64_t digit;

		if (*str < '0' || *str > '9')
			return (EINVAL);
		digit = (uint64_t)(*str - '0');

		/* Reject the digit if num * 10 + digit would wrap around. */
		if (num > (UINT64_MAX - digit) / 10)
			return (EINVAL);

		num = num * 10 + digit;
	}

	*objnum = num;
	return (0);
}
802
803/*
804 * The boot path passed from the boot loader is in the form of
805 * "rootpool-name/root-filesystem-object-number'. Convert this
806 * string to a dataset name: "rootpool-name/root-filesystem-name".
807 */
808static int
809zfs_parse_bootfs(char *bpath, char *outpath)
810{
811	char *slashp;
812	uint64_t objnum;
813	int error;
814
815	if (*bpath == 0 || *bpath == '/')
816		return (EINVAL);
817
818	(void) strcpy(outpath, bpath);
819
820	slashp = strchr(bpath, '/');
821
822	/* if no '/', just return the pool name */
823	if (slashp == NULL) {
824		return (0);
825	}
826
827	/* if not a number, just return the root dataset name */
828	if (str_to_uint64(slashp+1, &objnum)) {
829		return (0);
830	}
831
832	*slashp = '\0';
833	error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
834	*slashp = '/';
835
836	return (error);
837}
838
/*
 * VFS mountroot entry point.  'why' selects the action:
 *
 *   ROOT_INIT    - import the root pool, resolve the boot filesystem name
 *                  into rootfs.bo_name, mount it, and set rootvp.
 *   ROOT_REMOUNT - clear the read-only setting, set VFS_REMOUNT, and
 *                  re-register the property callbacks.
 *   ROOT_UNMOUNT - unregister the property callbacks and sync.
 *
 * Any other value of 'why' returns ENOTSUP.  Otherwise returns 0 on
 * success or an error number.
 */
static int
zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
{
	int error = 0;
	/* Non-zero once ROOT_INIT has been attempted. */
	static int zfsrootdone = 0;
	zfsvfs_t *zfsvfs = NULL;
	znode_t *zp = NULL;
	vnode_t *vp = NULL;
	char *zfs_bootfs;
	char *zfs_devid;

	ASSERT(vfsp);

	/*
	 * The filesystem that we mount as root is defined in the
	 * boot property "zfs-bootfs" with a format of
	 * "poolname/root-dataset-objnum".
	 */
	if (why == ROOT_INIT) {
		/* Only the first ROOT_INIT call proceeds; later ones get EBUSY. */
		if (zfsrootdone++)
			return (EBUSY);
		/*
		 * the process of doing a spa_load will require the
		 * clock to be set before we could (for example) do
		 * something better by looking at the timestamp on
		 * an uberblock, so just set it to -1.
		 */
		clkset(-1);

		if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
			cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
			    "bootfs name");
			return (EINVAL);
		}
		/* The device id is optional; it is freed right after the import. */
		zfs_devid = spa_get_bootprop("diskdevid");
		error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
		if (zfs_devid)
			spa_free_bootprop(zfs_devid);
		if (error) {
			spa_free_bootprop(zfs_bootfs);
			cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
			    error);
			return (error);
		}
		/* Rewrite rootfs.bo_name with the resolved dataset name. */
		if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
			spa_free_bootprop(zfs_bootfs);
			cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
			    error);
			return (error);
		}

		spa_free_bootprop(zfs_bootfs);

		/* From here on, every exit goes through 'out' to drop the vfs lock. */
		if (error = vfs_lock(vfsp))
			return (error);

		if (error = zfs_domount(vfsp, rootfs.bo_name)) {
			cmn_err(CE_NOTE, "zfs_domount: error %d", error);
			goto out;
		}

		zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
		ASSERT(zfsvfs);
		if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
			cmn_err(CE_NOTE, "zfs_zget: error %d", error);
			goto out;
		}

		/* Mark the root znode's vnode as the filesystem root. */
		vp = ZTOV(zp);
		mutex_enter(&vp->v_lock);
		vp->v_flag |= VROOT;
		mutex_exit(&vp->v_lock);
		rootvp = vp;

		/*
		 * Leave rootvp held.  The root file system is never unmounted.
		 */

		vfs_add((struct vnode *)0, vfsp,
		    (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
out:
		vfs_unlock(vfsp);
		return (error);
	} else if (why == ROOT_REMOUNT) {
		readonly_changed_cb(vfsp->vfs_data, B_FALSE);
		vfsp->vfs_flag |= VFS_REMOUNT;

		/* refresh mount options */
		zfs_unregister_callbacks(vfsp->vfs_data);
		return (zfs_register_callbacks(vfsp));

	} else if (why == ROOT_UNMOUNT) {
		zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
		(void) zfs_sync(vfsp, 0, 0);
		return (0);
	}

	/*
	 * if "why" is equal to anything else other than ROOT_INIT,
	 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
	 */
	return (ENOTSUP);
}
942
943/*ARGSUSED*/
944static int
945zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
946{
947	char		*osname;
948	pathname_t	spn;
949	int		error = 0;
950	uio_seg_t	fromspace = (uap->flags & MS_SYSSPACE) ?
951	    UIO_SYSSPACE : UIO_USERSPACE;
952	int		canwrite;
953
954	if (mvp->v_type != VDIR)
955		return (ENOTDIR);
956
957	mutex_enter(&mvp->v_lock);
958	if ((uap->flags & MS_REMOUNT) == 0 &&
959	    (uap->flags & MS_OVERLAY) == 0 &&
960	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
961		mutex_exit(&mvp->v_lock);
962		return (EBUSY);
963	}
964	mutex_exit(&mvp->v_lock);
965
966	/*
967	 * ZFS does not support passing unparsed data in via MS_DATA.
968	 * Users should use the MS_OPTIONSTR interface; this means
969	 * that all option parsing is already done and the options struct
970	 * can be interrogated.
971	 */
972	if ((uap->flags & MS_DATA) && uap->datalen > 0)
973		return (EINVAL);
974
975	/*
976	 * Get the objset name (the "special" mount argument).
977	 */
978	if (error = pn_get(uap->spec, fromspace, &spn))
979		return (error);
980
981	osname = spn.pn_path;
982
983	/*
984	 * Check for mount privilege?
985	 *
986	 * If we don't have privilege then see if
987	 * we have local permission to allow it
988	 */
989	error = secpolicy_fs_mount(cr, mvp, vfsp);
990	if (error) {
991		error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr);
992		if (error == 0) {
993			vattr_t		vattr;
994
995			/*
996			 * Make sure user is the owner of the mount point
997			 * or has sufficient privileges.
998			 */
999
1000			vattr.va_mask = AT_UID;
1001
1002			if (error = VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
1003				goto out;
1004			}
1005
1006			if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 &&
1007			    VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) {
1008				error = EPERM;
1009				goto out;
1010			}
1011
1012			secpolicy_fs_mount_clearopts(cr, vfsp);
1013		} else {
1014			goto out;
1015		}
1016	}
1017
1018	/*
1019	 * Refuse to mount a filesystem if we are in a local zone and the
1020	 * dataset is not visible.
1021	 */
1022	if (!INGLOBALZONE(curproc) &&
1023	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
1024		error = EPERM;
1025		goto out;
1026	}
1027
1028	/*
1029	 * When doing a remount, we simply refresh our temporary properties
1030	 * according to those options set in the current VFS options.
1031	 */
1032	if (uap->flags & MS_REMOUNT) {
1033		/* refresh mount options */
1034		zfs_unregister_callbacks(vfsp->vfs_data);
1035		error = zfs_register_callbacks(vfsp);
1036		goto out;
1037	}
1038
1039	error = zfs_domount(vfsp, osname);
1040
1041out:
1042	pn_free(&spn);
1043	return (error);
1044}
1045
1046static int
1047zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp)
1048{
1049	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1050	dev32_t d32;
1051	uint64_t refdbytes, availbytes, usedobjs, availobjs;
1052
1053	ZFS_ENTER(zfsvfs);
1054
1055	dmu_objset_space(zfsvfs->z_os,
1056	    &refdbytes, &availbytes, &usedobjs, &availobjs);
1057
1058	/*
1059	 * The underlying storage pool actually uses multiple block sizes.
1060	 * We report the fragsize as the smallest block size we support,
1061	 * and we report our blocksize as the filesystem's maximum blocksize.
1062	 */
1063	statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT;
1064	statp->f_bsize = zfsvfs->z_max_blksz;
1065
1066	/*
1067	 * The following report "total" blocks of various kinds in the
1068	 * file system, but reported in terms of f_frsize - the
1069	 * "fragment" size.
1070	 */
1071
1072	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1073	statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT;
1074	statp->f_bavail = statp->f_bfree; /* no root reservation */
1075
1076	/*
1077	 * statvfs() should really be called statufs(), because it assumes
1078	 * static metadata.  ZFS doesn't preallocate files, so the best
1079	 * we can do is report the max that could possibly fit in f_files,
1080	 * and that minus the number actually used in f_ffree.
1081	 * For f_ffree, report the smaller of the number of object available
1082	 * and the number of blocks (each object will take at least a block).
1083	 */
1084	statp->f_ffree = MIN(availobjs, statp->f_bfree);
1085	statp->f_favail = statp->f_ffree;	/* no "root reservation" */
1086	statp->f_files = statp->f_ffree + usedobjs;
1087
1088	(void) cmpldev(&d32, vfsp->vfs_dev);
1089	statp->f_fsid = d32;
1090
1091	/*
1092	 * We're a zfs filesystem.
1093	 */
1094	(void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name);
1095
1096	statp->f_flag = vf_to_stf(vfsp->vfs_flag);
1097
1098	statp->f_namemax = ZFS_MAXNAMELEN;
1099
1100	/*
1101	 * We have all of 32 characters to stuff a string here.
1102	 * Is there anything useful we could/should provide?
1103	 */
1104	bzero(statp->f_fstr, sizeof (statp->f_fstr));
1105
1106	ZFS_EXIT(zfsvfs);
1107	return (0);
1108}
1109
1110static int
1111zfs_root(vfs_t *vfsp, vnode_t **vpp)
1112{
1113	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1114	znode_t *rootzp;
1115	int error;
1116
1117	ZFS_ENTER(zfsvfs);
1118
1119	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1120	if (error == 0)
1121		*vpp = ZTOV(rootzp);
1122
1123	ZFS_EXIT(zfsvfs);
1124	return (error);
1125}
1126
1127/*
1128 * Teardown the zfsvfs::z_os.
1129 *
1130 * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock'
1131 * and 'z_teardown_inactive_lock' held.
1132 */
1133static int
1134zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
1135{
1136	znode_t	*zp;
1137
1138	rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
1139
1140	if (!unmounting) {
1141		/*
1142		 * We purge the parent filesystem's vfsp as the parent
1143		 * filesystem and all of its snapshots have their vnode's
1144		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
1145		 * 'z_parent' is self referential for non-snapshots.
1146		 */
1147		(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1148	}
1149
1150	/*
1151	 * Close the zil. NB: Can't close the zil while zfs_inactive
1152	 * threads are blocked as zil_close can call zfs_inactive.
1153	 */
1154	if (zfsvfs->z_log) {
1155		zil_close(zfsvfs->z_log);
1156		zfsvfs->z_log = NULL;
1157	}
1158
1159	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
1160
1161	/*
1162	 * If we are not unmounting (ie: online recv) and someone already
1163	 * unmounted this file system while we were doing the switcheroo,
1164	 * or a reopen of z_os failed then just bail out now.
1165	 */
1166	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
1167		rw_exit(&zfsvfs->z_teardown_inactive_lock);
1168		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1169		return (EIO);
1170	}
1171
1172	/*
1173	 * At this point there are no vops active, and any new vops will
1174	 * fail with EIO since we have z_teardown_lock for writer (only
1175	 * relavent for forced unmount).
1176	 *
1177	 * Release all holds on dbufs.
1178	 */
1179	mutex_enter(&zfsvfs->z_znodes_lock);
1180	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
1181	    zp = list_next(&zfsvfs->z_all_znodes, zp))
1182		if (zp->z_dbuf) {
1183			ASSERT(ZTOV(zp)->v_count > 0);
1184			zfs_znode_dmu_fini(zp);
1185		}
1186	mutex_exit(&zfsvfs->z_znodes_lock);
1187
1188	/*
1189	 * If we are unmounting, set the unmounted flag and let new vops
1190	 * unblock.  zfs_inactive will have the unmounted behavior, and all
1191	 * other vops will fail with EIO.
1192	 */
1193	if (unmounting) {
1194		zfsvfs->z_unmounted = B_TRUE;
1195		rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1196		rw_exit(&zfsvfs->z_teardown_inactive_lock);
1197	}
1198
1199	/*
1200	 * z_os will be NULL if there was an error in attempting to reopen
1201	 * zfsvfs, so just return as the properties had already been
1202	 * unregistered and cached data had been evicted before.
1203	 */
1204	if (zfsvfs->z_os == NULL)
1205		return (0);
1206
1207	/*
1208	 * Unregister properties.
1209	 */
1210	zfs_unregister_callbacks(zfsvfs);
1211
1212	/*
1213	 * Evict cached data
1214	 */
1215	if (dmu_objset_evict_dbufs(zfsvfs->z_os)) {
1216		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
1217		(void) dmu_objset_evict_dbufs(zfsvfs->z_os);
1218	}
1219
1220	return (0);
1221}
1222
1223/*ARGSUSED*/
1224static int
1225zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
1226{
1227	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1228	objset_t *os;
1229	int ret;
1230
1231	ret = secpolicy_fs_unmount(cr, vfsp);
1232	if (ret) {
1233		ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
1234		    ZFS_DELEG_PERM_MOUNT, cr);
1235		if (ret)
1236			return (ret);
1237	}
1238
1239	/*
1240	 * We purge the parent filesystem's vfsp as the parent filesystem
1241	 * and all of its snapshots have their vnode's v_vfsp set to the
1242	 * parent's filesystem's vfsp.  Note, 'z_parent' is self
1243	 * referential for non-snapshots.
1244	 */
1245	(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1246
1247	/*
1248	 * Unmount any snapshots mounted under .zfs before unmounting the
1249	 * dataset itself.
1250	 */
1251	if (zfsvfs->z_ctldir != NULL &&
1252	    (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
1253		return (ret);
1254	}
1255
1256	if (!(fflag & MS_FORCE)) {
1257		/*
1258		 * Check the number of active vnodes in the file system.
1259		 * Our count is maintained in the vfs structure, but the
1260		 * number is off by 1 to indicate a hold on the vfs
1261		 * structure itself.
1262		 *
1263		 * The '.zfs' directory maintains a reference of its
1264		 * own, and any active references underneath are
1265		 * reflected in the vnode count.
1266		 */
1267		if (zfsvfs->z_ctldir == NULL) {
1268			if (vfsp->vfs_count > 1)
1269				return (EBUSY);
1270		} else {
1271			if (vfsp->vfs_count > 2 ||
1272			    zfsvfs->z_ctldir->v_count > 1)
1273				return (EBUSY);
1274		}
1275	}
1276
1277	vfsp->vfs_flag |= VFS_UNMOUNTED;
1278
1279	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
1280	os = zfsvfs->z_os;
1281
1282	/*
1283	 * z_os will be NULL if there was an error in
1284	 * attempting to reopen zfsvfs.
1285	 */
1286	if (os != NULL) {
1287		/*
1288		 * Unset the objset user_ptr.
1289		 */
1290		mutex_enter(&os->os->os_user_ptr_lock);
1291		dmu_objset_set_user(os, NULL);
1292		mutex_exit(&os->os->os_user_ptr_lock);
1293
1294		/*
1295		 * Finally release the objset
1296		 */
1297		dmu_objset_close(os);
1298	}
1299
1300	/*
1301	 * We can now safely destroy the '.zfs' directory node.
1302	 */
1303	if (zfsvfs->z_ctldir != NULL)
1304		zfsctl_destroy(zfsvfs);
1305
1306	return (0);
1307}
1308
1309static int
1310zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
1311{
1312	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
1313	znode_t		*zp;
1314	uint64_t	object = 0;
1315	uint64_t	fid_gen = 0;
1316	uint64_t	gen_mask;
1317	uint64_t	zp_gen;
1318	int 		i, err;
1319
1320	*vpp = NULL;
1321
1322	ZFS_ENTER(zfsvfs);
1323
1324	if (fidp->fid_len == LONG_FID_LEN) {
1325		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
1326		uint64_t	objsetid = 0;
1327		uint64_t	setgen = 0;
1328
1329		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
1330			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
1331
1332		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
1333			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
1334
1335		ZFS_EXIT(zfsvfs);
1336
1337		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
1338		if (err)
1339			return (EINVAL);
1340		ZFS_ENTER(zfsvfs);
1341	}
1342
1343	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
1344		zfid_short_t	*zfid = (zfid_short_t *)fidp;
1345
1346		for (i = 0; i < sizeof (zfid->zf_object); i++)
1347			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
1348
1349		for (i = 0; i < sizeof (zfid->zf_gen); i++)
1350			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
1351	} else {
1352		ZFS_EXIT(zfsvfs);
1353		return (EINVAL);
1354	}
1355
1356	/* A zero fid_gen means we are in the .zfs control directories */
1357	if (fid_gen == 0 &&
1358	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
1359		*vpp = zfsvfs->z_ctldir;
1360		ASSERT(*vpp != NULL);
1361		if (object == ZFSCTL_INO_SNAPDIR) {
1362			VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
1363			    0, NULL, NULL, NULL, NULL, NULL) == 0);
1364		} else {
1365			VN_HOLD(*vpp);
1366		}
1367		ZFS_EXIT(zfsvfs);
1368		return (0);
1369	}
1370
1371	gen_mask = -1ULL >> (64 - 8 * i);
1372
1373	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
1374	if (err = zfs_zget(zfsvfs, object, &zp)) {
1375		ZFS_EXIT(zfsvfs);
1376		return (err);
1377	}
1378	zp_gen = zp->z_phys->zp_gen & gen_mask;
1379	if (zp_gen == 0)
1380		zp_gen = 1;
1381	if (zp->z_unlinked || zp_gen != fid_gen) {
1382		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
1383		VN_RELE(ZTOV(zp));
1384		ZFS_EXIT(zfsvfs);
1385		return (EINVAL);
1386	}
1387
1388	*vpp = ZTOV(zp);
1389	ZFS_EXIT(zfsvfs);
1390	return (0);
1391}
1392
1393/*
1394 * Block out VOPs and close zfsvfs_t::z_os
1395 *
1396 * Note, if successful, then we return with the 'z_teardown_lock' and
1397 * 'z_teardown_inactive_lock' write held.
1398 */
1399int
1400zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode)
1401{
1402	int error;
1403
1404	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
1405		return (error);
1406
1407	*mode = zfsvfs->z_os->os_mode;
1408	dmu_objset_name(zfsvfs->z_os, name);
1409	dmu_objset_close(zfsvfs->z_os);
1410
1411	return (0);
1412}
1413
1414/*
1415 * Reopen zfsvfs_t::z_os and release VOPs.
1416 */
1417int
1418zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode)
1419{
1420	int err;
1421
1422	ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
1423	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
1424
1425	err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
1426	if (err) {
1427		zfsvfs->z_os = NULL;
1428	} else {
1429		znode_t *zp;
1430
1431		VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
1432
1433		/*
1434		 * Attempt to re-establish all the active znodes with
1435		 * their dbufs.  If a zfs_rezget() fails, then we'll let
1436		 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
1437		 * when they try to use their znode.
1438		 */
1439		mutex_enter(&zfsvfs->z_znodes_lock);
1440		for (zp = list_head(&zfsvfs->z_all_znodes); zp;
1441		    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
1442			(void) zfs_rezget(zp);
1443		}
1444		mutex_exit(&zfsvfs->z_znodes_lock);
1445
1446	}
1447
1448	/* release the VOPs */
1449	rw_exit(&zfsvfs->z_teardown_inactive_lock);
1450	rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1451
1452	if (err) {
1453		/*
1454		 * Since we couldn't reopen zfsvfs::z_os, force
1455		 * unmount this file system.
1456		 */
1457		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
1458			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
1459	}
1460	return (err);
1461}
1462
1463static void
1464zfs_freevfs(vfs_t *vfsp)
1465{
1466	zfsvfs_t *zfsvfs = vfsp->vfs_data;
1467	int i;
1468
1469	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1470		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1471
1472	zfs_fuid_destroy(zfsvfs);
1473	zfs_freezfsvfs(zfsvfs);
1474
1475	atomic_add_32(&zfs_active_fs_count, -1);
1476}
1477
1478/*
1479 * VFS_INIT() initialization.  Note that there is no VFS_FINI(),
1480 * so we can't safely do any non-idempotent initialization here.
1481 * Leave that to zfs_init() and zfs_fini(), which are called
1482 * from the module's _init() and _fini() entry points.
1483 */
1484/*ARGSUSED*/
1485static int
1486zfs_vfsinit(int fstype, char *name)
1487{
1488	int error;
1489
1490	zfsfstype = fstype;
1491
1492	/*
1493	 * Setup vfsops and vnodeops tables.
1494	 */
1495	error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops);
1496	if (error != 0) {
1497		cmn_err(CE_WARN, "zfs: bad vfs ops template");
1498	}
1499
1500	error = zfs_create_op_tables();
1501	if (error) {
1502		zfs_remove_op_tables();
1503		cmn_err(CE_WARN, "zfs: bad vnode ops template");
1504		(void) vfs_freevfsops_by_type(zfsfstype);
1505		return (error);
1506	}
1507
1508	mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
1509
1510	/*
1511	 * Unique major number for all zfs mounts.
1512	 * If we run out of 32-bit minors, we'll getudev() another major.
1513	 */
1514	zfs_major = ddi_name_to_major(ZFS_DRIVER);
1515	zfs_minor = ZFS_MIN_MINOR;
1516
1517	return (0);
1518}
1519
/*
 * One-time ZFS file system initialization.  The control-directory
 * initialization runs first, then the znode initialization.
 */
void
zfs_init(void)
{
	/* Set up the structures backing the '.zfs' directory. */
	zfsctl_init();

	/* Set up the znode cache, vnode ops, etc. */
	zfs_znode_init();
}
1533
/*
 * Counterpart of zfs_init(): run the '.zfs' control-directory teardown,
 * then the znode teardown.
 */
void
zfs_fini(void)
{
	zfsctl_fini();

	zfs_znode_fini();
}
1540
1541int
1542zfs_busy(void)
1543{
1544	return (zfs_active_fs_count != 0);
1545}
1546
1547int
1548zfs_set_version(const char *name, uint64_t newvers)
1549{
1550	int error;
1551	objset_t *os;
1552	dmu_tx_t *tx;
1553	uint64_t curvers;
1554
1555	/*
1556	 * XXX for now, require that the filesystem be unmounted.  Would
1557	 * be nice to find the zfsvfs_t and just update that if
1558	 * possible.
1559	 */
1560
1561	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
1562		return (EINVAL);
1563
1564	error = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_OWNER, &os);
1565	if (error)
1566		return (error);
1567
1568	error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
1569	    8, 1, &curvers);
1570	if (error)
1571		goto out;
1572	if (newvers < curvers) {
1573		error = EINVAL;
1574		goto out;
1575	}
1576
1577	tx = dmu_tx_create(os);
1578	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 0, ZPL_VERSION_STR);
1579	error = dmu_tx_assign(tx, TXG_WAIT);
1580	if (error) {
1581		dmu_tx_abort(tx);
1582		goto out;
1583	}
1584	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1,
1585	    &newvers, tx);
1586
1587	spa_history_internal_log(LOG_DS_UPGRADE,
1588	    dmu_objset_spa(os), tx, CRED(),
1589	    "oldver=%llu newver=%llu dataset = %llu", curvers, newvers,
1590	    dmu_objset_id(os));
1591	dmu_tx_commit(tx);
1592
1593out:
1594	dmu_objset_close(os);
1595	return (error);
1596}
1597
1598/*
1599 * Read a property stored within the master node.
1600 */
1601int
1602zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
1603{
1604	const char *pname;
1605	int error = ENOENT;
1606
1607	/*
1608	 * Look up the file system's value for the property.  For the
1609	 * version property, we look up a slightly different string.
1610	 */
1611	if (prop == ZFS_PROP_VERSION)
1612		pname = ZPL_VERSION_STR;
1613	else
1614		pname = zfs_prop_to_name(prop);
1615
1616	if (os != NULL)
1617		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
1618
1619	if (error == ENOENT) {
1620		/* No value set, use the default value */
1621		switch (prop) {
1622		case ZFS_PROP_VERSION:
1623			*value = ZPL_VERSION;
1624			break;
1625		case ZFS_PROP_NORMALIZE:
1626		case ZFS_PROP_UTF8ONLY:
1627			*value = 0;
1628			break;
1629		case ZFS_PROP_CASE:
1630			*value = ZFS_CASE_SENSITIVE;
1631			break;
1632		default:
1633			return (error);
1634		}
1635		error = 0;
1636	}
1637	return (error);
1638}
1639
1640static vfsdef_t vfw = {
1641	VFSDEF_VERSION,
1642	MNTTYPE_ZFS,
1643	zfs_vfsinit,
1644	VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS|
1645	    VSW_XID,
1646	&zfs_mntopts
1647};
1648
1649struct modlfs zfs_modlfs = {
1650	&mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw
1651};
1652