vfs.c revision 10793:34709091de6d
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
27/*	  All Rights Reserved  	*/
28
29/*
30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 * The Regents of the University of California
32 * All Rights Reserved
33 *
34 * University Acknowledgment- Portions of this document are derived from
35 * software developed by the University of California, Berkeley, and its
36 * contributors.
37 */
38
39#include <sys/types.h>
40#include <sys/t_lock.h>
41#include <sys/param.h>
42#include <sys/errno.h>
43#include <sys/user.h>
44#include <sys/fstyp.h>
45#include <sys/kmem.h>
46#include <sys/systm.h>
47#include <sys/proc.h>
48#include <sys/mount.h>
49#include <sys/vfs.h>
50#include <sys/vfs_opreg.h>
51#include <sys/fem.h>
52#include <sys/mntent.h>
53#include <sys/stat.h>
54#include <sys/statvfs.h>
55#include <sys/statfs.h>
56#include <sys/cred.h>
57#include <sys/vnode.h>
58#include <sys/rwstlock.h>
59#include <sys/dnlc.h>
60#include <sys/file.h>
61#include <sys/time.h>
62#include <sys/atomic.h>
63#include <sys/cmn_err.h>
64#include <sys/buf.h>
65#include <sys/swap.h>
66#include <sys/debug.h>
67#include <sys/vnode.h>
68#include <sys/modctl.h>
69#include <sys/ddi.h>
70#include <sys/pathname.h>
71#include <sys/bootconf.h>
72#include <sys/dumphdr.h>
73#include <sys/dc_ki.h>
74#include <sys/poll.h>
75#include <sys/sunddi.h>
76#include <sys/sysmacros.h>
77#include <sys/zone.h>
78#include <sys/policy.h>
79#include <sys/ctfs.h>
80#include <sys/objfs.h>
81#include <sys/console.h>
82#include <sys/reboot.h>
83#include <sys/attr.h>
84#include <sys/spa.h>
85#include <sys/lofi.h>
86#include <sys/bootprops.h>
87
88#include <vm/page.h>
89
90#include <fs/fs_subr.h>
/* Private interfaces to create vopstats-related data structures */
extern void		initialize_vopstats(vopstats_t *);
extern vopstats_t	*get_fstype_vopstats(struct vfs *, struct vfssw *);
extern vsk_anchor_t	*get_vskstat_anchor(struct vfs *);

/*
 * Forward declarations of file-local (static) helpers: mount option table
 * manipulation, mnttab bookkeeping and root file system discovery.
 */
static void vfs_clearmntopt_nolock(mntopts_t *, const char *, int);
static void vfs_setmntopt_nolock(mntopts_t *, const char *,
    const char *, int, int);
static int  vfs_optionisset_nolock(const mntopts_t *, const char *, char **);
static void vfs_freemnttab(struct vfs *);
static void vfs_freeopt(mntopt_t *);
static void vfs_swapopttbl_nolock(mntopts_t *, mntopts_t *);
static void vfs_swapopttbl(mntopts_t *, mntopts_t *);
static void vfs_copyopttbl_extend(const mntopts_t *, mntopts_t *, int);
static void vfs_createopttbl_extend(mntopts_t *, const char *,
    const mntopts_t *);
static char **vfs_copycancelopt_extend(char **const, int);
static void vfs_freecancelopt(char **);
static void getrootfs(char **, char **);
static int getmacpath(dev_info_t *, void *);
static void vfs_mnttabvp_setup(void);
112
/*
 * Singly linked list node pairing a device number with a vfs.
 * NOTE(review): the "ipmnt"/"miplist" naming suggests this tracks mounts
 * that are in progress -- confirm against the code that walks the list.
 */
struct ipmnt {
	struct ipmnt	*mip_next;	/* next node in the list */
	dev_t		mip_dev;	/* device number of this entry */
	struct vfs	*mip_vfsp;	/* vfs recorded for mip_dev */
};
118
/* presumably serializes access to vfs_miplist/vfs_miplist_end -- TODO confirm */
static kmutex_t		vfs_miplist_mutex;
/* list of struct ipmnt nodes; presumably head and tail pointers -- confirm */
static struct ipmnt	*vfs_miplist = NULL;
static struct ipmnt	*vfs_miplist_end = NULL;

static kmem_cache_t *vfs_cache;	/* Pointer to VFS kmem cache */
124
/*
 * VFS global data.
 */
vnode_t *rootdir;		/* pointer to root inode vnode. */
vnode_t *devicesdir;		/* pointer to inode of devices root */
vnode_t	*devdir;		/* pointer to inode of dev root */

char *server_rootpath;		/* root path for diskless clients */
char *server_hostname;		/* hostname of diskless server */

static struct vfs root;		/* vfs that rootvfs initially points at */
static struct vfs devices;	/* vfs mounted on /devices at boot */
static struct vfs dev;		/* vfs mounted on /dev at boot */
struct vfs *rootvfs = &root;	/* pointer to root vfs; head of VFS list. */
rvfs_t *rvfs_list;		/* array of vfs ptrs for vfs hash list */
int vfshsz = 512;		/* # of heads/locks in vfs hash arrays */
				/* must be power of 2!	*/
timespec_t vfs_mnttab_ctime;	/* mnttab created time */
timespec_t vfs_mnttab_mtime;	/* mnttab last modified time */
char *vfs_dummyfstype = "\0";	/* reads as an empty string */
struct pollhead vfs_pollhd;	/* for mnttab pollers */
struct vnode *vfs_mntdummyvp;	/* to fake mnttab read/write for file events */
int	mntfstype;		/* will be set once mnt fs is mounted */
148
149/*
150 * Table for generic options recognized in the VFS layer and acted
151 * on at this level before parsing file system specific options.
152 * The nosuid option is stronger than any of the devices and setuid
153 * options, so those are canceled when nosuid is seen.
154 *
155 * All options which are added here need to be added to the
156 * list of standard options in usr/src/cmd/fs.d/fslib.c as well.
157 */
/*
 * VFS Mount options table
 *
 * Each <option>_cancel array below is a NULL-terminated list of option
 * names and is used as the "cancel options" column of the matching
 * mntopts[] entry.
 */
static char *ro_cancel[] = { MNTOPT_RW, NULL };
static char *rw_cancel[] = { MNTOPT_RO, NULL };
static char *suid_cancel[] = { MNTOPT_NOSUID, NULL };
/* nosuid overrides every devices/setuid variant as well as suid */
static char *nosuid_cancel[] = { MNTOPT_SUID, MNTOPT_DEVICES, MNTOPT_NODEVICES,
    MNTOPT_NOSETUID, MNTOPT_SETUID, NULL };
static char *devices_cancel[] = { MNTOPT_NODEVICES, NULL };
static char *nodevices_cancel[] = { MNTOPT_DEVICES, NULL };
static char *setuid_cancel[] = { MNTOPT_NOSETUID, NULL };
static char *nosetuid_cancel[] = { MNTOPT_SETUID, NULL };
static char *nbmand_cancel[] = { MNTOPT_NONBMAND, NULL };
static char *nonbmand_cancel[] = { MNTOPT_NBMAND, NULL };
static char *exec_cancel[] = { MNTOPT_NOEXEC, NULL };
static char *noexec_cancel[] = { MNTOPT_EXEC, NULL };
174
/*
 * Generic mount options handled by the VFS layer.  Only "remount" carries a
 * flag (MO_NODISPLAY); every other entry has no default argument and no
 * flags.
 */
static const mntopt_t mntopts[] = {
/*
 *	option name		cancel options		default arg	flags
 */
	{ MNTOPT_REMOUNT,	NULL,			NULL,
		MO_NODISPLAY, (void *)0 },
	{ MNTOPT_RO,		ro_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_RW,		rw_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_SUID,		suid_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NOSUID,	nosuid_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_DEVICES,	devices_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NODEVICES,	nodevices_cancel,	NULL,		0,
		(void *)0 },
	{ MNTOPT_SETUID,	setuid_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NOSETUID,	nosetuid_cancel,	NULL,		0,
		(void *)0 },
	{ MNTOPT_NBMAND,	nbmand_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NONBMAND,	nonbmand_cancel,	NULL,		0,
		(void *)0 },
	{ MNTOPT_EXEC,		exec_cancel,		NULL,		0,
		(void *)0 },
	{ MNTOPT_NOEXEC,	noexec_cancel,		NULL,		0,
		(void *)0 },
};
206
/*
 * Exported descriptor for the generic option table: the number of entries
 * in mntopts[] followed by a pointer to its first entry.  The cast drops
 * the const qualifier of mntopts[].
 */
const mntopts_t vfs_mntopts = {
	sizeof (mntopts) / sizeof (mntopt_t),
	(mntopt_t *)&mntopts[0]
};
211
212/*
213 * File system operation dispatch functions.
214 */
215
216int
217fsop_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
218{
219	return (*(vfsp)->vfs_op->vfs_mount)(vfsp, mvp, uap, cr);
220}
221
222int
223fsop_unmount(vfs_t *vfsp, int flag, cred_t *cr)
224{
225	return (*(vfsp)->vfs_op->vfs_unmount)(vfsp, flag, cr);
226}
227
228int
229fsop_root(vfs_t *vfsp, vnode_t **vpp)
230{
231	refstr_t *mntpt;
232	int ret = (*(vfsp)->vfs_op->vfs_root)(vfsp, vpp);
233	/*
234	 * Make sure this root has a path.  With lofs, it is possible to have
235	 * a NULL mountpoint.
236	 */
237	if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) {
238		mntpt = vfs_getmntpoint(vfsp);
239		vn_setpath_str(*vpp, refstr_value(mntpt),
240		    strlen(refstr_value(mntpt)));
241		refstr_rele(mntpt);
242	}
243
244	return (ret);
245}
246
247int
248fsop_statfs(vfs_t *vfsp, statvfs64_t *sp)
249{
250	return (*(vfsp)->vfs_op->vfs_statvfs)(vfsp, sp);
251}
252
253int
254fsop_sync(vfs_t *vfsp, short flag, cred_t *cr)
255{
256	return (*(vfsp)->vfs_op->vfs_sync)(vfsp, flag, cr);
257}
258
259int
260fsop_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
261{
262	/*
263	 * In order to handle system attribute fids in a manner
264	 * transparent to the underlying fs, we embed the fid for
265	 * the sysattr parent object in the sysattr fid and tack on
266	 * some extra bytes that only the sysattr layer knows about.
267	 *
268	 * This guarantees that sysattr fids are larger than other fids
269	 * for this vfs. If the vfs supports the sysattr view interface
270	 * (as indicated by VFSFT_SYSATTR_VIEWS), we cannot have a size
271	 * collision with XATTR_FIDSZ.
272	 */
273	if (vfs_has_feature(vfsp, VFSFT_SYSATTR_VIEWS) &&
274	    fidp->fid_len == XATTR_FIDSZ)
275		return (xattr_dir_vget(vfsp, vpp, fidp));
276
277	return (*(vfsp)->vfs_op->vfs_vget)(vfsp, vpp, fidp);
278}
279
280int
281fsop_mountroot(vfs_t *vfsp, enum whymountroot reason)
282{
283	return (*(vfsp)->vfs_op->vfs_mountroot)(vfsp, reason);
284}
285
286void
287fsop_freefs(vfs_t *vfsp)
288{
289	(*(vfsp)->vfs_op->vfs_freevfs)(vfsp);
290}
291
292int
293fsop_vnstate(vfs_t *vfsp, vnode_t *vp, vntrans_t nstate)
294{
295	return ((*(vfsp)->vfs_op->vfs_vnstate)(vfsp, vp, nstate));
296}
297
298int
299fsop_sync_by_kind(int fstype, short flag, cred_t *cr)
300{
301	ASSERT((fstype >= 0) && (fstype < nfstype));
302
303	if (ALLOCATED_VFSSW(&vfssw[fstype]) && VFS_INSTALLED(&vfssw[fstype]))
304		return (*vfssw[fstype].vsw_vfsops.vfs_sync) (NULL, flag, cr);
305	else
306		return (ENOTSUP);
307}
308
309/*
310 * File system initialization.  vfs_setfsops() must be called from a file
311 * system's init routine.
312 */
313
314static int
315fs_copyfsops(const fs_operation_def_t *template, vfsops_t *actual,
316    int *unused_ops)
317{
318	static const fs_operation_trans_def_t vfs_ops_table[] = {
319		VFSNAME_MOUNT, offsetof(vfsops_t, vfs_mount),
320			fs_nosys, fs_nosys,
321
322		VFSNAME_UNMOUNT, offsetof(vfsops_t, vfs_unmount),
323			fs_nosys, fs_nosys,
324
325		VFSNAME_ROOT, offsetof(vfsops_t, vfs_root),
326			fs_nosys, fs_nosys,
327
328		VFSNAME_STATVFS, offsetof(vfsops_t, vfs_statvfs),
329			fs_nosys, fs_nosys,
330
331		VFSNAME_SYNC, offsetof(vfsops_t, vfs_sync),
332			(fs_generic_func_p) fs_sync,
333			(fs_generic_func_p) fs_sync,	/* No errors allowed */
334
335		VFSNAME_VGET, offsetof(vfsops_t, vfs_vget),
336			fs_nosys, fs_nosys,
337
338		VFSNAME_MOUNTROOT, offsetof(vfsops_t, vfs_mountroot),
339			fs_nosys, fs_nosys,
340
341		VFSNAME_FREEVFS, offsetof(vfsops_t, vfs_freevfs),
342			(fs_generic_func_p)fs_freevfs,
343			(fs_generic_func_p)fs_freevfs,	/* Shouldn't fail */
344
345		VFSNAME_VNSTATE, offsetof(vfsops_t, vfs_vnstate),
346			(fs_generic_func_p)fs_nosys,
347			(fs_generic_func_p)fs_nosys,
348
349		NULL, 0, NULL, NULL
350	};
351
352	return (fs_build_vector(actual, unused_ops, vfs_ops_table, template));
353}
354
355void
356zfs_boot_init() {
357
358	if (strcmp(rootfs.bo_fstype, MNTTYPE_ZFS) == 0)
359		spa_boot_init();
360}
361
362int
363vfs_setfsops(int fstype, const fs_operation_def_t *template, vfsops_t **actual)
364{
365	int error;
366	int unused_ops;
367
368	/*
369	 * Verify that fstype refers to a valid fs.  Note that
370	 * 0 is valid since it's used to set "stray" ops.
371	 */
372	if ((fstype < 0) || (fstype >= nfstype))
373		return (EINVAL);
374
375	if (!ALLOCATED_VFSSW(&vfssw[fstype]))
376		return (EINVAL);
377
378	/* Set up the operations vector. */
379
380	error = fs_copyfsops(template, &vfssw[fstype].vsw_vfsops, &unused_ops);
381
382	if (error != 0)
383		return (error);
384
385	vfssw[fstype].vsw_flag |= VSW_INSTALLED;
386
387	if (actual != NULL)
388		*actual = &vfssw[fstype].vsw_vfsops;
389
390#if DEBUG
391	if (unused_ops != 0)
392		cmn_err(CE_WARN, "vfs_setfsops: %s: %d operations supplied "
393		    "but not used", vfssw[fstype].vsw_name, unused_ops);
394#endif
395
396	return (0);
397}
398
399int
400vfs_makefsops(const fs_operation_def_t *template, vfsops_t **actual)
401{
402	int error;
403	int unused_ops;
404
405	*actual = (vfsops_t *)kmem_alloc(sizeof (vfsops_t), KM_SLEEP);
406
407	error = fs_copyfsops(template, *actual, &unused_ops);
408	if (error != 0) {
409		kmem_free(*actual, sizeof (vfsops_t));
410		*actual = NULL;
411		return (error);
412	}
413
414	return (0);
415}
416
417/*
418 * Free a vfsops structure created as a result of vfs_makefsops().
419 * NOTE: For a vfsops structure initialized by vfs_setfsops(), use
420 * vfs_freevfsops_by_type().
421 */
422void
423vfs_freevfsops(vfsops_t *vfsops)
424{
425	kmem_free(vfsops, sizeof (vfsops_t));
426}
427
428/*
429 * Since the vfsops structure is part of the vfssw table and wasn't
430 * really allocated, we're not really freeing anything.  We keep
431 * the name for consistency with vfs_freevfsops().  We do, however,
432 * need to take care of a little bookkeeping.
433 * NOTE: For a vfsops structure created by vfs_setfsops(), use
434 * vfs_freevfsops_by_type().
435 */
436int
437vfs_freevfsops_by_type(int fstype)
438{
439
440	/* Verify that fstype refers to a loaded fs (and not fsid 0). */
441	if ((fstype <= 0) || (fstype >= nfstype))
442		return (EINVAL);
443
444	WLOCK_VFSSW();
445	if ((vfssw[fstype].vsw_flag & VSW_INSTALLED) == 0) {
446		WUNLOCK_VFSSW();
447		return (EINVAL);
448	}
449
450	vfssw[fstype].vsw_flag &= ~VSW_INSTALLED;
451	WUNLOCK_VFSSW();
452
453	return (0);
454}
455
456/* Support routines used to reference vfs_op */
457
/*
 * Set the operations vector for a vfs.
 *
 * The current vfs_op is sampled first.  If no fem head is attached
 * (vfs_femhead is NULL) and casptr() reports that vfs_op still held the
 * sampled value when it stored vfsops, the update is complete.  In every
 * other case the update is delegated to fsem_setvfsops().
 */
void
vfs_setops(vfs_t *vfsp, vfsops_t *vfsops)
{
	vfsops_t	*op;

	ASSERT(vfsp != NULL);
	ASSERT(vfsops != NULL);

	/* Sample vfs_op before looking at vfs_femhead. */
	op = vfsp->vfs_op;
	membar_consumer();
	if (vfsp->vfs_femhead == NULL &&
	    casptr(&vfsp->vfs_op, op, vfsops) == op) {
		return;
	}
	/* A fem head exists or vfs_op changed underneath us. */
	fsem_setvfsops(vfsp, vfsops);
}
475
476/* Retrieve the operations vector for a vfs */
477vfsops_t *
478vfs_getops(vfs_t *vfsp)
479{
480	vfsops_t	*op;
481
482	ASSERT(vfsp != NULL);
483
484	op = vfsp->vfs_op;
485	membar_consumer();
486	if (vfsp->vfs_femhead == NULL && op == vfsp->vfs_op) {
487		return (op);
488	} else {
489		return (fsem_getvfsops(vfsp));
490	}
491}
492
493/*
494 * Returns non-zero (1) if the vfsops matches that of the vfs.
495 * Returns zero (0) if not.
496 */
497int
498vfs_matchops(vfs_t *vfsp, vfsops_t *vfsops)
499{
500	return (vfs_getops(vfsp) == vfsops);
501}
502
503/*
504 * Returns non-zero (1) if the file system has installed a non-default,
505 * non-error vfs_sync routine.  Returns zero (0) otherwise.
506 */
507int
508vfs_can_sync(vfs_t *vfsp)
509{
510	/* vfs_sync() routine is not the default/error function */
511	return (vfs_getops(vfsp)->vfs_sync != fs_sync);
512}
513
514/*
515 * Initialize a vfs structure.
516 */
517void
518vfs_init(vfs_t *vfsp, vfsops_t *op, void *data)
519{
520	/* Other initialization has been moved to vfs_alloc() */
521	vfsp->vfs_count = 0;
522	vfsp->vfs_next = vfsp;
523	vfsp->vfs_prev = vfsp;
524	vfsp->vfs_zone_next = vfsp;
525	vfsp->vfs_zone_prev = vfsp;
526	vfsp->vfs_lofi_minor = 0;
527	sema_init(&vfsp->vfs_reflock, 1, NULL, SEMA_DEFAULT, NULL);
528	vfsimpl_setup(vfsp);
529	vfsp->vfs_data = (data);
530	vfs_setops((vfsp), (op));
531}
532
533/*
534 * Allocate and initialize the vfs implementation private data
535 * structure, vfs_impl_t.
536 */
537void
538vfsimpl_setup(vfs_t *vfsp)
539{
540	int i;
541
542	if (vfsp->vfs_implp != NULL) {
543		return;
544	}
545
546	vfsp->vfs_implp = kmem_alloc(sizeof (vfs_impl_t), KM_SLEEP);
547	/* Note that these are #define'd in vfs.h */
548	vfsp->vfs_vskap = NULL;
549	vfsp->vfs_fstypevsp = NULL;
550
551	/* Set size of counted array, then zero the array */
552	vfsp->vfs_featureset[0] = VFS_FEATURE_MAXSZ - 1;
553	for (i = 1; i <  VFS_FEATURE_MAXSZ; i++) {
554		vfsp->vfs_featureset[i] = 0;
555	}
556}
557
558/*
559 * Release the vfs_impl_t structure, if it exists. Some unbundled
560 * filesystems may not use the newer version of vfs and thus
561 * would not contain this implementation private data structure.
562 */
563void
564vfsimpl_teardown(vfs_t *vfsp)
565{
566	vfs_impl_t	*vip = vfsp->vfs_implp;
567
568	if (vip == NULL)
569		return;
570
571	kmem_free(vfsp->vfs_implp, sizeof (vfs_impl_t));
572	vfsp->vfs_implp = NULL;
573}
574
575/*
576 * VFS system calls: mount, umount, syssync, statfs, fstatfs, statvfs,
577 * fstatvfs, and sysfs moved to common/syscall.
578 */
579
580/*
581 * Update every mounted file system.  We call the vfs_sync operation of
582 * each file system type, passing it a NULL vfsp to indicate that all
583 * mounted file systems of that type should be updated.
584 */
585void
586vfs_sync(int flag)
587{
588	struct vfssw *vswp;
589	RLOCK_VFSSW();
590	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
591		if (ALLOCATED_VFSSW(vswp) && VFS_INSTALLED(vswp)) {
592			vfs_refvfssw(vswp);
593			RUNLOCK_VFSSW();
594			(void) (*vswp->vsw_vfsops.vfs_sync)(NULL, flag,
595			    CRED());
596			vfs_unrefvfssw(vswp);
597			RLOCK_VFSSW();
598		}
599	}
600	RUNLOCK_VFSSW();
601}
602
/*
 * Update all mounted file systems by calling vfs_sync() with no flags.
 */
void
sync(void)
{
	const int no_flags = 0;

	vfs_sync(no_flags);
}
608
609/*
610 * External routines.
611 */
612
/* lock accesses to vfssw; initialized in vfs_mountroot() */
krwlock_t vfssw_lock;	/* lock accesses to vfssw */

/*
 * Lock for accessing the vfs linked list.  Initialized in vfs_mountroot(),
 * but otherwise should be accessed only via vfs_list_lock() and
 * vfs_list_unlock().  Also used to protect the timestamp for mods to the list.
 */
static krwlock_t vfslist;
621
/*
 * Mount devfs on /devices. This is done right after root is mounted
 * to provide device access support for the system
 *
 * Failure to load the module, find the mount point, mount the file system
 * or obtain its root is fatal (panic).  Failure to take either lock before
 * adding the vfs to the list only logs a notice and returns.
 */
static void
vfs_mountdevices(void)
{
	struct vfssw *vsw;
	struct vnode *mvp;
	struct mounta mounta = {	/* fake mounta for devfs_mount() */
		NULL,
		NULL,
		MS_SYSSPACE,
		NULL,
		NULL,
		0,
		NULL,
		0
	};

	/*
	 * _init devfs module to fill in the vfssw
	 */
	if (modload("fs", "devfs") == -1)
		panic("Cannot _init devfs module");

	/*
	 * Hold vfs
	 *
	 * NOTE(review): vsw is dereferenced without a NULL check; presumably
	 * the successful modload() above guarantees the entry -- confirm.
	 */
	RLOCK_VFSSW();
	vsw = vfs_getvfsswbyname("devfs");
	VFS_INIT(&devices, &vsw->vsw_vfsops, NULL);
	VFS_HOLD(&devices);

	/*
	 * Locate mount point
	 */
	if (lookupname("/devices", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
		panic("Cannot find /devices");

	/*
	 * Perform the mount of /devices
	 */
	if (VFS_MOUNT(&devices, mvp, &mounta, CRED()))
		panic("Cannot mount /devices");

	/* The vfssw read lock taken above is held until here. */
	RUNLOCK_VFSSW();

	/*
	 * Set appropriate members and add to vfs list for mnttab display
	 */
	vfs_setresource(&devices, "/devices");
	vfs_setmntpoint(&devices, "/devices");

	/*
	 * Hold the root of /devices so it won't go away
	 */
	if (VFS_ROOT(&devices, &devicesdir))
		panic("vfs_mountdevices: not devices root");

	if (vfs_lock(&devices) != 0) {
		VN_RELE(devicesdir);
		cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /devices");
		return;
	}

	if (vn_vfswlock(mvp) != 0) {
		vfs_unlock(&devices);
		VN_RELE(devicesdir);
		cmn_err(CE_NOTE, "Cannot acquire vfswlock of /devices");
		return;
	}

	/* Both locks are held across vfs_add() and dropped right after. */
	vfs_add(mvp, &devices, 0);
	vn_vfsunlock(mvp);
	vfs_unlock(&devices);
	VN_RELE(devicesdir);
}
700
/*
 * mount the first instance of /dev  to root and remain mounted
 *
 * Mirrors vfs_mountdevices(), but loads the "dev" module, mounts with
 * MS_OVERLAY in addition to MS_SYSSPACE, and reports fatal errors through
 * cmn_err(CE_PANIC, ...).  Failure to take either lock before adding the
 * vfs to the list only logs a notice and returns.
 */
static void
vfs_mountdev1(void)
{
	struct vfssw *vsw;
	struct vnode *mvp;
	struct mounta mounta = {	/* fake mounta for sdev_mount() */
		NULL,
		NULL,
		MS_SYSSPACE | MS_OVERLAY,
		NULL,
		NULL,
		0,
		NULL,
		0
	};

	/*
	 * _init dev module to fill in the vfssw
	 */
	if (modload("fs", "dev") == -1)
		cmn_err(CE_PANIC, "Cannot _init dev module\n");

	/*
	 * Hold vfs
	 *
	 * NOTE(review): vsw is dereferenced without a NULL check; presumably
	 * the successful modload() above guarantees the entry -- confirm.
	 */
	RLOCK_VFSSW();
	vsw = vfs_getvfsswbyname("dev");
	VFS_INIT(&dev, &vsw->vsw_vfsops, NULL);
	VFS_HOLD(&dev);

	/*
	 * Locate mount point
	 */
	if (lookupname("/dev", UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp))
		cmn_err(CE_PANIC, "Cannot find /dev\n");

	/*
	 * Perform the mount of /dev
	 */
	if (VFS_MOUNT(&dev, mvp, &mounta, CRED()))
		cmn_err(CE_PANIC, "Cannot mount /dev 1\n");

	/* The vfssw read lock taken above is held until here. */
	RUNLOCK_VFSSW();

	/*
	 * Set appropriate members and add to vfs list for mnttab display
	 */
	vfs_setresource(&dev, "/dev");
	vfs_setmntpoint(&dev, "/dev");

	/*
	 * Hold the root of /dev so it won't go away
	 */
	if (VFS_ROOT(&dev, &devdir))
		cmn_err(CE_PANIC, "vfs_mountdev1: not dev root");

	if (vfs_lock(&dev) != 0) {
		VN_RELE(devdir);
		cmn_err(CE_NOTE, "Cannot acquire vfs_lock of /dev");
		return;
	}

	if (vn_vfswlock(mvp) != 0) {
		vfs_unlock(&dev);
		VN_RELE(devdir);
		cmn_err(CE_NOTE, "Cannot acquire vfswlock of /dev");
		return;
	}

	/* Both locks are held across vfs_add() and dropped right after. */
	vfs_add(mvp, &dev, 0);
	vn_vfsunlock(mvp);
	vfs_unlock(&dev);
	VN_RELE(devdir);
}
778
779/*
780 * Mount required filesystem. This is done right after root is mounted.
781 */
782static void
783vfs_mountfs(char *module, char *spec, char *path)
784{
785	struct vnode *mvp;
786	struct mounta mounta;
787	vfs_t *vfsp;
788
789	mounta.flags = MS_SYSSPACE | MS_DATA;
790	mounta.fstype = module;
791	mounta.spec = spec;
792	mounta.dir = path;
793	if (lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &mvp)) {
794		cmn_err(CE_WARN, "Cannot find %s", path);
795		return;
796	}
797	if (domount(NULL, &mounta, mvp, CRED(), &vfsp))
798		cmn_err(CE_WARN, "Cannot mount %s", path);
799	else
800		VFS_RELE(vfsp);
801	VN_RELE(mvp);
802}
803
/*
 * vfs_mountroot is called by main() to mount the root filesystem.
 *
 * Besides mounting root it initializes the vfssw and vfs list locks,
 * allocates the vfs hash array, publishes rootdir, mounts the standard
 * pseudo file systems, and finally looks up the root device under
 * /devices.  Failure to mount root or obtain its vnode is fatal.
 */
void
vfs_mountroot(void)
{
	struct vnode	*rvp = NULL;
	char		*path;
	size_t		plen;
	struct vfssw	*vswp;

	rw_init(&vfssw_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&vfslist, NULL, RW_DEFAULT, NULL);

	/*
	 * Alloc the vfs hash bucket array and locks
	 */
	rvfs_list = kmem_zalloc(vfshsz * sizeof (rvfs_t), KM_SLEEP);

	/*
	 * Call machine-dependent routine "rootconf" to choose a root
	 * file system type.
	 */
	if (rootconf())
		panic("vfs_mountroot: cannot mount root");
	/*
	 * Get vnode for '/'.  Set up rootdir, u.u_rdir and u.u_cdir
	 * to point to it.  These are used by lookuppn() so that it
	 * knows where to start from ('/' or '.').
	 */
	vfs_setmntpoint(rootvfs, "/");
	if (VFS_ROOT(rootvfs, &rootdir))
		panic("vfs_mountroot: no root vnode");
	PTOU(curproc)->u_cdir = rootdir;
	VN_HOLD(PTOU(curproc)->u_cdir);
	PTOU(curproc)->u_rdir = NULL;

	/*
	 * Setup the global zone's rootvp, now that it exists.
	 */
	global_zone->zone_rootvp = rootdir;
	VN_HOLD(global_zone->zone_rootvp);

	/*
	 * Notify the module code that it can begin using the
	 * root filesystem instead of the boot program's services.
	 */
	modrootloaded = 1;

	/*
	 * Special handling for a ZFS root file system.
	 */
	zfs_boot_init();

	/*
	 * Set up mnttab information for root
	 */
	vfs_setresource(rootvfs, rootfs.bo_name);

	/*
	 * Notify cluster software that the root filesystem is available.
	 */
	clboot_mountroot();

	/* Now that we're all done with the root FS, set up its vopstats */
	if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) != NULL) {
		/* Set flag for statistics collection */
		if (vswp->vsw_flag & VSW_STATS) {
			initialize_vopstats(&rootvfs->vfs_vopstats);
			rootvfs->vfs_flag |= VFS_STATS;
			rootvfs->vfs_fstypevsp =
			    get_fstype_vopstats(rootvfs, vswp);
			rootvfs->vfs_vskap = get_vskstat_anchor(rootvfs);
		}
		vfs_unrefvfssw(vswp);
	}

	/*
	 * Mount /devices, /dev instance 1, /system/contract, /etc/mnttab,
	 * /etc/svc/volatile, /etc/dfs/sharetab, /system/object, and /proc.
	 */
	vfs_mountdevices();
	vfs_mountdev1();

	vfs_mountfs("ctfs", "ctfs", CTFS_ROOT);
	vfs_mountfs("proc", "/proc", "/proc");
	vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab");
	vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile");
	vfs_mountfs("objfs", "objfs", OBJFS_ROOT);

	/* sharefs is mounted only when running in the global zone */
	if (getzoneid() == GLOBAL_ZONEID) {
		vfs_mountfs("sharefs", "sharefs", "/etc/dfs/sharetab");
	}

#ifdef __sparc
	/*
	 * This bit of magic can go away when we convert sparc to
	 * the new boot architecture based on ramdisk.
	 *
	 * Booting off a mirrored root volume:
	 * At this point, we have booted and mounted root on a
	 * single component of the mirror.  Complete the boot
	 * by configuring SVM and converting the root to the
	 * dev_t of the mirrored root device.  This dev_t conversion
	 * only works because the underlying device doesn't change.
	 */
	if (root_is_svm) {
		if (svm_rootconf()) {
			panic("vfs_mountroot: cannot remount root");
		}

		/*
		 * mnttab should reflect the new root device
		 */
		vfs_lock_wait(rootvfs);
		vfs_setresource(rootvfs, rootfs.bo_name);
		vfs_unlock(rootvfs);
	}
#endif /* __sparc */

	/*
	 * Look up the root device via devfs so that a dv_node is
	 * created for it. The vnode is never VN_RELE()ed.
	 * We allocate more than MAXPATHLEN so that the
	 * buffer passed to i_ddi_prompath_to_devfspath() is
	 * exactly MAXPATHLEN (the function expects a buffer
	 * of that length).
	 */
	plen = strlen("/devices");
	path = kmem_alloc(plen + MAXPATHLEN, KM_SLEEP);
	(void) strcpy(path, "/devices");

	/* The translated path is written right after the "/devices" prefix. */
	if (i_ddi_prompath_to_devfspath(rootfs.bo_name, path + plen)
	    != DDI_SUCCESS ||
	    lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, &rvp)) {

		/* NUL terminate in case "path" has garbage */
		path[plen + MAXPATHLEN - 1] = '\0';
#ifdef	DEBUG
		cmn_err(CE_WARN, "!Cannot lookup root device: %s", path);
#endif
	}
	kmem_free(path, plen + MAXPATHLEN);
	vfs_mnttabvp_setup();
}
949
950/*
951 * If remount failed and we're in a zone we need to check for the zone
952 * root path and strip it before the call to vfs_setpath().
953 *
954 * If strpath doesn't begin with the zone_rootpath the original
955 * strpath is returned unchanged.
956 */
957static const char *
958stripzonepath(const char *strpath)
959{
960	char *str1, *str2;
961	int i;
962	zone_t *zonep = curproc->p_zone;
963
964	if (zonep->zone_rootpath == NULL || strpath == NULL) {
965		return (NULL);
966	}
967
968	/*
969	 * we check for the end of the string at one past the
970	 * current position because the zone_rootpath always
971	 * ends with "/" but we don't want to strip that off.
972	 */
973	str1 = zonep->zone_rootpath;
974	str2 = (char *)strpath;
975	ASSERT(str1[0] != '\0');
976	for (i = 0; str1[i + 1] != '\0'; i++) {
977		if (str1[i] != str2[i])
978			return ((char *)strpath);
979	}
980	return (&str2[i]);
981}
982
/*
 * Check to see if our "block device" is actually a file.  If so,
 * automatically add a lofi device, and keep track of this fact.
 *
 * Returns 0 without doing anything when fsname is NULL or unknown, the
 * file system type lacks VSW_CANLOFI, the spec path cannot be fetched or
 * looked up, or the spec is not a regular file.  Returns EINVAL for a
 * regular-file spec combined with MS_REMOUNT/MS_GLOBAL or with the suid,
 * setuid or devices option.  Otherwise returns the result of opening the
 * lofi control node and issuing LOFI_MAP_FILE; on success the minor
 * returned by the ioctl is stored in vfsp->vfs_lofi_minor.
 */
static int
lofi_add(const char *fsname, struct vfs *vfsp,
    mntopts_t *mntopts, struct mounta *uap)
{
	int fromspace = (uap->flags & MS_SYSSPACE) ?
	    UIO_SYSSPACE : UIO_USERSPACE;
	struct lofi_ioctl *li = NULL;
	struct vnode *vp = NULL;
	struct pathname	pn = { NULL };
	ldi_ident_t ldi_id;
	ldi_handle_t ldi_hdl;
	vfssw_t *vfssw;	/* NOTE: shadows the global vfssw table */
	int minor;
	int err = 0;

	if (fsname == NULL ||
	    (vfssw = vfs_getvfssw(fsname)) == NULL)
		return (0);

	if (!(vfssw->vsw_flag & VSW_CANLOFI)) {
		vfs_unrefvfssw(vfssw);
		return (0);
	}

	/* Only the flag was needed; drop the vfssw reference right away. */
	vfs_unrefvfssw(vfssw);
	vfssw = NULL;

	if (pn_get(uap->spec, fromspace, &pn) != 0)
		return (0);

	/* err is still 0 on these two exits: not a lofi candidate. */
	if (lookupname(uap->spec, fromspace, FOLLOW, NULL, &vp) != 0)
		goto out;

	if (vp->v_type != VREG)
		goto out;

	/* OK, this is a lofi mount. */

	if ((uap->flags & (MS_REMOUNT|MS_GLOBAL)) ||
	    vfs_optionisset_nolock(mntopts, MNTOPT_SUID, NULL) ||
	    vfs_optionisset_nolock(mntopts, MNTOPT_SETUID, NULL) ||
	    vfs_optionisset_nolock(mntopts, MNTOPT_DEVICES, NULL)) {
		err = EINVAL;
		goto out;
	}

	ldi_id = ldi_ident_from_anon();
	li = kmem_zalloc(sizeof (*li), KM_SLEEP);
	(void) strlcpy(li->li_filename, pn.pn_path, MAXPATHLEN);

	/*
	 * The lofi control node is currently exclusive-open.  We'd like
	 * to improve this, but in the meantime, we'll loop waiting for
	 * access.
	 */
	for (;;) {
		err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE | FEXCL,
		    kcred, &ldi_hdl, ldi_id);

		if (err != EBUSY)
			break;

		/* Wait hz/8 ticks between attempts; give up on EINTR. */
		if ((err = delay_sig(hz / 8)) == EINTR)
			break;
	}

	if (err)
		goto out2;

	err = ldi_ioctl(ldi_hdl, LOFI_MAP_FILE, (intptr_t)li,
	    FREAD | FWRITE | FEXCL | FKIOCTL, kcred, &minor);

	(void) ldi_close(ldi_hdl, FREAD | FWRITE | FEXCL, kcred);

	if (!err)
		vfsp->vfs_lofi_minor = minor;

out2:
	/* Reached only after ldi_id was obtained. */
	ldi_ident_release(ldi_id);
out:
	if (li != NULL)
		kmem_free(li, sizeof (*li));
	if (vp != NULL)
		VN_RELE(vp);
	pn_free(&pn);
	return (err);
}
1074
1075static void
1076lofi_remove(struct vfs *vfsp)
1077{
1078	struct lofi_ioctl *li = NULL;
1079	ldi_ident_t ldi_id;
1080	ldi_handle_t ldi_hdl;
1081	int err;
1082
1083	if (vfsp->vfs_lofi_minor == 0)
1084		return;
1085
1086	ldi_id = ldi_ident_from_anon();
1087
1088	li = kmem_zalloc(sizeof (*li), KM_SLEEP);
1089	li->li_minor = vfsp->vfs_lofi_minor;
1090	li->li_cleanup = B_TRUE;
1091
1092	do {
1093		err = ldi_open_by_name("/dev/lofictl", FREAD | FWRITE | FEXCL,
1094		    kcred, &ldi_hdl, ldi_id);
1095	} while (err == EBUSY);
1096
1097	if (err)
1098		goto out;
1099
1100	err = ldi_ioctl(ldi_hdl, LOFI_UNMAP_FILE_MINOR, (intptr_t)li,
1101	    FREAD | FWRITE | FEXCL | FKIOCTL, kcred, NULL);
1102
1103	(void) ldi_close(ldi_hdl, FREAD | FWRITE | FEXCL, kcred);
1104
1105	if (!err)
1106		vfsp->vfs_lofi_minor = 0;
1107
1108out:
1109	ldi_ident_release(ldi_id);
1110	if (li != NULL)
1111		kmem_free(li, sizeof (*li));
1112}
1113
1114/*
1115 * Common mount code.  Called from the system call entry point, from autofs,
1116 * nfsv4 trigger mounts, and from pxfs.
1117 *
1118 * Takes the effective file system type, mount arguments, the mount point
1119 * vnode, flags specifying whether the mount is a remount and whether it
1120 * should be entered into the vfs list, and credentials.  Fills in its vfspp
1121 * parameter with the mounted file system instance's vfs.
1122 *
1123 * Note that the effective file system type is specified as a string.  It may
1124 * be null, in which case it's determined from the mount arguments, and may
1125 * differ from the type specified in the mount arguments; this is a hook to
1126 * allow interposition when instantiating file system instances.
1127 *
1128 * The caller is responsible for releasing its own hold on the mount point
1129 * vp (this routine does its own hold when necessary).
1130 * Also note that for remounts, the mount point vp should be the vnode for
1131 * the root of the file system rather than the vnode that the file system
1132 * is mounted on top of.
1133 */
1134int
1135domount(char *fsname, struct mounta *uap, vnode_t *vp, struct cred *credp,
1136	struct vfs **vfspp)
1137{
1138	struct vfssw	*vswp;
1139	vfsops_t	*vfsops;
1140	struct vfs	*vfsp;
1141	struct vnode	*bvp;
1142	dev_t		bdev = 0;
1143	mntopts_t	mnt_mntopts;
1144	int		error = 0;
1145	int		copyout_error = 0;
1146	int		ovflags;
1147	char		*opts = uap->optptr;
1148	char		*inargs = opts;
1149	int		optlen = uap->optlen;
1150	int		remount;
1151	int		rdonly;
1152	int		nbmand = 0;
1153	int		delmip = 0;
1154	int		addmip = 0;
1155	int		splice = ((uap->flags & MS_NOSPLICE) == 0);
1156	int		fromspace = (uap->flags & MS_SYSSPACE) ?
1157	    UIO_SYSSPACE : UIO_USERSPACE;
1158	char		*resource = NULL, *mountpt = NULL;
1159	refstr_t	*oldresource, *oldmntpt;
1160	struct pathname	pn, rpn;
1161	vsk_anchor_t	*vskap;
1162	char fstname[FSTYPSZ];
1163
1164	/*
1165	 * The v_flag value for the mount point vp is permanently set
1166	 * to VVFSLOCK so that no one bypasses the vn_vfs*locks routine
1167	 * for mount point locking.
1168	 */
1169	mutex_enter(&vp->v_lock);
1170	vp->v_flag |= VVFSLOCK;
1171	mutex_exit(&vp->v_lock);
1172
1173	mnt_mntopts.mo_count = 0;
1174	/*
1175	 * Find the ops vector to use to invoke the file system-specific mount
1176	 * method.  If the fsname argument is non-NULL, use it directly.
1177	 * Otherwise, dig the file system type information out of the mount
1178	 * arguments.
1179	 *
1180	 * A side effect is to hold the vfssw entry.
1181	 *
1182	 * Mount arguments can be specified in several ways, which are
1183	 * distinguished by flag bit settings.  The preferred way is to set
1184	 * MS_OPTIONSTR, indicating an 8 argument mount with the file system
1185	 * type supplied as a character string and the last two arguments
1186	 * being a pointer to a character buffer and the size of the buffer.
1187	 * On entry, the buffer holds a null terminated list of options; on
1188	 * return, the string is the list of options the file system
1189	 * recognized. If MS_DATA is set arguments five and six point to a
1190	 * block of binary data which the file system interprets.
1191	 * A further wrinkle is that some callers don't set MS_FSS and MS_DATA
1192	 * consistently with these conventions.  To handle them, we check to
1193	 * see whether the pointer to the file system name has a numeric value
1194	 * less than 256.  If so, we treat it as an index.
1195	 */
1196	if (fsname != NULL) {
1197		if ((vswp = vfs_getvfssw(fsname)) == NULL) {
1198			return (EINVAL);
1199		}
1200	} else if (uap->flags & (MS_OPTIONSTR | MS_DATA | MS_FSS)) {
1201		size_t n;
1202		uint_t fstype;
1203
1204		fsname = fstname;
1205
1206		if ((fstype = (uintptr_t)uap->fstype) < 256) {
1207			RLOCK_VFSSW();
1208			if (fstype == 0 || fstype >= nfstype ||
1209			    !ALLOCATED_VFSSW(&vfssw[fstype])) {
1210				RUNLOCK_VFSSW();
1211				return (EINVAL);
1212			}
1213			(void) strcpy(fsname, vfssw[fstype].vsw_name);
1214			RUNLOCK_VFSSW();
1215			if ((vswp = vfs_getvfssw(fsname)) == NULL)
1216				return (EINVAL);
1217		} else {
1218			/*
1219			 * Handle either kernel or user address space.
1220			 */
1221			if (uap->flags & MS_SYSSPACE) {
1222				error = copystr(uap->fstype, fsname,
1223				    FSTYPSZ, &n);
1224			} else {
1225				error = copyinstr(uap->fstype, fsname,
1226				    FSTYPSZ, &n);
1227			}
1228			if (error) {
1229				if (error == ENAMETOOLONG)
1230					return (EINVAL);
1231				return (error);
1232			}
1233			if ((vswp = vfs_getvfssw(fsname)) == NULL)
1234				return (EINVAL);
1235		}
1236	} else {
1237		if ((vswp = vfs_getvfsswbyvfsops(vfs_getops(rootvfs))) == NULL)
1238			return (EINVAL);
1239	}
1240	if (!VFS_INSTALLED(vswp))
1241		return (EINVAL);
1242	vfsops = &vswp->vsw_vfsops;
1243
1244	vfs_copyopttbl(&vswp->vsw_optproto, &mnt_mntopts);
1245	/*
1246	 * Fetch mount options and parse them for generic vfs options
1247	 */
1248	if (uap->flags & MS_OPTIONSTR) {
1249		/*
1250		 * Limit the buffer size
1251		 */
1252		if (optlen < 0 || optlen > MAX_MNTOPT_STR) {
1253			error = EINVAL;
1254			goto errout;
1255		}
1256		if ((uap->flags & MS_SYSSPACE) == 0) {
1257			inargs = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
1258			inargs[0] = '\0';
1259			if (optlen) {
1260				error = copyinstr(opts, inargs, (size_t)optlen,
1261				    NULL);
1262				if (error) {
1263					goto errout;
1264				}
1265			}
1266		}
1267		vfs_parsemntopts(&mnt_mntopts, inargs, 0);
1268	}
1269	/*
1270	 * Flag bits override the options string.
1271	 */
1272	if (uap->flags & MS_REMOUNT)
1273		vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_REMOUNT, NULL, 0, 0);
1274	if (uap->flags & MS_RDONLY)
1275		vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_RO, NULL, 0, 0);
1276	if (uap->flags & MS_NOSUID)
1277		vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1278
1279	/*
1280	 * Check if this is a remount; must be set in the option string and
1281	 * the file system must support a remount option.
1282	 */
1283	if (remount = vfs_optionisset_nolock(&mnt_mntopts,
1284	    MNTOPT_REMOUNT, NULL)) {
1285		if (!(vswp->vsw_flag & VSW_CANREMOUNT)) {
1286			error = ENOTSUP;
1287			goto errout;
1288		}
1289		uap->flags |= MS_REMOUNT;
1290	}
1291
1292	/*
1293	 * uap->flags and vfs_optionisset() should agree.
1294	 */
1295	if (rdonly = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_RO, NULL)) {
1296		uap->flags |= MS_RDONLY;
1297	}
1298	if (vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL)) {
1299		uap->flags |= MS_NOSUID;
1300	}
1301	nbmand = vfs_optionisset_nolock(&mnt_mntopts, MNTOPT_NBMAND, NULL);
1302	ASSERT(splice || !remount);
1303	/*
1304	 * If we are splicing the fs into the namespace,
1305	 * perform mount point checks.
1306	 *
1307	 * We want to resolve the path for the mount point to eliminate
1308	 * '.' and ".." and symlinks in mount points; we can't do the
1309	 * same for the resource string, since it would turn
1310	 * "/dev/dsk/c0t0d0s0" into "/devices/pci@...".  We need to do
1311	 * this before grabbing vn_vfswlock(), because otherwise we
1312	 * would deadlock with lookuppn().
1313	 */
1314	if (splice) {
1315		ASSERT(vp->v_count > 0);
1316
1317		/*
1318		 * Pick up mount point and device from appropriate space.
1319		 */
1320		if (pn_get(uap->spec, fromspace, &pn) == 0) {
1321			resource = kmem_alloc(pn.pn_pathlen + 1,
1322			    KM_SLEEP);
1323			(void) strcpy(resource, pn.pn_path);
1324			pn_free(&pn);
1325		}
1326		/*
1327		 * Do a lookupname prior to taking the
1328		 * writelock. Mark this as completed if
1329		 * successful for later cleanup and addition to
1330		 * the mount in progress table.
1331		 */
1332		if ((uap->flags & MS_GLOBAL) == 0 &&
1333		    lookupname(uap->spec, fromspace,
1334		    FOLLOW, NULL, &bvp) == 0) {
1335			addmip = 1;
1336		}
1337
1338		if ((error = pn_get(uap->dir, fromspace, &pn)) == 0) {
1339			pathname_t *pnp;
1340
1341			if (*pn.pn_path != '/') {
1342				error = EINVAL;
1343				pn_free(&pn);
1344				goto errout;
1345			}
1346			pn_alloc(&rpn);
1347			/*
1348			 * Kludge to prevent autofs from deadlocking with
1349			 * itself when it calls domount().
1350			 *
1351			 * If autofs is calling, it is because it is doing
1352			 * (autofs) mounts in the process of an NFS mount.  A
1353			 * lookuppn() here would cause us to block waiting for
1354			 * said NFS mount to complete, which can't since this
1355			 * is the thread that was supposed to doing it.
1356			 */
1357			if (fromspace == UIO_USERSPACE) {
1358				if ((error = lookuppn(&pn, &rpn, FOLLOW, NULL,
1359				    NULL)) == 0) {
1360					pnp = &rpn;
1361				} else {
1362					/*
1363					 * The file disappeared or otherwise
1364					 * became inaccessible since we opened
1365					 * it; might as well fail the mount
1366					 * since the mount point is no longer
1367					 * accessible.
1368					 */
1369					pn_free(&rpn);
1370					pn_free(&pn);
1371					goto errout;
1372				}
1373			} else {
1374				pnp = &pn;
1375			}
1376			mountpt = kmem_alloc(pnp->pn_pathlen + 1, KM_SLEEP);
1377			(void) strcpy(mountpt, pnp->pn_path);
1378
1379			/*
1380			 * If the addition of the zone's rootpath
1381			 * would push us over a total path length
1382			 * of MAXPATHLEN, we fail the mount with
1383			 * ENAMETOOLONG, which is what we would have
1384			 * gotten if we were trying to perform the same
1385			 * mount in the global zone.
1386			 *
1387			 * strlen() doesn't count the trailing
1388			 * '\0', but zone_rootpathlen counts both a
1389			 * trailing '/' and the terminating '\0'.
1390			 */
1391			if ((curproc->p_zone->zone_rootpathlen - 1 +
1392			    strlen(mountpt)) > MAXPATHLEN ||
1393			    (resource != NULL &&
1394			    (curproc->p_zone->zone_rootpathlen - 1 +
1395			    strlen(resource)) > MAXPATHLEN)) {
1396				error = ENAMETOOLONG;
1397			}
1398
1399			pn_free(&rpn);
1400			pn_free(&pn);
1401		}
1402
1403		if (error)
1404			goto errout;
1405
1406		/*
1407		 * Prevent path name resolution from proceeding past
1408		 * the mount point.
1409		 */
1410		if (vn_vfswlock(vp) != 0) {
1411			error = EBUSY;
1412			goto errout;
1413		}
1414
1415		/*
1416		 * Verify that it's legitimate to establish a mount on
1417		 * the prospective mount point.
1418		 */
1419		if (vn_mountedvfs(vp) != NULL) {
1420			/*
1421			 * The mount point lock was obtained after some
1422			 * other thread raced through and established a mount.
1423			 */
1424			vn_vfsunlock(vp);
1425			error = EBUSY;
1426			goto errout;
1427		}
1428		if (vp->v_flag & VNOMOUNT) {
1429			vn_vfsunlock(vp);
1430			error = EINVAL;
1431			goto errout;
1432		}
1433	}
1434	if ((uap->flags & (MS_DATA | MS_OPTIONSTR)) == 0) {
1435		uap->dataptr = NULL;
1436		uap->datalen = 0;
1437	}
1438
1439	/*
1440	 * If this is a remount, we don't want to create a new VFS.
1441	 * Instead, we pass the existing one with a remount flag.
1442	 */
1443	if (remount) {
1444		/*
1445		 * Confirm that the mount point is the root vnode of the
1446		 * file system that is being remounted.
1447		 * This can happen if the user specifies a different
1448		 * mount point directory pathname in the (re)mount command.
1449		 *
1450		 * Code below can only be reached if splice is true, so it's
1451		 * safe to do vn_vfsunlock() here.
1452		 */
1453		if ((vp->v_flag & VROOT) == 0) {
1454			vn_vfsunlock(vp);
1455			error = ENOENT;
1456			goto errout;
1457		}
1458		/*
1459		 * Disallow making file systems read-only unless file system
1460		 * explicitly allows it in its vfssw.  Ignore other flags.
1461		 */
1462		if (rdonly && vn_is_readonly(vp) == 0 &&
1463		    (vswp->vsw_flag & VSW_CANRWRO) == 0) {
1464			vn_vfsunlock(vp);
1465			error = EINVAL;
1466			goto errout;
1467		}
1468		/*
1469		 * Disallow changing the NBMAND disposition of the file
1470		 * system on remounts.
1471		 */
1472		if ((nbmand && ((vp->v_vfsp->vfs_flag & VFS_NBMAND) == 0)) ||
1473		    (!nbmand && (vp->v_vfsp->vfs_flag & VFS_NBMAND))) {
1474			vn_vfsunlock(vp);
1475			error = EINVAL;
1476			goto errout;
1477		}
1478		vfsp = vp->v_vfsp;
1479		ovflags = vfsp->vfs_flag;
1480		vfsp->vfs_flag |= VFS_REMOUNT;
1481		vfsp->vfs_flag &= ~VFS_RDONLY;
1482	} else {
1483		vfsp = vfs_alloc(KM_SLEEP);
1484		VFS_INIT(vfsp, vfsops, NULL);
1485	}
1486
1487	VFS_HOLD(vfsp);
1488
1489	if ((error = lofi_add(fsname, vfsp, &mnt_mntopts, uap)) != 0) {
1490		if (!remount) {
1491			if (splice)
1492				vn_vfsunlock(vp);
1493			vfs_free(vfsp);
1494		} else {
1495			vn_vfsunlock(vp);
1496			VFS_RELE(vfsp);
1497		}
1498		goto errout;
1499	}
1500
1501	/*
1502	 * PRIV_SYS_MOUNT doesn't mean you can become root.
1503	 */
1504	if (vfsp->vfs_lofi_minor != 0) {
1505		uap->flags |= MS_NOSUID;
1506		vfs_setmntopt_nolock(&mnt_mntopts, MNTOPT_NOSUID, NULL, 0, 0);
1507	}
1508
1509	/*
1510	 * The vfs_reflock is not used anymore the code below explicitly
1511	 * holds it preventing others accesing it directly.
1512	 */
1513	if ((sema_tryp(&vfsp->vfs_reflock) == 0) &&
1514	    !(vfsp->vfs_flag & VFS_REMOUNT))
1515		cmn_err(CE_WARN,
1516		    "mount type %s couldn't get vfs_reflock", vswp->vsw_name);
1517
1518	/*
1519	 * Lock the vfs. If this is a remount we want to avoid spurious umount
1520	 * failures that happen as a side-effect of fsflush() and other mount
1521	 * and unmount operations that might be going on simultaneously and
1522	 * may have locked the vfs currently. To not return EBUSY immediately
1523	 * here we use vfs_lock_wait() instead vfs_lock() for the remount case.
1524	 */
1525	if (!remount) {
1526		if (error = vfs_lock(vfsp)) {
1527			vfsp->vfs_flag = ovflags;
1528
1529			lofi_remove(vfsp);
1530
1531			if (splice)
1532				vn_vfsunlock(vp);
1533			vfs_free(vfsp);
1534			goto errout;
1535		}
1536	} else {
1537		vfs_lock_wait(vfsp);
1538	}
1539
1540	/*
1541	 * Add device to mount in progress table, global mounts require special
1542	 * handling. It is possible that we have already done the lookupname
1543	 * on a spliced, non-global fs. If so, we don't want to do it again
1544	 * since we cannot do a lookupname after taking the
1545	 * wlock above. This case is for a non-spliced, non-global filesystem.
1546	 */
1547	if (!addmip) {
1548		if ((uap->flags & MS_GLOBAL) == 0 &&
1549		    lookupname(uap->spec, fromspace, FOLLOW, NULL, &bvp) == 0) {
1550			addmip = 1;
1551		}
1552	}
1553
1554	if (addmip) {
1555		vnode_t *lvp = NULL;
1556
1557		error = vfs_get_lofi(vfsp, &lvp);
1558		if (error > 0) {
1559			lofi_remove(vfsp);
1560
1561			if (splice)
1562				vn_vfsunlock(vp);
1563			vfs_unlock(vfsp);
1564
1565			if (remount) {
1566				VFS_RELE(vfsp);
1567			} else {
1568				vfs_free(vfsp);
1569			}
1570
1571			goto errout;
1572		} else if (error == -1) {
1573			bdev = bvp->v_rdev;
1574			VN_RELE(bvp);
1575		} else {
1576			bdev = lvp->v_rdev;
1577			VN_RELE(lvp);
1578			VN_RELE(bvp);
1579		}
1580
1581		vfs_addmip(bdev, vfsp);
1582		addmip = 0;
1583		delmip = 1;
1584	}
1585	/*
1586	 * Invalidate cached entry for the mount point.
1587	 */
1588	if (splice)
1589		dnlc_purge_vp(vp);
1590
1591	/*
1592	 * If have an option string but the filesystem doesn't supply a
1593	 * prototype options table, create a table with the global
1594	 * options and sufficient room to accept all the options in the
1595	 * string.  Then parse the passed in option string
1596	 * accepting all the options in the string.  This gives us an
1597	 * option table with all the proper cancel properties for the
1598	 * global options.
1599	 *
1600	 * Filesystems that supply a prototype options table are handled
1601	 * earlier in this function.
1602	 */
1603	if (uap->flags & MS_OPTIONSTR) {
1604		if (!(vswp->vsw_flag & VSW_HASPROTO)) {
1605			mntopts_t tmp_mntopts;
1606
1607			tmp_mntopts.mo_count = 0;
1608			vfs_createopttbl_extend(&tmp_mntopts, inargs,
1609			    &mnt_mntopts);
1610			vfs_parsemntopts(&tmp_mntopts, inargs, 1);
1611			vfs_swapopttbl_nolock(&mnt_mntopts, &tmp_mntopts);
1612			vfs_freeopttbl(&tmp_mntopts);
1613		}
1614	}
1615
1616	/*
1617	 * Serialize with zone creations.
1618	 */
1619	mount_in_progress();
1620	/*
1621	 * Instantiate (or reinstantiate) the file system.  If appropriate,
1622	 * splice it into the file system name space.
1623	 *
1624	 * We want VFS_MOUNT() to be able to override the vfs_resource
1625	 * string if necessary (ie, mntfs), and also for a remount to
1626	 * change the same (necessary when remounting '/' during boot).
1627	 * So we set up vfs_mntpt and vfs_resource to what we think they
1628	 * should be, then hand off control to VFS_MOUNT() which can
1629	 * override this.
1630	 *
1631	 * For safety's sake, when changing vfs_resource or vfs_mntpt of
1632	 * a vfs which is on the vfs list (i.e. during a remount), we must
1633	 * never set those fields to NULL. Several bits of code make
1634	 * assumptions that the fields are always valid.
1635	 */
1636	vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1637	if (remount) {
1638		if ((oldresource = vfsp->vfs_resource) != NULL)
1639			refstr_hold(oldresource);
1640		if ((oldmntpt = vfsp->vfs_mntpt) != NULL)
1641			refstr_hold(oldmntpt);
1642	}
1643	vfs_setresource(vfsp, resource);
1644	vfs_setmntpoint(vfsp, mountpt);
1645
1646	/*
1647	 * going to mount on this vnode, so notify.
1648	 */
1649	vnevent_mountedover(vp, NULL);
1650	error = VFS_MOUNT(vfsp, vp, uap, credp);
1651
1652	if (uap->flags & MS_RDONLY)
1653		vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1654	if (uap->flags & MS_NOSUID)
1655		vfs_setmntopt(vfsp, MNTOPT_NOSUID, NULL, 0);
1656	if (uap->flags & MS_GLOBAL)
1657		vfs_setmntopt(vfsp, MNTOPT_GLOBAL, NULL, 0);
1658
1659	if (error) {
1660		lofi_remove(vfsp);
1661
1662		if (remount) {
1663			/* put back pre-remount options */
1664			vfs_swapopttbl(&mnt_mntopts, &vfsp->vfs_mntopts);
1665			vfs_setmntpoint(vfsp, (stripzonepath(
1666			    refstr_value(oldmntpt))));
1667			if (oldmntpt)
1668				refstr_rele(oldmntpt);
1669			vfs_setresource(vfsp, (stripzonepath(
1670			    refstr_value(oldresource))));
1671			if (oldresource)
1672				refstr_rele(oldresource);
1673			vfsp->vfs_flag = ovflags;
1674			vfs_unlock(vfsp);
1675			VFS_RELE(vfsp);
1676		} else {
1677			vfs_unlock(vfsp);
1678			vfs_freemnttab(vfsp);
1679			vfs_free(vfsp);
1680		}
1681	} else {
1682		/*
1683		 * Set the mount time to now
1684		 */
1685		vfsp->vfs_mtime = ddi_get_time();
1686		if (remount) {
1687			vfsp->vfs_flag &= ~VFS_REMOUNT;
1688			if (oldresource)
1689				refstr_rele(oldresource);
1690			if (oldmntpt)
1691				refstr_rele(oldmntpt);
1692		} else if (splice) {
1693			/*
1694			 * Link vfsp into the name space at the mount
1695			 * point. Vfs_add() is responsible for
1696			 * holding the mount point which will be
1697			 * released when vfs_remove() is called.
1698			 */
1699			vfs_add(vp, vfsp, uap->flags);
1700		} else {
1701			/*
1702			 * Hold the reference to file system which is
1703			 * not linked into the name space.
1704			 */
1705			vfsp->vfs_zone = NULL;
1706			VFS_HOLD(vfsp);
1707			vfsp->vfs_vnodecovered = NULL;
1708		}
1709		/*
1710		 * Set flags for global options encountered
1711		 */
1712		if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
1713			vfsp->vfs_flag |= VFS_RDONLY;
1714		else
1715			vfsp->vfs_flag &= ~VFS_RDONLY;
1716		if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
1717			vfsp->vfs_flag |= (VFS_NOSETUID|VFS_NODEVICES);
1718		} else {
1719			if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
1720				vfsp->vfs_flag |= VFS_NODEVICES;
1721			else
1722				vfsp->vfs_flag &= ~VFS_NODEVICES;
1723			if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
1724				vfsp->vfs_flag |= VFS_NOSETUID;
1725			else
1726				vfsp->vfs_flag &= ~VFS_NOSETUID;
1727		}
1728		if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
1729			vfsp->vfs_flag |= VFS_NBMAND;
1730		else
1731			vfsp->vfs_flag &= ~VFS_NBMAND;
1732
1733		if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
1734			vfsp->vfs_flag |= VFS_XATTR;
1735		else
1736			vfsp->vfs_flag &= ~VFS_XATTR;
1737
1738		if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
1739			vfsp->vfs_flag |= VFS_NOEXEC;
1740		else
1741			vfsp->vfs_flag &= ~VFS_NOEXEC;
1742
1743		/*
1744		 * Now construct the output option string of options
1745		 * we recognized.
1746		 */
1747		if (uap->flags & MS_OPTIONSTR) {
1748			vfs_list_read_lock();
1749			copyout_error = vfs_buildoptionstr(
1750			    &vfsp->vfs_mntopts, inargs, optlen);
1751			vfs_list_unlock();
1752			if (copyout_error == 0 &&
1753			    (uap->flags & MS_SYSSPACE) == 0) {
1754				copyout_error = copyoutstr(inargs, opts,
1755				    optlen, NULL);
1756			}
1757		}
1758
1759		/*
1760		 * If this isn't a remount, set up the vopstats before
1761		 * anyone can touch this. We only allow spliced file
1762		 * systems (file systems which are in the namespace) to
1763		 * have the VFS_STATS flag set.
1764		 * NOTE: PxFS mounts the underlying file system with
1765		 * MS_NOSPLICE set and copies those vfs_flags to its private
1766		 * vfs structure. As a result, PxFS should never have
1767		 * the VFS_STATS flag or else we might access the vfs
1768		 * statistics-related fields prior to them being
1769		 * properly initialized.
1770		 */
1771		if (!remount && (vswp->vsw_flag & VSW_STATS) && splice) {
1772			initialize_vopstats(&vfsp->vfs_vopstats);
1773			/*
1774			 * We need to set vfs_vskap to NULL because there's
1775			 * a chance it won't be set below.  This is checked
1776			 * in teardown_vopstats() so we can't have garbage.
1777			 */
1778			vfsp->vfs_vskap = NULL;
1779			vfsp->vfs_flag |= VFS_STATS;
1780			vfsp->vfs_fstypevsp = get_fstype_vopstats(vfsp, vswp);
1781		}
1782
1783		if (vswp->vsw_flag & VSW_XID)
1784			vfsp->vfs_flag |= VFS_XID;
1785
1786		vfs_unlock(vfsp);
1787	}
1788	mount_completed();
1789	if (splice)
1790		vn_vfsunlock(vp);
1791
1792	if ((error == 0) && (copyout_error == 0)) {
1793		if (!remount) {
1794			/*
1795			 * Don't call get_vskstat_anchor() while holding
1796			 * locks since it allocates memory and calls
1797			 * VFS_STATVFS().  For NFS, the latter can generate
1798			 * an over-the-wire call.
1799			 */
1800			vskap = get_vskstat_anchor(vfsp);
1801			/* Only take the lock if we have something to do */
1802			if (vskap != NULL) {
1803				vfs_lock_wait(vfsp);
1804				if (vfsp->vfs_flag & VFS_STATS) {
1805					vfsp->vfs_vskap = vskap;
1806				}
1807				vfs_unlock(vfsp);
1808			}
1809		}
1810		/* Return vfsp to caller. */
1811		*vfspp = vfsp;
1812	}
1813errout:
1814	vfs_freeopttbl(&mnt_mntopts);
1815	if (resource != NULL)
1816		kmem_free(resource, strlen(resource) + 1);
1817	if (mountpt != NULL)
1818		kmem_free(mountpt, strlen(mountpt) + 1);
1819	/*
1820	 * It is possible we errored prior to adding to mount in progress
1821	 * table. Must free vnode we acquired with successful lookupname.
1822	 */
1823	if (addmip)
1824		VN_RELE(bvp);
1825	if (delmip)
1826		vfs_delmip(vfsp);
1827	ASSERT(vswp != NULL);
1828	vfs_unrefvfssw(vswp);
1829	if (inargs != opts)
1830		kmem_free(inargs, MAX_MNTOPT_STR);
1831	if (copyout_error) {
1832		lofi_remove(vfsp);
1833		VFS_RELE(vfsp);
1834		error = copyout_error;
1835	}
1836	return (error);
1837}
1838
1839static void
1840vfs_setpath(struct vfs *vfsp, refstr_t **refp, const char *newpath)
1841{
1842	size_t len;
1843	refstr_t *ref;
1844	zone_t *zone = curproc->p_zone;
1845	char *sp;
1846	int have_list_lock = 0;
1847
1848	ASSERT(!VFS_ON_LIST(vfsp) || vfs_lock_held(vfsp));
1849
1850	/*
1851	 * New path must be less than MAXPATHLEN because mntfs
1852	 * will only display up to MAXPATHLEN bytes. This is currently
1853	 * safe, because domount() uses pn_get(), and other callers
1854	 * similarly cap the size to fewer than MAXPATHLEN bytes.
1855	 */
1856
1857	ASSERT(strlen(newpath) < MAXPATHLEN);
1858
1859	/* mntfs requires consistency while vfs list lock is held */
1860
1861	if (VFS_ON_LIST(vfsp)) {
1862		have_list_lock = 1;
1863		vfs_list_lock();
1864	}
1865
1866	if (*refp != NULL)
1867		refstr_rele(*refp);
1868
1869	/* Do we need to modify the path? */
1870
1871	if (zone == global_zone || *newpath != '/') {
1872		ref = refstr_alloc(newpath);
1873		goto out;
1874	}
1875
1876	/*
1877	 * Truncate the trailing '/' in the zoneroot, and merge
1878	 * in the zone's rootpath with the "newpath" (resource
1879	 * or mountpoint) passed in.
1880	 *
1881	 * The size of the required buffer is thus the size of
1882	 * the buffer required for the passed-in newpath
1883	 * (strlen(newpath) + 1), plus the size of the buffer
1884	 * required to hold zone_rootpath (zone_rootpathlen)
1885	 * minus one for one of the now-superfluous NUL
1886	 * terminations, minus one for the trailing '/'.
1887	 *
1888	 * That gives us:
1889	 *
1890	 * (strlen(newpath) + 1) + zone_rootpathlen - 1 - 1
1891	 *
1892	 * Which is what we have below.
1893	 */
1894
1895	len = strlen(newpath) + zone->zone_rootpathlen - 1;
1896	sp = kmem_alloc(len, KM_SLEEP);
1897
1898	/*
1899	 * Copy everything including the trailing slash, which
1900	 * we then overwrite with the NUL character.
1901	 */
1902
1903	(void) strcpy(sp, zone->zone_rootpath);
1904	sp[zone->zone_rootpathlen - 2] = '\0';
1905	(void) strcat(sp, newpath);
1906
1907	ref = refstr_alloc(sp);
1908	kmem_free(sp, len);
1909out:
1910	*refp = ref;
1911
1912	if (have_list_lock) {
1913		vfs_mnttab_modtimeupd();
1914		vfs_list_unlock();
1915	}
1916}
1917
1918/*
1919 * Record a mounted resource name in a vfs structure.
1920 * If vfsp is already mounted, caller must hold the vfs lock.
1921 */
1922void
1923vfs_setresource(struct vfs *vfsp, const char *resource)
1924{
1925	if (resource == NULL || resource[0] == '\0')
1926		resource = VFS_NORESOURCE;
1927	vfs_setpath(vfsp, &vfsp->vfs_resource, resource);
1928}
1929
1930/*
1931 * Record a mount point name in a vfs structure.
1932 * If vfsp is already mounted, caller must hold the vfs lock.
1933 */
1934void
1935vfs_setmntpoint(struct vfs *vfsp, const char *mntpt)
1936{
1937	if (mntpt == NULL || mntpt[0] == '\0')
1938		mntpt = VFS_NOMNTPT;
1939	vfs_setpath(vfsp, &vfsp->vfs_mntpt, mntpt);
1940}
1941
1942/* Returns the vfs_resource. Caller must call refstr_rele() when finished. */
1943
1944refstr_t *
1945vfs_getresource(const struct vfs *vfsp)
1946{
1947	refstr_t *resource;
1948
1949	vfs_list_read_lock();
1950	resource = vfsp->vfs_resource;
1951	refstr_hold(resource);
1952	vfs_list_unlock();
1953
1954	return (resource);
1955}
1956
1957/* Returns the vfs_mntpt. Caller must call refstr_rele() when finished. */
1958
1959refstr_t *
1960vfs_getmntpoint(const struct vfs *vfsp)
1961{
1962	refstr_t *mntpt;
1963
1964	vfs_list_read_lock();
1965	mntpt = vfsp->vfs_mntpt;
1966	refstr_hold(mntpt);
1967	vfs_list_unlock();
1968
1969	return (mntpt);
1970}
1971
1972/*
1973 * Create an empty options table with enough empty slots to hold all
1974 * The options in the options string passed as an argument.
1975 * Potentially prepend another options table.
1976 *
1977 * Note: caller is responsible for locking the vfs list, if needed,
1978 *       to protect mops.
1979 */
1980static void
1981vfs_createopttbl_extend(mntopts_t *mops, const char *opts,
1982    const mntopts_t *mtmpl)
1983{
1984	const char *s = opts;
1985	uint_t count;
1986
1987	if (opts == NULL || *opts == '\0') {
1988		count = 0;
1989	} else {
1990		count = 1;
1991
1992		/*
1993		 * Count number of options in the string
1994		 */
1995		for (s = strchr(s, ','); s != NULL; s = strchr(s, ',')) {
1996			count++;
1997			s++;
1998		}
1999	}
2000	vfs_copyopttbl_extend(mtmpl, mops, count);
2001}
2002
2003/*
2004 * Create an empty options table with enough empty slots to hold all
2005 * The options in the options string passed as an argument.
2006 *
2007 * This function is *not* for general use by filesystems.
2008 *
2009 * Note: caller is responsible for locking the vfs list, if needed,
2010 *       to protect mops.
2011 */
2012void
2013vfs_createopttbl(mntopts_t *mops, const char *opts)
2014{
2015	vfs_createopttbl_extend(mops, opts, NULL);
2016}
2017
2018
2019/*
2020 * Swap two mount options tables
2021 */
2022static void
2023vfs_swapopttbl_nolock(mntopts_t *optbl1, mntopts_t *optbl2)
2024{
2025	uint_t tmpcnt;
2026	mntopt_t *tmplist;
2027
2028	tmpcnt = optbl2->mo_count;
2029	tmplist = optbl2->mo_list;
2030	optbl2->mo_count = optbl1->mo_count;
2031	optbl2->mo_list = optbl1->mo_list;
2032	optbl1->mo_count = tmpcnt;
2033	optbl1->mo_list = tmplist;
2034}
2035
2036static void
2037vfs_swapopttbl(mntopts_t *optbl1, mntopts_t *optbl2)
2038{
2039	vfs_list_lock();
2040	vfs_swapopttbl_nolock(optbl1, optbl2);
2041	vfs_mnttab_modtimeupd();
2042	vfs_list_unlock();
2043}
2044
2045static char **
2046vfs_copycancelopt_extend(char **const moc, int extend)
2047{
2048	int i = 0;
2049	int j;
2050	char **result;
2051
2052	if (moc != NULL) {
2053		for (; moc[i] != NULL; i++)
2054			/* count number of options to cancel */;
2055	}
2056
2057	if (i + extend == 0)
2058		return (NULL);
2059
2060	result = kmem_alloc((i + extend + 1) * sizeof (char *), KM_SLEEP);
2061
2062	for (j = 0; j < i; j++) {
2063		result[j] = kmem_alloc(strlen(moc[j]) + 1, KM_SLEEP);
2064		(void) strcpy(result[j], moc[j]);
2065	}
2066	for (; j <= i + extend; j++)
2067		result[j] = NULL;
2068
2069	return (result);
2070}
2071
2072static void
2073vfs_copyopt(const mntopt_t *s, mntopt_t *d)
2074{
2075	char *sp, *dp;
2076
2077	d->mo_flags = s->mo_flags;
2078	d->mo_data = s->mo_data;
2079	sp = s->mo_name;
2080	if (sp != NULL) {
2081		dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
2082		(void) strcpy(dp, sp);
2083		d->mo_name = dp;
2084	} else {
2085		d->mo_name = NULL; /* should never happen */
2086	}
2087
2088	d->mo_cancel = vfs_copycancelopt_extend(s->mo_cancel, 0);
2089
2090	sp = s->mo_arg;
2091	if (sp != NULL) {
2092		dp = kmem_alloc(strlen(sp) + 1, KM_SLEEP);
2093		(void) strcpy(dp, sp);
2094		d->mo_arg = dp;
2095	} else {
2096		d->mo_arg = NULL;
2097	}
2098}
2099
2100/*
2101 * Copy a mount options table, possibly allocating some spare
2102 * slots at the end.  It is permissible to copy_extend the NULL table.
2103 */
2104static void
2105vfs_copyopttbl_extend(const mntopts_t *smo, mntopts_t *dmo, int extra)
2106{
2107	uint_t i, count;
2108	mntopt_t *motbl;
2109
2110	/*
2111	 * Clear out any existing stuff in the options table being initialized
2112	 */
2113	vfs_freeopttbl(dmo);
2114	count = (smo == NULL) ? 0 : smo->mo_count;
2115	if ((count + extra) == 0)	/* nothing to do */
2116		return;
2117	dmo->mo_count = count + extra;
2118	motbl = kmem_zalloc((count + extra) * sizeof (mntopt_t), KM_SLEEP);
2119	dmo->mo_list = motbl;
2120	for (i = 0; i < count; i++) {
2121		vfs_copyopt(&smo->mo_list[i], &motbl[i]);
2122	}
2123	for (i = count; i < count + extra; i++) {
2124		motbl[i].mo_flags = MO_EMPTY;
2125	}
2126}
2127
2128/*
2129 * Copy a mount options table.
2130 *
2131 * This function is *not* for general use by filesystems.
2132 *
2133 * Note: caller is responsible for locking the vfs list, if needed,
2134 *       to protect smo and dmo.
2135 */
2136void
2137vfs_copyopttbl(const mntopts_t *smo, mntopts_t *dmo)
2138{
2139	vfs_copyopttbl_extend(smo, dmo, 0);
2140}
2141
2142static char **
2143vfs_mergecancelopts(const mntopt_t *mop1, const mntopt_t *mop2)
2144{
2145	int c1 = 0;
2146	int c2 = 0;
2147	char **result;
2148	char **sp1, **sp2, **dp;
2149
2150	/*
2151	 * First we count both lists of cancel options.
2152	 * If either is NULL or has no elements, we return a copy of
2153	 * the other.
2154	 */
2155	if (mop1->mo_cancel != NULL) {
2156		for (; mop1->mo_cancel[c1] != NULL; c1++)
2157			/* count cancel options in mop1 */;
2158	}
2159
2160	if (c1 == 0)
2161		return (vfs_copycancelopt_extend(mop2->mo_cancel, 0));
2162
2163	if (mop2->mo_cancel != NULL) {
2164		for (; mop2->mo_cancel[c2] != NULL; c2++)
2165			/* count cancel options in mop2 */;
2166	}
2167
2168	result = vfs_copycancelopt_extend(mop1->mo_cancel, c2);
2169
2170	if (c2 == 0)
2171		return (result);
2172
2173	/*
2174	 * When we get here, we've got two sets of cancel options;
2175	 * we need to merge the two sets.  We know that the result
2176	 * array has "c1+c2+1" entries and in the end we might shrink
2177	 * it.
2178	 * Result now has a copy of the c1 entries from mop1; we'll
2179	 * now lookup all the entries of mop2 in mop1 and copy it if
2180	 * it is unique.
2181	 * This operation is O(n^2) but it's only called once per
2182	 * filesystem per duplicate option.  This is a situation
2183	 * which doesn't arise with the filesystems in ON and
2184	 * n is generally 1.
2185	 */
2186
2187	dp = &result[c1];
2188	for (sp2 = mop2->mo_cancel; *sp2 != NULL; sp2++) {
2189		for (sp1 = mop1->mo_cancel; *sp1 != NULL; sp1++) {
2190			if (strcmp(*sp1, *sp2) == 0)
2191				break;
2192		}
2193		if (*sp1 == NULL) {
2194			/*
2195			 * Option *sp2 not found in mop1, so copy it.
2196			 * The calls to vfs_copycancelopt_extend()
2197			 * guarantee that there's enough room.
2198			 */
2199			*dp = kmem_alloc(strlen(*sp2) + 1, KM_SLEEP);
2200			(void) strcpy(*dp++, *sp2);
2201		}
2202	}
2203	if (dp != &result[c1+c2]) {
2204		size_t bytes = (dp - result + 1) * sizeof (char *);
2205		char **nres = kmem_alloc(bytes, KM_SLEEP);
2206
2207		bcopy(result, nres, bytes);
2208		kmem_free(result, (c1 + c2 + 1) * sizeof (char *));
2209		result = nres;
2210	}
2211	return (result);
2212}
2213
2214/*
2215 * Merge two mount option tables (outer and inner) into one.  This is very
2216 * similar to "merging" global variables and automatic variables in C.
2217 *
2218 * This isn't (and doesn't have to be) fast.
2219 *
2220 * This function is *not* for general use by filesystems.
2221 *
2222 * Note: caller is responsible for locking the vfs list, if needed,
2223 *       to protect omo, imo & dmo.
2224 */
2225void
2226vfs_mergeopttbl(const mntopts_t *omo, const mntopts_t *imo, mntopts_t *dmo)
2227{
2228	uint_t i, count;
2229	mntopt_t *mop, *motbl;
2230	uint_t freeidx;
2231
2232	/*
2233	 * First determine how much space we need to allocate.
2234	 */
2235	count = omo->mo_count;
2236	for (i = 0; i < imo->mo_count; i++) {
2237		if (imo->mo_list[i].mo_flags & MO_EMPTY)
2238			continue;
2239		if (vfs_hasopt(omo, imo->mo_list[i].mo_name) == NULL)
2240			count++;
2241	}
2242	ASSERT(count >= omo->mo_count &&
2243	    count <= omo->mo_count + imo->mo_count);
2244	motbl = kmem_alloc(count * sizeof (mntopt_t), KM_SLEEP);
2245	for (i = 0; i < omo->mo_count; i++)
2246		vfs_copyopt(&omo->mo_list[i], &motbl[i]);
2247	freeidx = omo->mo_count;
2248	for (i = 0; i < imo->mo_count; i++) {
2249		if (imo->mo_list[i].mo_flags & MO_EMPTY)
2250			continue;
2251		if ((mop = vfs_hasopt(omo, imo->mo_list[i].mo_name)) != NULL) {
2252			char **newcanp;
2253			uint_t index = mop - omo->mo_list;
2254
2255			newcanp = vfs_mergecancelopts(mop, &motbl[index]);
2256
2257			vfs_freeopt(&motbl[index]);
2258			vfs_copyopt(&imo->mo_list[i], &motbl[index]);
2259
2260			vfs_freecancelopt(motbl[index].mo_cancel);
2261			motbl[index].mo_cancel = newcanp;
2262		} else {
2263			/*
2264			 * If it's a new option, just copy it over to the first
2265			 * free location.
2266			 */
2267			vfs_copyopt(&imo->mo_list[i], &motbl[freeidx++]);
2268		}
2269	}
2270	dmo->mo_count = count;
2271	dmo->mo_list = motbl;
2272}
2273
2274/*
2275 * Functions to set and clear mount options in a mount options table.
2276 */
2277
2278/*
2279 * Clear a mount option, if it exists.
2280 *
2281 * The update_mnttab arg indicates whether mops is part of a vfs that is on
2282 * the vfs list.
2283 */
2284static void
2285vfs_clearmntopt_nolock(mntopts_t *mops, const char *opt, int update_mnttab)
2286{
2287	struct mntopt *mop;
2288	uint_t i, count;
2289
2290	ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2291
2292	count = mops->mo_count;
2293	for (i = 0; i < count; i++) {
2294		mop = &mops->mo_list[i];
2295
2296		if (mop->mo_flags & MO_EMPTY)
2297			continue;
2298		if (strcmp(opt, mop->mo_name))
2299			continue;
2300		mop->mo_flags &= ~MO_SET;
2301		if (mop->mo_arg != NULL) {
2302			kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2303		}
2304		mop->mo_arg = NULL;
2305		if (update_mnttab)
2306			vfs_mnttab_modtimeupd();
2307		break;
2308	}
2309}
2310
2311void
2312vfs_clearmntopt(struct vfs *vfsp, const char *opt)
2313{
2314	int gotlock = 0;
2315
2316	if (VFS_ON_LIST(vfsp)) {
2317		gotlock = 1;
2318		vfs_list_lock();
2319	}
2320	vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, opt, gotlock);
2321	if (gotlock)
2322		vfs_list_unlock();
2323}
2324
2325
2326/*
2327 * Set a mount option on.  If it's not found in the table, it's silently
2328 * ignored.  If the option has MO_IGNORE set, it is still set unless the
2329 * VFS_NOFORCEOPT bit is set in the flags.  Also, VFS_DISPLAY/VFS_NODISPLAY flag
2330 * bits can be used to toggle the MO_NODISPLAY bit for the option.
2331 * If the VFS_CREATEOPT flag bit is set then the first option slot with
2332 * MO_EMPTY set is created as the option passed in.
2333 *
2334 * The update_mnttab arg indicates whether mops is part of a vfs that is on
2335 * the vfs list.
2336 */
2337static void
2338vfs_setmntopt_nolock(mntopts_t *mops, const char *opt,
2339    const char *arg, int flags, int update_mnttab)
2340{
2341	mntopt_t *mop;
2342	uint_t i, count;
2343	char *sp;
2344
2345	ASSERT(!update_mnttab || RW_WRITE_HELD(&vfslist));
2346
2347	if (flags & VFS_CREATEOPT) {
2348		if (vfs_hasopt(mops, opt) != NULL) {
2349			flags &= ~VFS_CREATEOPT;
2350		}
2351	}
2352	count = mops->mo_count;
2353	for (i = 0; i < count; i++) {
2354		mop = &mops->mo_list[i];
2355
2356		if (mop->mo_flags & MO_EMPTY) {
2357			if ((flags & VFS_CREATEOPT) == 0)
2358				continue;
2359			sp = kmem_alloc(strlen(opt) + 1, KM_SLEEP);
2360			(void) strcpy(sp, opt);
2361			mop->mo_name = sp;
2362			if (arg != NULL)
2363				mop->mo_flags = MO_HASVALUE;
2364			else
2365				mop->mo_flags = 0;
2366		} else if (strcmp(opt, mop->mo_name)) {
2367			continue;
2368		}
2369		if ((mop->mo_flags & MO_IGNORE) && (flags & VFS_NOFORCEOPT))
2370			break;
2371		if (arg != NULL && (mop->mo_flags & MO_HASVALUE) != 0) {
2372			sp = kmem_alloc(strlen(arg) + 1, KM_SLEEP);
2373			(void) strcpy(sp, arg);
2374		} else {
2375			sp = NULL;
2376		}
2377		if (mop->mo_arg != NULL)
2378			kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2379		mop->mo_arg = sp;
2380		if (flags & VFS_DISPLAY)
2381			mop->mo_flags &= ~MO_NODISPLAY;
2382		if (flags & VFS_NODISPLAY)
2383			mop->mo_flags |= MO_NODISPLAY;
2384		mop->mo_flags |= MO_SET;
2385		if (mop->mo_cancel != NULL) {
2386			char **cp;
2387
2388			for (cp = mop->mo_cancel; *cp != NULL; cp++)
2389				vfs_clearmntopt_nolock(mops, *cp, 0);
2390		}
2391		if (update_mnttab)
2392			vfs_mnttab_modtimeupd();
2393		break;
2394	}
2395}
2396
2397void
2398vfs_setmntopt(struct vfs *vfsp, const char *opt, const char *arg, int flags)
2399{
2400	int gotlock = 0;
2401
2402	if (VFS_ON_LIST(vfsp)) {
2403		gotlock = 1;
2404		vfs_list_lock();
2405	}
2406	vfs_setmntopt_nolock(&vfsp->vfs_mntopts, opt, arg, flags, gotlock);
2407	if (gotlock)
2408		vfs_list_unlock();
2409}
2410
2411
2412/*
2413 * Add a "tag" option to a mounted file system's options list.
2414 *
2415 * Note: caller is responsible for locking the vfs list, if needed,
2416 *       to protect mops.
2417 */
2418static mntopt_t *
2419vfs_addtag(mntopts_t *mops, const char *tag)
2420{
2421	uint_t count;
2422	mntopt_t *mop, *motbl;
2423
2424	count = mops->mo_count + 1;
2425	motbl = kmem_zalloc(count * sizeof (mntopt_t), KM_SLEEP);
2426	if (mops->mo_count) {
2427		size_t len = (count - 1) * sizeof (mntopt_t);
2428
2429		bcopy(mops->mo_list, motbl, len);
2430		kmem_free(mops->mo_list, len);
2431	}
2432	mops->mo_count = count;
2433	mops->mo_list = motbl;
2434	mop = &motbl[count - 1];
2435	mop->mo_flags = MO_TAG;
2436	mop->mo_name = kmem_alloc(strlen(tag) + 1, KM_SLEEP);
2437	(void) strcpy(mop->mo_name, tag);
2438	return (mop);
2439}
2440
2441/*
2442 * Allow users to set arbitrary "tags" in a vfs's mount options.
2443 * Broader use within the kernel is discouraged.
2444 */
2445int
2446vfs_settag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2447    cred_t *cr)
2448{
2449	vfs_t *vfsp;
2450	mntopts_t *mops;
2451	mntopt_t *mop;
2452	int found = 0;
2453	dev_t dev = makedevice(major, minor);
2454	int err = 0;
2455	char *buf = kmem_alloc(MAX_MNTOPT_STR, KM_SLEEP);
2456
2457	/*
2458	 * Find the desired mounted file system
2459	 */
2460	vfs_list_lock();
2461	vfsp = rootvfs;
2462	do {
2463		if (vfsp->vfs_dev == dev &&
2464		    strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2465			found = 1;
2466			break;
2467		}
2468		vfsp = vfsp->vfs_next;
2469	} while (vfsp != rootvfs);
2470
2471	if (!found) {
2472		err = EINVAL;
2473		goto out;
2474	}
2475	err = secpolicy_fs_config(cr, vfsp);
2476	if (err != 0)
2477		goto out;
2478
2479	mops = &vfsp->vfs_mntopts;
2480	/*
2481	 * Add tag if it doesn't already exist
2482	 */
2483	if ((mop = vfs_hasopt(mops, tag)) == NULL) {
2484		int len;
2485
2486		(void) vfs_buildoptionstr(mops, buf, MAX_MNTOPT_STR);
2487		len = strlen(buf);
2488		if (len + strlen(tag) + 2 > MAX_MNTOPT_STR) {
2489			err = ENAMETOOLONG;
2490			goto out;
2491		}
2492		mop = vfs_addtag(mops, tag);
2493	}
2494	if ((mop->mo_flags & MO_TAG) == 0) {
2495		err = EINVAL;
2496		goto out;
2497	}
2498	vfs_setmntopt_nolock(mops, tag, NULL, 0, 1);
2499out:
2500	vfs_list_unlock();
2501	kmem_free(buf, MAX_MNTOPT_STR);
2502	return (err);
2503}
2504
2505/*
2506 * Allow users to remove arbitrary "tags" in a vfs's mount options.
2507 * Broader use within the kernel is discouraged.
2508 */
2509int
2510vfs_clrtag(uint_t major, uint_t minor, const char *mntpt, const char *tag,
2511    cred_t *cr)
2512{
2513	vfs_t *vfsp;
2514	mntopt_t *mop;
2515	int found = 0;
2516	dev_t dev = makedevice(major, minor);
2517	int err = 0;
2518
2519	/*
2520	 * Find the desired mounted file system
2521	 */
2522	vfs_list_lock();
2523	vfsp = rootvfs;
2524	do {
2525		if (vfsp->vfs_dev == dev &&
2526		    strcmp(mntpt, refstr_value(vfsp->vfs_mntpt)) == 0) {
2527			found = 1;
2528			break;
2529		}
2530		vfsp = vfsp->vfs_next;
2531	} while (vfsp != rootvfs);
2532
2533	if (!found) {
2534		err = EINVAL;
2535		goto out;
2536	}
2537	err = secpolicy_fs_config(cr, vfsp);
2538	if (err != 0)
2539		goto out;
2540
2541	if ((mop = vfs_hasopt(&vfsp->vfs_mntopts, tag)) == NULL) {
2542		err = EINVAL;
2543		goto out;
2544	}
2545	if ((mop->mo_flags & MO_TAG) == 0) {
2546		err = EINVAL;
2547		goto out;
2548	}
2549	vfs_clearmntopt_nolock(&vfsp->vfs_mntopts, tag, 1);
2550out:
2551	vfs_list_unlock();
2552	return (err);
2553}
2554
2555/*
2556 * Function to parse an option string and fill in a mount options table.
2557 * Unknown options are silently ignored.  The input option string is modified
2558 * by replacing separators with nulls.  If the create flag is set, options
2559 * not found in the table are just added on the fly.  The table must have
2560 * an option slot marked MO_EMPTY to add an option on the fly.
2561 *
2562 * This function is *not* for general use by filesystems.
2563 *
2564 * Note: caller is responsible for locking the vfs list, if needed,
2565 *       to protect mops..
2566 */
2567void
2568vfs_parsemntopts(mntopts_t *mops, char *osp, int create)
2569{
2570	char *s = osp, *p, *nextop, *valp, *cp, *ep;
2571	int setflg = VFS_NOFORCEOPT;
2572
2573	if (osp == NULL)
2574		return;
2575	while (*s != '\0') {
2576		p = strchr(s, ',');	/* find next option */
2577		if (p == NULL) {
2578			cp = NULL;
2579			p = s + strlen(s);
2580		} else {
2581			cp = p;		/* save location of comma */
2582			*p++ = '\0';	/* mark end and point to next option */
2583		}
2584		nextop = p;
2585		p = strchr(s, '=');	/* look for value */
2586		if (p == NULL) {
2587			valp = NULL;	/* no value supplied */
2588		} else {
2589			ep = p;		/* save location of equals */
2590			*p++ = '\0';	/* end option and point to value */
2591			valp = p;
2592		}
2593		/*
2594		 * set option into options table
2595		 */
2596		if (create)
2597			setflg |= VFS_CREATEOPT;
2598		vfs_setmntopt_nolock(mops, s, valp, setflg, 0);
2599		if (cp != NULL)
2600			*cp = ',';	/* restore the comma */
2601		if (valp != NULL)
2602			*ep = '=';	/* restore the equals */
2603		s = nextop;
2604	}
2605}
2606
2607/*
2608 * Function to inquire if an option exists in a mount options table.
2609 * Returns a pointer to the option if it exists, else NULL.
2610 *
2611 * This function is *not* for general use by filesystems.
2612 *
2613 * Note: caller is responsible for locking the vfs list, if needed,
2614 *       to protect mops.
2615 */
2616struct mntopt *
2617vfs_hasopt(const mntopts_t *mops, const char *opt)
2618{
2619	struct mntopt *mop;
2620	uint_t i, count;
2621
2622	count = mops->mo_count;
2623	for (i = 0; i < count; i++) {
2624		mop = &mops->mo_list[i];
2625
2626		if (mop->mo_flags & MO_EMPTY)
2627			continue;
2628		if (strcmp(opt, mop->mo_name) == 0)
2629			return (mop);
2630	}
2631	return (NULL);
2632}
2633
2634/*
2635 * Function to inquire if an option is set in a mount options table.
2636 * Returns non-zero if set and fills in the arg pointer with a pointer to
2637 * the argument string or NULL if there is no argument string.
2638 */
2639static int
2640vfs_optionisset_nolock(const mntopts_t *mops, const char *opt, char **argp)
2641{
2642	struct mntopt *mop;
2643	uint_t i, count;
2644
2645	count = mops->mo_count;
2646	for (i = 0; i < count; i++) {
2647		mop = &mops->mo_list[i];
2648
2649		if (mop->mo_flags & MO_EMPTY)
2650			continue;
2651		if (strcmp(opt, mop->mo_name))
2652			continue;
2653		if ((mop->mo_flags & MO_SET) == 0)
2654			return (0);
2655		if (argp != NULL && (mop->mo_flags & MO_HASVALUE) != 0)
2656			*argp = mop->mo_arg;
2657		return (1);
2658	}
2659	return (0);
2660}
2661
2662
2663int
2664vfs_optionisset(const struct vfs *vfsp, const char *opt, char **argp)
2665{
2666	int ret;
2667
2668	vfs_list_read_lock();
2669	ret = vfs_optionisset_nolock(&vfsp->vfs_mntopts, opt, argp);
2670	vfs_list_unlock();
2671	return (ret);
2672}
2673
2674
2675/*
2676 * Construct a comma separated string of the options set in the given
2677 * mount table, return the string in the given buffer.  Return non-zero if
2678 * the buffer would overflow.
2679 *
2680 * This function is *not* for general use by filesystems.
2681 *
2682 * Note: caller is responsible for locking the vfs list, if needed,
2683 *       to protect mp.
2684 */
2685int
2686vfs_buildoptionstr(const mntopts_t *mp, char *buf, int len)
2687{
2688	char *cp;
2689	uint_t i;
2690
2691	buf[0] = '\0';
2692	cp = buf;
2693	for (i = 0; i < mp->mo_count; i++) {
2694		struct mntopt *mop;
2695
2696		mop = &mp->mo_list[i];
2697		if (mop->mo_flags & MO_SET) {
2698			int optlen, comma = 0;
2699
2700			if (buf[0] != '\0')
2701				comma = 1;
2702			optlen = strlen(mop->mo_name);
2703			if (strlen(buf) + comma + optlen + 1 > len)
2704				goto err;
2705			if (comma)
2706				*cp++ = ',';
2707			(void) strcpy(cp, mop->mo_name);
2708			cp += optlen;
2709			/*
2710			 * Append option value if there is one
2711			 */
2712			if (mop->mo_arg != NULL) {
2713				int arglen;
2714
2715				arglen = strlen(mop->mo_arg);
2716				if (strlen(buf) + arglen + 2 > len)
2717					goto err;
2718				*cp++ = '=';
2719				(void) strcpy(cp, mop->mo_arg);
2720				cp += arglen;
2721			}
2722		}
2723	}
2724	return (0);
2725err:
2726	return (EOVERFLOW);
2727}
2728
/*
 * Free a NULL-terminated array of cancel option strings: each string
 * first, then the array itself (including its terminating slot).  A NULL
 * array is accepted and ignored.
 */
static void
vfs_freecancelopt(char **moc)
{
	char **cur;

	if (moc == NULL)
		return;

	for (cur = moc; *cur != NULL; cur++)
		kmem_free(*cur, strlen(*cur) + 1);

	/* cur now rests on the terminator, so cur - moc is the entry count. */
	kmem_free(moc, ((size_t)(cur - moc) + 1) * sizeof (char *));
}
2743
2744static void
2745vfs_freeopt(mntopt_t *mop)
2746{
2747	if (mop->mo_name != NULL)
2748		kmem_free(mop->mo_name, strlen(mop->mo_name) + 1);
2749
2750	vfs_freecancelopt(mop->mo_cancel);
2751
2752	if (mop->mo_arg != NULL)
2753		kmem_free(mop->mo_arg, strlen(mop->mo_arg) + 1);
2754}
2755
2756/*
2757 * Free a mount options table
2758 *
2759 * This function is *not* for general use by filesystems.
2760 *
2761 * Note: caller is responsible for locking the vfs list, if needed,
2762 *       to protect mp.
2763 */
2764void
2765vfs_freeopttbl(mntopts_t *mp)
2766{
2767	uint_t i, count;
2768
2769	count = mp->mo_count;
2770	for (i = 0; i < count; i++) {
2771		vfs_freeopt(&mp->mo_list[i]);
2772	}
2773	if (count) {
2774		kmem_free(mp->mo_list, sizeof (mntopt_t) * count);
2775		mp->mo_count = 0;
2776		mp->mo_list = NULL;
2777	}
2778}
2779
2780
2781/* ARGSUSED */
2782static int
2783vfs_mntdummyread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
2784	caller_context_t *ct)
2785{
2786	return (0);
2787}
2788
2789/* ARGSUSED */
2790static int
2791vfs_mntdummywrite(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred,
2792	caller_context_t *ct)
2793{
2794	return (0);
2795}
2796
2797/*
2798 * The dummy vnode is currently used only by file events notification
2799 * module which is just interested in the timestamps.
2800 */
2801/* ARGSUSED */
2802static int
2803vfs_mntdummygetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2804    caller_context_t *ct)
2805{
2806	bzero(vap, sizeof (vattr_t));
2807	vap->va_type = VREG;
2808	vap->va_nlink = 1;
2809	vap->va_ctime = vfs_mnttab_ctime;
2810	/*
2811	 * it is ok to just copy mtime as the time will be monotonically
2812	 * increasing.
2813	 */
2814	vap->va_mtime = vfs_mnttab_mtime;
2815	vap->va_atime = vap->va_mtime;
2816	return (0);
2817}
2818
2819static void
2820vfs_mnttabvp_setup(void)
2821{
2822	vnode_t *tvp;
2823	vnodeops_t *vfs_mntdummyvnops;
2824	const fs_operation_def_t mnt_dummyvnodeops_template[] = {
2825		VOPNAME_READ, 		{ .vop_read = vfs_mntdummyread },
2826		VOPNAME_WRITE, 		{ .vop_write = vfs_mntdummywrite },
2827		VOPNAME_GETATTR,	{ .vop_getattr = vfs_mntdummygetattr },
2828		VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
2829		NULL,			NULL
2830	};
2831
2832	if (vn_make_ops("mnttab", mnt_dummyvnodeops_template,
2833	    &vfs_mntdummyvnops) != 0) {
2834		cmn_err(CE_WARN, "vfs_mnttabvp_setup: vn_make_ops failed");
2835		/* Shouldn't happen, but not bad enough to panic */
2836		return;
2837	}
2838
2839	/*
2840	 * A global dummy vnode is allocated to represent mntfs files.
2841	 * The mntfs file (/etc/mnttab) can be monitored for file events
2842	 * and receive an event when mnttab changes. Dummy VOP calls
2843	 * will be made on this vnode. The file events notification module
2844	 * intercepts this vnode and delivers relevant events.
2845	 */
2846	tvp = vn_alloc(KM_SLEEP);
2847	tvp->v_flag = VNOMOUNT|VNOMAP|VNOSWAP|VNOCACHE;
2848	vn_setops(tvp, vfs_mntdummyvnops);
2849	tvp->v_type = VREG;
2850	/*
2851	 * The mnt dummy ops do not reference v_data.
2852	 * No other module intercepting this vnode should either.
2853	 * Just set it to point to itself.
2854	 */
2855	tvp->v_data = (caddr_t)tvp;
2856	tvp->v_vfsp = rootvfs;
2857	vfs_mntdummyvp = tvp;
2858}
2859
2860/*
2861 * performs fake read/write ops
2862 */
2863static void
2864vfs_mnttab_rwop(int rw)
2865{
2866	struct uio	uio;
2867	struct iovec	iov;
2868	char	buf[1];
2869
2870	if (vfs_mntdummyvp == NULL)
2871		return;
2872
2873	bzero(&uio, sizeof (uio));
2874	bzero(&iov, sizeof (iov));
2875	iov.iov_base = buf;
2876	iov.iov_len = 0;
2877	uio.uio_iov = &iov;
2878	uio.uio_iovcnt = 1;
2879	uio.uio_loffset = 0;
2880	uio.uio_segflg = UIO_SYSSPACE;
2881	uio.uio_resid = 0;
2882	if (rw) {
2883		(void) VOP_WRITE(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2884	} else {
2885		(void) VOP_READ(vfs_mntdummyvp, &uio, 0, kcred, NULL);
2886	}
2887}
2888
/*
 * Generate a fake write operation on the mnttab dummy vnode.
 */
void
vfs_mnttab_writeop(void)
{
	const int dowrite = 1;

	vfs_mnttab_rwop(dowrite);
}
2897
/*
 * Generate a fake read operation on the mnttab dummy vnode.
 */
void
vfs_mnttab_readop(void)
{
	const int dowrite = 0;

	vfs_mnttab_rwop(dowrite);
}
2906
2907/*
2908 * Free any mnttab information recorded in the vfs struct.
2909 * The vfs must not be on the vfs list.
2910 */
2911static void
2912vfs_freemnttab(struct vfs *vfsp)
2913{
2914	ASSERT(!VFS_ON_LIST(vfsp));
2915
2916	/*
2917	 * Free device and mount point information
2918	 */
2919	if (vfsp->vfs_mntpt != NULL) {
2920		refstr_rele(vfsp->vfs_mntpt);
2921		vfsp->vfs_mntpt = NULL;
2922	}
2923	if (vfsp->vfs_resource != NULL) {
2924		refstr_rele(vfsp->vfs_resource);
2925		vfsp->vfs_resource = NULL;
2926	}
2927	/*
2928	 * Now free mount options information
2929	 */
2930	vfs_freeopttbl(&vfsp->vfs_mntopts);
2931}
2932
2933/*
2934 * Return the last mnttab modification time
2935 */
2936void
2937vfs_mnttab_modtime(timespec_t *ts)
2938{
2939	ASSERT(RW_LOCK_HELD(&vfslist));
2940	*ts = vfs_mnttab_mtime;
2941}
2942
2943/*
2944 * See if mnttab is changed
2945 */
2946void
2947vfs_mnttab_poll(timespec_t *old, struct pollhead **phpp)
2948{
2949	int changed;
2950
2951	*phpp = (struct pollhead *)NULL;
2952
2953	/*
2954	 * Note: don't grab vfs list lock before accessing vfs_mnttab_mtime.
2955	 * Can lead to deadlock against vfs_mnttab_modtimeupd(). It is safe
2956	 * to not grab the vfs list lock because tv_sec is monotonically
2957	 * increasing.
2958	 */
2959
2960	changed = (old->tv_nsec != vfs_mnttab_mtime.tv_nsec) ||
2961	    (old->tv_sec != vfs_mnttab_mtime.tv_sec);
2962	if (!changed) {
2963		*phpp = &vfs_pollhd;
2964	}
2965}
2966
2967/*
2968 * Update the mnttab modification time and wake up any waiters for
2969 * mnttab changes
2970 */
2971void
2972vfs_mnttab_modtimeupd()
2973{
2974	hrtime_t oldhrt, newhrt;
2975
2976	ASSERT(RW_WRITE_HELD(&vfslist));
2977	oldhrt = ts2hrt(&vfs_mnttab_mtime);
2978	gethrestime(&vfs_mnttab_mtime);
2979	newhrt = ts2hrt(&vfs_mnttab_mtime);
2980	if (oldhrt == (hrtime_t)0)
2981		vfs_mnttab_ctime = vfs_mnttab_mtime;
2982	/*
2983	 * Attempt to provide unique mtime (like uniqtime but not).
2984	 */
2985	if (newhrt == oldhrt) {
2986		newhrt++;
2987		hrt2ts(newhrt, &vfs_mnttab_mtime);
2988	}
2989	pollwakeup(&vfs_pollhd, (short)POLLRDBAND);
2990	vfs_mnttab_writeop();
2991}
2992
2993int
2994dounmount(struct vfs *vfsp, int flag, cred_t *cr)
2995{
2996	vnode_t *coveredvp;
2997	int error;
2998	extern void teardown_vopstats(vfs_t *);
2999
3000	/*
3001	 * Get covered vnode. This will be NULL if the vfs is not linked
3002	 * into the file system name space (i.e., domount() with MNT_NOSPICE).
3003	 */
3004	coveredvp = vfsp->vfs_vnodecovered;
3005	ASSERT(coveredvp == NULL || vn_vfswlock_held(coveredvp));
3006
3007	/*
3008	 * Purge all dnlc entries for this vfs.
3009	 */
3010	(void) dnlc_purge_vfsp(vfsp, 0);
3011
3012	/* For forcible umount, skip VFS_SYNC() since it may hang */
3013	if ((flag & MS_FORCE) == 0)
3014		(void) VFS_SYNC(vfsp, 0, cr);
3015
3016	/*
3017	 * Lock the vfs to maintain fs status quo during unmount.  This
3018	 * has to be done after the sync because ufs_update tries to acquire
3019	 * the vfs_reflock.
3020	 */
3021	vfs_lock_wait(vfsp);
3022
3023	if (error = VFS_UNMOUNT(vfsp, flag, cr)) {
3024		vfs_unlock(vfsp);
3025		if (coveredvp != NULL)
3026			vn_vfsunlock(coveredvp);
3027	} else if (coveredvp != NULL) {
3028		teardown_vopstats(vfsp);
3029		/*
3030		 * vfs_remove() will do a VN_RELE(vfsp->vfs_vnodecovered)
3031		 * when it frees vfsp so we do a VN_HOLD() so we can
3032		 * continue to use coveredvp afterwards.
3033		 */
3034		VN_HOLD(coveredvp);
3035		vfs_remove(vfsp);
3036		vn_vfsunlock(coveredvp);
3037		VN_RELE(coveredvp);
3038	} else {
3039		teardown_vopstats(vfsp);
3040		/*
3041		 * Release the reference to vfs that is not linked
3042		 * into the name space.
3043		 */
3044		vfs_unlock(vfsp);
3045		VFS_RELE(vfsp);
3046	}
3047	return (error);
3048}
3049
3050
3051/*
3052 * Vfs_unmountall() is called by uadmin() to unmount all
3053 * mounted file systems (except the root file system) during shutdown.
3054 * It follows the existing locking protocol when traversing the vfs list
3055 * to sync and unmount vfses. Even though there should be no
3056 * other thread running while the system is shutting down, it is prudent
3057 * to still follow the locking protocol.
3058 */
3059void
3060vfs_unmountall(void)
3061{
3062	struct vfs *vfsp;
3063	struct vfs *prev_vfsp = NULL;
3064	int error;
3065
3066	/*
3067	 * Toss all dnlc entries now so that the per-vfs sync
3068	 * and unmount operations don't have to slog through
3069	 * a bunch of uninteresting vnodes over and over again.
3070	 */
3071	dnlc_purge();
3072
3073	vfs_list_lock();
3074	for (vfsp = rootvfs->vfs_prev; vfsp != rootvfs; vfsp = prev_vfsp) {
3075		prev_vfsp = vfsp->vfs_prev;
3076
3077		if (vfs_lock(vfsp) != 0)
3078			continue;
3079		error = vn_vfswlock(vfsp->vfs_vnodecovered);
3080		vfs_unlock(vfsp);
3081		if (error)
3082			continue;
3083
3084		vfs_list_unlock();
3085
3086		(void) VFS_SYNC(vfsp, SYNC_CLOSE, CRED());
3087		(void) dounmount(vfsp, 0, CRED());
3088
3089		/*
3090		 * Since we dropped the vfslist lock above we must
3091		 * verify that next_vfsp still exists, else start over.
3092		 */
3093		vfs_list_lock();
3094		for (vfsp = rootvfs->vfs_prev;
3095		    vfsp != rootvfs; vfsp = vfsp->vfs_prev)
3096			if (vfsp == prev_vfsp)
3097				break;
3098		if (vfsp == rootvfs && prev_vfsp != rootvfs)
3099			prev_vfsp = rootvfs->vfs_prev;
3100	}
3101	vfs_list_unlock();
3102}
3103
3104/*
3105 * Called to add an entry to the end of the vfs mount in progress list
3106 */
3107void
3108vfs_addmip(dev_t dev, struct vfs *vfsp)
3109{
3110	struct ipmnt *mipp;
3111
3112	mipp = (struct ipmnt *)kmem_alloc(sizeof (struct ipmnt), KM_SLEEP);
3113	mipp->mip_next = NULL;
3114	mipp->mip_dev = dev;
3115	mipp->mip_vfsp = vfsp;
3116	mutex_enter(&vfs_miplist_mutex);
3117	if (vfs_miplist_end != NULL)
3118		vfs_miplist_end->mip_next = mipp;
3119	else
3120		vfs_miplist = mipp;
3121	vfs_miplist_end = mipp;
3122	mutex_exit(&vfs_miplist_mutex);
3123}
3124
3125/*
3126 * Called to remove an entry from the mount in progress list
3127 * Either because the mount completed or it failed.
3128 */
3129void
3130vfs_delmip(struct vfs *vfsp)
3131{
3132	struct ipmnt *mipp, *mipprev;
3133
3134	mutex_enter(&vfs_miplist_mutex);
3135	mipprev = NULL;
3136	for (mipp = vfs_miplist;
3137	    mipp && mipp->mip_vfsp != vfsp; mipp = mipp->mip_next) {
3138		mipprev = mipp;
3139	}
3140	if (mipp == NULL)
3141		return; /* shouldn't happen */
3142	if (mipp == vfs_miplist_end)
3143		vfs_miplist_end = mipprev;
3144	if (mipprev == NULL)
3145		vfs_miplist = mipp->mip_next;
3146	else
3147		mipprev->mip_next = mipp->mip_next;
3148	mutex_exit(&vfs_miplist_mutex);
3149	kmem_free(mipp, sizeof (struct ipmnt));
3150}
3151
3152/*
3153 * vfs_add is called by a specific filesystem's mount routine to add
3154 * the new vfs into the vfs list/hash and to cover the mounted-on vnode.
3155 * The vfs should already have been locked by the caller.
3156 *
3157 * coveredvp is NULL if this is the root.
3158 */
3159void
3160vfs_add(vnode_t *coveredvp, struct vfs *vfsp, int mflag)
3161{
3162	int newflag;
3163
3164	ASSERT(vfs_lock_held(vfsp));
3165	VFS_HOLD(vfsp);
3166	newflag = vfsp->vfs_flag;
3167	if (mflag & MS_RDONLY)
3168		newflag |= VFS_RDONLY;
3169	else
3170		newflag &= ~VFS_RDONLY;
3171	if (mflag & MS_NOSUID)
3172		newflag |= (VFS_NOSETUID|VFS_NODEVICES);
3173	else
3174		newflag &= ~(VFS_NOSETUID|VFS_NODEVICES);
3175	if (mflag & MS_NOMNTTAB)
3176		newflag |= VFS_NOMNTTAB;
3177	else
3178		newflag &= ~VFS_NOMNTTAB;
3179
3180	if (coveredvp != NULL) {
3181		ASSERT(vn_vfswlock_held(coveredvp));
3182		coveredvp->v_vfsmountedhere = vfsp;
3183		VN_HOLD(coveredvp);
3184	}
3185	vfsp->vfs_vnodecovered = coveredvp;
3186	vfsp->vfs_flag = newflag;
3187
3188	vfs_list_add(vfsp);
3189}
3190
3191/*
3192 * Remove a vfs from the vfs list, null out the pointer from the
3193 * covered vnode to the vfs (v_vfsmountedhere), and null out the pointer
3194 * from the vfs to the covered vnode (vfs_vnodecovered). Release the
3195 * reference to the vfs and to the covered vnode.
3196 *
3197 * Called from dounmount after it's confirmed with the file system
3198 * that the unmount is legal.
3199 */
3200void
3201vfs_remove(struct vfs *vfsp)
3202{
3203	vnode_t *vp;
3204
3205	ASSERT(vfs_lock_held(vfsp));
3206
3207	/*
3208	 * Can't unmount root.  Should never happen because fs will
3209	 * be busy.
3210	 */
3211	if (vfsp == rootvfs)
3212		panic("vfs_remove: unmounting root");
3213
3214	vfs_list_remove(vfsp);
3215
3216	/*
3217	 * Unhook from the file system name space.
3218	 */
3219	vp = vfsp->vfs_vnodecovered;
3220	ASSERT(vn_vfswlock_held(vp));
3221	vp->v_vfsmountedhere = NULL;
3222	vfsp->vfs_vnodecovered = NULL;
3223	VN_RELE(vp);
3224
3225	/*
3226	 * Release lock and wakeup anybody waiting.
3227	 */
3228	vfs_unlock(vfsp);
3229	VFS_RELE(vfsp);
3230}
3231
3232/*
3233 * Lock a filesystem to prevent access to it while mounting,
3234 * unmounting and syncing.  Return EBUSY immediately if lock
3235 * can't be acquired.
3236 */
3237int
3238vfs_lock(vfs_t *vfsp)
3239{
3240	vn_vfslocks_entry_t *vpvfsentry;
3241
3242	vpvfsentry = vn_vfslocks_getlock(vfsp);
3243	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_WRITER))
3244		return (0);
3245
3246	vn_vfslocks_rele(vpvfsentry);
3247	return (EBUSY);
3248}
3249
3250int
3251vfs_rlock(vfs_t *vfsp)
3252{
3253	vn_vfslocks_entry_t *vpvfsentry;
3254
3255	vpvfsentry = vn_vfslocks_getlock(vfsp);
3256
3257	if (rwst_tryenter(&vpvfsentry->ve_lock, RW_READER))
3258		return (0);
3259
3260	vn_vfslocks_rele(vpvfsentry);
3261	return (EBUSY);
3262}
3263
3264void
3265vfs_lock_wait(vfs_t *vfsp)
3266{
3267	vn_vfslocks_entry_t *vpvfsentry;
3268
3269	vpvfsentry = vn_vfslocks_getlock(vfsp);
3270	rwst_enter(&vpvfsentry->ve_lock, RW_WRITER);
3271}
3272
3273void
3274vfs_rlock_wait(vfs_t *vfsp)
3275{
3276	vn_vfslocks_entry_t *vpvfsentry;
3277
3278	vpvfsentry = vn_vfslocks_getlock(vfsp);
3279	rwst_enter(&vpvfsentry->ve_lock, RW_READER);
3280}
3281
3282/*
3283 * Unlock a locked filesystem.
3284 */
3285void
3286vfs_unlock(vfs_t *vfsp)
3287{
3288	vn_vfslocks_entry_t *vpvfsentry;
3289
3290	/*
3291	 * vfs_unlock will mimic sema_v behaviour to fix 4748018.
3292	 * And these changes should remain for the patch changes as it is.
3293	 */
3294	if (panicstr)
3295		return;
3296
3297	/*
3298	 * ve_refcount needs to be dropped twice here.
3299	 * 1. To release refernce after a call to vfs_locks_getlock()
3300	 * 2. To release the reference from the locking routines like
3301	 *    vfs_rlock_wait/vfs_wlock_wait/vfs_wlock etc,.
3302	 */
3303
3304	vpvfsentry = vn_vfslocks_getlock(vfsp);
3305	vn_vfslocks_rele(vpvfsentry);
3306
3307	rwst_exit(&vpvfsentry->ve_lock);
3308	vn_vfslocks_rele(vpvfsentry);
3309}
3310
3311/*
3312 * Utility routine that allows a filesystem to construct its
3313 * fsid in "the usual way" - by munging some underlying dev_t and
3314 * the filesystem type number into the 64-bit fsid.  Note that
3315 * this implicitly relies on dev_t persistence to make filesystem
3316 * id's persistent.
3317 *
3318 * There's nothing to prevent an individual fs from constructing its
3319 * fsid in a different way, and indeed they should.
3320 *
3321 * Since we want fsids to be 32-bit quantities (so that they can be
3322 * exported identically by either 32-bit or 64-bit APIs, as well as
3323 * the fact that fsid's are "known" to NFS), we compress the device
3324 * number given down to 32-bits, and panic if that isn't possible.
3325 */
3326void
3327vfs_make_fsid(fsid_t *fsi, dev_t dev, int val)
3328{
3329	if (!cmpldev((dev32_t *)&fsi->val[0], dev))
3330		panic("device number too big for fsid!");
3331	fsi->val[1] = val;
3332}
3333
3334int
3335vfs_lock_held(vfs_t *vfsp)
3336{
3337	int held;
3338	vn_vfslocks_entry_t *vpvfsentry;
3339
3340	/*
3341	 * vfs_lock_held will mimic sema_held behaviour
3342	 * if panicstr is set. And these changes should remain
3343	 * for the patch changes as it is.
3344	 */
3345	if (panicstr)
3346		return (1);
3347
3348	vpvfsentry = vn_vfslocks_getlock(vfsp);
3349	held = rwst_lock_held(&vpvfsentry->ve_lock, RW_WRITER);
3350
3351	vn_vfslocks_rele(vpvfsentry);
3352	return (held);
3353}
3354
3355struct _kthread *
3356vfs_lock_owner(vfs_t *vfsp)
3357{
3358	struct _kthread *owner;
3359	vn_vfslocks_entry_t *vpvfsentry;
3360
3361	/*
3362	 * vfs_wlock_held will mimic sema_held behaviour
3363	 * if panicstr is set. And these changes should remain
3364	 * for the patch changes as it is.
3365	 */
3366	if (panicstr)
3367		return (NULL);
3368
3369	vpvfsentry = vn_vfslocks_getlock(vfsp);
3370	owner = rwst_owner(&vpvfsentry->ve_lock);
3371
3372	vn_vfslocks_rele(vpvfsentry);
3373	return (owner);
3374}
3375
3376/*
3377 * vfs list locking.
3378 *
3379 * Rather than manipulate the vfslist lock directly, we abstract into lock
3380 * and unlock routines to allow the locking implementation to be changed for
3381 * clustering.
3382 *
3383 * Whenever the vfs list is modified through its hash links, the overall list
3384 * lock must be obtained before locking the relevant hash bucket.  But to see
3385 * whether a given vfs is on the list, it suffices to obtain the lock for the
3386 * hash bucket without getting the overall list lock.  (See getvfs() below.)
3387 */
3388
3389void
3390vfs_list_lock()
3391{
3392	rw_enter(&vfslist, RW_WRITER);
3393}
3394
3395void
3396vfs_list_read_lock()
3397{
3398	rw_enter(&vfslist, RW_READER);
3399}
3400
3401void
3402vfs_list_unlock()
3403{
3404	rw_exit(&vfslist);
3405}
3406
3407/*
3408 * Low level worker routines for adding entries to and removing entries from
3409 * the vfs list.
3410 */
3411
3412static void
3413vfs_hash_add(struct vfs *vfsp, int insert_at_head)
3414{
3415	int vhno;
3416	struct vfs **hp;
3417	dev_t dev;
3418
3419	ASSERT(RW_WRITE_HELD(&vfslist));
3420
3421	dev = expldev(vfsp->vfs_fsid.val[0]);
3422	vhno = VFSHASH(getmajor(dev), getminor(dev));
3423
3424	mutex_enter(&rvfs_list[vhno].rvfs_lock);
3425
3426	/*
3427	 * Link into the hash table, inserting it at the end, so that LOFS
3428	 * with the same fsid as UFS (or other) file systems will not hide the
3429	 * UFS.
3430	 */
3431	if (insert_at_head) {
3432		vfsp->vfs_hash = rvfs_list[vhno].rvfs_head;
3433		rvfs_list[vhno].rvfs_head = vfsp;
3434	} else {
3435		for (hp = &rvfs_list[vhno].rvfs_head; *hp != NULL;
3436		    hp = &(*hp)->vfs_hash)
3437			continue;
3438		/*
3439		 * hp now contains the address of the pointer to update
3440		 * to effect the insertion.
3441		 */
3442		vfsp->vfs_hash = NULL;
3443		*hp = vfsp;
3444	}
3445
3446	rvfs_list[vhno].rvfs_len++;
3447	mutex_exit(&rvfs_list[vhno].rvfs_lock);
3448}
3449
3450
3451static void
3452vfs_hash_remove(struct vfs *vfsp)
3453{
3454	int vhno;
3455	struct vfs *tvfsp;
3456	dev_t dev;
3457
3458	ASSERT(RW_WRITE_HELD(&vfslist));
3459
3460	dev = expldev(vfsp->vfs_fsid.val[0]);
3461	vhno = VFSHASH(getmajor(dev), getminor(dev));
3462
3463	mutex_enter(&rvfs_list[vhno].rvfs_lock);
3464
3465	/*
3466	 * Remove from hash.
3467	 */
3468	if (rvfs_list[vhno].rvfs_head == vfsp) {
3469		rvfs_list[vhno].rvfs_head = vfsp->vfs_hash;
3470		rvfs_list[vhno].rvfs_len--;
3471		goto foundit;
3472	}
3473	for (tvfsp = rvfs_list[vhno].rvfs_head; tvfsp != NULL;
3474	    tvfsp = tvfsp->vfs_hash) {
3475		if (tvfsp->vfs_hash == vfsp) {
3476			tvfsp->vfs_hash = vfsp->vfs_hash;
3477			rvfs_list[vhno].rvfs_len--;
3478			goto foundit;
3479		}
3480	}
3481	cmn_err(CE_WARN, "vfs_list_remove: vfs not found in hash");
3482
3483foundit:
3484
3485	mutex_exit(&rvfs_list[vhno].rvfs_lock);
3486}
3487
3488
3489void
3490vfs_list_add(struct vfs *vfsp)
3491{
3492	zone_t *zone;
3493
3494	/*
3495	 * The zone that owns the mount is the one that performed the mount.
3496	 * Note that this isn't necessarily the same as the zone mounted into.
3497	 * The corresponding zone_rele() will be done when the vfs_t is
3498	 * being free'd.
3499	 */
3500	vfsp->vfs_zone = curproc->p_zone;
3501	zone_hold(vfsp->vfs_zone);
3502
3503	/*
3504	 * Find the zone mounted into, and put this mount on its vfs list.
3505	 */
3506	zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3507	ASSERT(zone != NULL);
3508	/*
3509	 * Special casing for the root vfs.  This structure is allocated
3510	 * statically and hooked onto rootvfs at link time.  During the
3511	 * vfs_mountroot call at system startup time, the root file system's
3512	 * VFS_MOUNTROOT routine will call vfs_add with this root vfs struct
3513	 * as argument.  The code below must detect and handle this special
3514	 * case.  The only apparent justification for this special casing is
3515	 * to ensure that the root file system appears at the head of the
3516	 * list.
3517	 *
3518	 * XXX:	I'm assuming that it's ok to do normal list locking when
3519	 *	adding the entry for the root file system (this used to be
3520	 *	done with no locks held).
3521	 */
3522	vfs_list_lock();
3523	/*
3524	 * Link into the vfs list proper.
3525	 */
3526	if (vfsp == &root) {
3527		/*
3528		 * Assert: This vfs is already on the list as its first entry.
3529		 * Thus, there's nothing to do.
3530		 */
3531		ASSERT(rootvfs == vfsp);
3532		/*
3533		 * Add it to the head of the global zone's vfslist.
3534		 */
3535		ASSERT(zone == global_zone);
3536		ASSERT(zone->zone_vfslist == NULL);
3537		zone->zone_vfslist = vfsp;
3538	} else {
3539		/*
3540		 * Link to end of list using vfs_prev (as rootvfs is now a
3541		 * doubly linked circular list) so list is in mount order for
3542		 * mnttab use.
3543		 */
3544		rootvfs->vfs_prev->vfs_next = vfsp;
3545		vfsp->vfs_prev = rootvfs->vfs_prev;
3546		rootvfs->vfs_prev = vfsp;
3547		vfsp->vfs_next = rootvfs;
3548
3549		/*
3550		 * Do it again for the zone-private list (which may be NULL).
3551		 */
3552		if (zone->zone_vfslist == NULL) {
3553			ASSERT(zone != global_zone);
3554			zone->zone_vfslist = vfsp;
3555		} else {
3556			zone->zone_vfslist->vfs_zone_prev->vfs_zone_next = vfsp;
3557			vfsp->vfs_zone_prev = zone->zone_vfslist->vfs_zone_prev;
3558			zone->zone_vfslist->vfs_zone_prev = vfsp;
3559			vfsp->vfs_zone_next = zone->zone_vfslist;
3560		}
3561	}
3562
3563	/*
3564	 * Link into the hash table, inserting it at the end, so that LOFS
3565	 * with the same fsid as UFS (or other) file systems will not hide
3566	 * the UFS.
3567	 */
3568	vfs_hash_add(vfsp, 0);
3569
3570	/*
3571	 * update the mnttab modification time
3572	 */
3573	vfs_mnttab_modtimeupd();
3574	vfs_list_unlock();
3575	zone_rele(zone);
3576}
3577
3578void
3579vfs_list_remove(struct vfs *vfsp)
3580{
3581	zone_t *zone;
3582
3583	zone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
3584	ASSERT(zone != NULL);
3585	/*
3586	 * Callers are responsible for preventing attempts to unmount the
3587	 * root.
3588	 */
3589	ASSERT(vfsp != rootvfs);
3590
3591	vfs_list_lock();
3592
3593	/*
3594	 * Remove from hash.
3595	 */
3596	vfs_hash_remove(vfsp);
3597
3598	/*
3599	 * Remove from vfs list.
3600	 */
3601	vfsp->vfs_prev->vfs_next = vfsp->vfs_next;
3602	vfsp->vfs_next->vfs_prev = vfsp->vfs_prev;
3603	vfsp->vfs_next = vfsp->vfs_prev = NULL;
3604
3605	/*
3606	 * Remove from zone-specific vfs list.
3607	 */
3608	if (zone->zone_vfslist == vfsp)
3609		zone->zone_vfslist = vfsp->vfs_zone_next;
3610
3611	if (vfsp->vfs_zone_next == vfsp) {
3612		ASSERT(vfsp->vfs_zone_prev == vfsp);
3613		ASSERT(zone->zone_vfslist == vfsp);
3614		zone->zone_vfslist = NULL;
3615	}
3616
3617	vfsp->vfs_zone_prev->vfs_zone_next = vfsp->vfs_zone_next;
3618	vfsp->vfs_zone_next->vfs_zone_prev = vfsp->vfs_zone_prev;
3619	vfsp->vfs_zone_next = vfsp->vfs_zone_prev = NULL;
3620
3621	/*
3622	 * update the mnttab modification time
3623	 */
3624	vfs_mnttab_modtimeupd();
3625	vfs_list_unlock();
3626	zone_rele(zone);
3627}
3628
3629struct vfs *
3630getvfs(fsid_t *fsid)
3631{
3632	struct vfs *vfsp;
3633	int val0 = fsid->val[0];
3634	int val1 = fsid->val[1];
3635	dev_t dev = expldev(val0);
3636	int vhno = VFSHASH(getmajor(dev), getminor(dev));
3637	kmutex_t *hmp = &rvfs_list[vhno].rvfs_lock;
3638
3639	mutex_enter(hmp);
3640	for (vfsp = rvfs_list[vhno].rvfs_head; vfsp; vfsp = vfsp->vfs_hash) {
3641		if (vfsp->vfs_fsid.val[0] == val0 &&
3642		    vfsp->vfs_fsid.val[1] == val1) {
3643			VFS_HOLD(vfsp);
3644			mutex_exit(hmp);
3645			return (vfsp);
3646		}
3647	}
3648	mutex_exit(hmp);
3649	return (NULL);
3650}
3651
3652/*
3653 * Search the vfs mount in progress list for a specified device/vfs entry.
3654 * Returns 0 if the first entry in the list that the device matches has the
3655 * given vfs pointer as well.  If the device matches but a different vfs
3656 * pointer is encountered in the list before the given vfs pointer then
3657 * a 1 is returned.
3658 */
3659
3660int
3661vfs_devmounting(dev_t dev, struct vfs *vfsp)
3662{
3663	int retval = 0;
3664	struct ipmnt *mipp;
3665
3666	mutex_enter(&vfs_miplist_mutex);
3667	for (mipp = vfs_miplist; mipp != NULL; mipp = mipp->mip_next) {
3668		if (mipp->mip_dev == dev) {
3669			if (mipp->mip_vfsp != vfsp)
3670				retval = 1;
3671			break;
3672		}
3673	}
3674	mutex_exit(&vfs_miplist_mutex);
3675	return (retval);
3676}
3677
3678/*
3679 * Search the vfs list for a specified device.  Returns 1, if entry is found
3680 * or 0 if no suitable entry is found.
3681 */
3682
3683int
3684vfs_devismounted(dev_t dev)
3685{
3686	struct vfs *vfsp;
3687	int found;
3688
3689	vfs_list_read_lock();
3690	vfsp = rootvfs;
3691	found = 0;
3692	do {
3693		if (vfsp->vfs_dev == dev) {
3694			found = 1;
3695			break;
3696		}
3697		vfsp = vfsp->vfs_next;
3698	} while (vfsp != rootvfs);
3699
3700	vfs_list_unlock();
3701	return (found);
3702}
3703
3704/*
3705 * Search the vfs list for a specified device.  Returns a pointer to it
3706 * or NULL if no suitable entry is found. The caller of this routine
3707 * is responsible for releasing the returned vfs pointer.
3708 */
3709struct vfs *
3710vfs_dev2vfsp(dev_t dev)
3711{
3712	struct vfs *vfsp;
3713	int found;
3714
3715	vfs_list_read_lock();
3716	vfsp = rootvfs;
3717	found = 0;
3718	do {
3719		/*
3720		 * The following could be made more efficient by making
3721		 * the entire loop use vfs_zone_next if the call is from
3722		 * a zone.  The only callers, however, ustat(2) and
3723		 * umount2(2), don't seem to justify the added
3724		 * complexity at present.
3725		 */
3726		if (vfsp->vfs_dev == dev &&
3727		    ZONE_PATH_VISIBLE(refstr_value(vfsp->vfs_mntpt),
3728		    curproc->p_zone)) {
3729			VFS_HOLD(vfsp);
3730			found = 1;
3731			break;
3732		}
3733		vfsp = vfsp->vfs_next;
3734	} while (vfsp != rootvfs);
3735	vfs_list_unlock();
3736	return (found ? vfsp: NULL);
3737}
3738
3739/*
3740 * Search the vfs list for a specified mntpoint.  Returns a pointer to it
3741 * or NULL if no suitable entry is found. The caller of this routine
3742 * is responsible for releasing the returned vfs pointer.
3743 *
3744 * Note that if multiple mntpoints match, the last one matching is
3745 * returned in an attempt to return the "top" mount when overlay
3746 * mounts are covering the same mount point.  This is accomplished by starting
3747 * at the end of the list and working our way backwards, stopping at the first
3748 * matching mount.
3749 */
3750struct vfs *
3751vfs_mntpoint2vfsp(const char *mp)
3752{
3753	struct vfs *vfsp;
3754	struct vfs *retvfsp = NULL;
3755	zone_t *zone = curproc->p_zone;
3756	struct vfs *list;
3757
3758	vfs_list_read_lock();
3759	if (getzoneid() == GLOBAL_ZONEID) {
3760		/*
3761		 * The global zone may see filesystems in any zone.
3762		 */
3763		vfsp = rootvfs->vfs_prev;
3764		do {
3765			if (strcmp(refstr_value(vfsp->vfs_mntpt), mp) == 0) {
3766				retvfsp = vfsp;
3767				break;
3768			}
3769			vfsp = vfsp->vfs_prev;
3770		} while (vfsp != rootvfs->vfs_prev);
3771	} else if ((list = zone->zone_vfslist) != NULL) {
3772		const char *mntpt;
3773
3774		vfsp = list->vfs_zone_prev;
3775		do {
3776			mntpt = refstr_value(vfsp->vfs_mntpt);
3777			mntpt = ZONE_PATH_TRANSLATE(mntpt, zone);
3778			if (strcmp(mntpt, mp) == 0) {
3779				retvfsp = vfsp;
3780				break;
3781			}
3782			vfsp = vfsp->vfs_zone_prev;
3783		} while (vfsp != list->vfs_zone_prev);
3784	}
3785	if (retvfsp)
3786		VFS_HOLD(retvfsp);
3787	vfs_list_unlock();
3788	return (retvfsp);
3789}
3790
3791/*
3792 * Search the vfs list for a specified vfsops.
3793 * if vfs entry is found then return 1, else 0.
3794 */
3795int
3796vfs_opsinuse(vfsops_t *ops)
3797{
3798	struct vfs *vfsp;
3799	int found;
3800
3801	vfs_list_read_lock();
3802	vfsp = rootvfs;
3803	found = 0;
3804	do {
3805		if (vfs_getops(vfsp) == ops) {
3806			found = 1;
3807			break;
3808		}
3809		vfsp = vfsp->vfs_next;
3810	} while (vfsp != rootvfs);
3811	vfs_list_unlock();
3812	return (found);
3813}
3814
3815/*
3816 * Allocate an entry in vfssw for a file system type
3817 */
3818struct vfssw *
3819allocate_vfssw(const char *type)
3820{
3821	struct vfssw *vswp;
3822
3823	if (type[0] == '\0' || strlen(type) + 1 > _ST_FSTYPSZ) {
3824		/*
3825		 * The vfssw table uses the empty string to identify an
3826		 * available entry; we cannot add any type which has
3827		 * a leading NUL. The string length is limited to
3828		 * the size of the st_fstype array in struct stat.
3829		 */
3830		return (NULL);
3831	}
3832
3833	ASSERT(VFSSW_WRITE_LOCKED());
3834	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++)
3835		if (!ALLOCATED_VFSSW(vswp)) {
3836			vswp->vsw_name = kmem_alloc(strlen(type) + 1, KM_SLEEP);
3837			(void) strcpy(vswp->vsw_name, type);
3838			ASSERT(vswp->vsw_count == 0);
3839			vswp->vsw_count = 1;
3840			mutex_init(&vswp->vsw_lock, NULL, MUTEX_DEFAULT, NULL);
3841			return (vswp);
3842		}
3843	return (NULL);
3844}
3845
/*
 * Translate a vfstype name into the name of the module implementing it:
 *	"proc"			-> "procfs"
 *	"fd"			-> "fdfs"
 *	anything starting "nfs"	-> "nfs"
 * Any other name is returned unchanged (the same pointer that was passed).
 */
static const char *
vfs_to_modname(const char *vfstype)
{
	if (strcmp(vfstype, "proc") == 0)
		return ("procfs");

	if (strcmp(vfstype, "fd") == 0)
		return ("fdfs");

	/* Prefix match: "nfs", "nfs3", "nfsdyn", ... all share one module. */
	if (strncmp(vfstype, "nfs", 3) == 0)
		return ("nfs");

	return (vfstype);
}
3863
3864/*
3865 * Find a vfssw entry given a file system type name.
3866 * Try to autoload the filesystem if it's not found.
3867 * If it's installed, return the vfssw locked to prevent unloading.
3868 */
3869struct vfssw *
3870vfs_getvfssw(const char *type)
3871{
3872	struct vfssw *vswp;
3873	const char *modname;
3874
3875	RLOCK_VFSSW();
3876	vswp = vfs_getvfsswbyname(type);
3877	modname = vfs_to_modname(type);
3878
3879	if (rootdir == NULL) {
3880		/*
3881		 * If we haven't yet loaded the root file system, then our
3882		 * _init won't be called until later. Allocate vfssw entry,
3883		 * because mod_installfs won't be called.
3884		 */
3885		if (vswp == NULL) {
3886			RUNLOCK_VFSSW();
3887			WLOCK_VFSSW();
3888			if ((vswp = vfs_getvfsswbyname(type)) == NULL) {
3889				if ((vswp = allocate_vfssw(type)) == NULL) {
3890					WUNLOCK_VFSSW();
3891					return (NULL);
3892				}
3893			}
3894			WUNLOCK_VFSSW();
3895			RLOCK_VFSSW();
3896		}
3897		if (!VFS_INSTALLED(vswp)) {
3898			RUNLOCK_VFSSW();
3899			(void) modloadonly("fs", modname);
3900		} else
3901			RUNLOCK_VFSSW();
3902		return (vswp);
3903	}
3904
3905	/*
3906	 * Try to load the filesystem.  Before calling modload(), we drop
3907	 * our lock on the VFS switch table, and pick it up after the
3908	 * module is loaded.  However, there is a potential race:  the
3909	 * module could be unloaded after the call to modload() completes
3910	 * but before we pick up the lock and drive on.  Therefore,
3911	 * we keep reloading the module until we've loaded the module
3912	 * _and_ we have the lock on the VFS switch table.
3913	 */
3914	while (vswp == NULL || !VFS_INSTALLED(vswp)) {
3915		RUNLOCK_VFSSW();
3916		if (modload("fs", modname) == -1)
3917			return (NULL);
3918		RLOCK_VFSSW();
3919		if (vswp == NULL)
3920			if ((vswp = vfs_getvfsswbyname(type)) == NULL)
3921				break;
3922	}
3923	RUNLOCK_VFSSW();
3924
3925	return (vswp);
3926}
3927
3928/*
3929 * Find a vfssw entry given a file system type name.
3930 */
3931struct vfssw *
3932vfs_getvfsswbyname(const char *type)
3933{
3934	struct vfssw *vswp;
3935
3936	ASSERT(VFSSW_LOCKED());
3937	if (type == NULL || *type == '\0')
3938		return (NULL);
3939
3940	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3941		if (strcmp(type, vswp->vsw_name) == 0) {
3942			vfs_refvfssw(vswp);
3943			return (vswp);
3944		}
3945	}
3946
3947	return (NULL);
3948}
3949
3950/*
3951 * Find a vfssw entry given a set of vfsops.
3952 */
3953struct vfssw *
3954vfs_getvfsswbyvfsops(vfsops_t *vfsops)
3955{
3956	struct vfssw *vswp;
3957
3958	RLOCK_VFSSW();
3959	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
3960		if (ALLOCATED_VFSSW(vswp) && &vswp->vsw_vfsops == vfsops) {
3961			vfs_refvfssw(vswp);
3962			RUNLOCK_VFSSW();
3963			return (vswp);
3964		}
3965	}
3966	RUNLOCK_VFSSW();
3967
3968	return (NULL);
3969}
3970
/*
 * Reference a vfssw entry: increment vswp->vsw_count while holding the
 * entry's vsw_lock.  Undone by vfs_unrefvfssw().
 */
void
vfs_refvfssw(struct vfssw *vswp)
{

	mutex_enter(&vswp->vsw_lock);
	vswp->vsw_count++;
	mutex_exit(&vswp->vsw_lock);
}
3982
/*
 * Unreference a vfssw entry: decrement vswp->vsw_count while holding the
 * entry's vsw_lock.  Nothing is freed or torn down here, whatever the
 * resulting count.
 */
void
vfs_unrefvfssw(struct vfssw *vswp)
{

	mutex_enter(&vswp->vsw_lock);
	vswp->vsw_count--;
	mutex_exit(&vswp->vsw_lock);
}
3994
int sync_timeout = 30;		/* timeout for syncing a page during panic */
int sync_timeleft;		/* portion of sync_timeout remaining */

static int sync_retries = 20;	/* number of retries when not making progress */
static int sync_triesleft;	/* portion of sync_retries remaining */

/*
 * Dirty page and buffer counts tracked by vfs_syncall(): the new_* values
 * hold the most recent page_busy()/bio_busy() results, the old_* values
 * the lowest counts seen so far.
 */
static pgcnt_t old_pgcnt, new_pgcnt;
static int new_bufcnt, old_bufcnt;
4003
4004/*
4005 * Sync all of the mounted filesystems, and then wait for the actual i/o to
4006 * complete.  We wait by counting the number of dirty pages and buffers,
4007 * pushing them out using bio_busy() and page_busy(), and then counting again.
4008 * This routine is used during both the uadmin A_SHUTDOWN code as well as
4009 * the SYNC phase of the panic code (see comments in panic.c).  It should only
4010 * be used after some higher-level mechanism has quiesced the system so that
4011 * new writes are not being initiated while we are waiting for completion.
4012 *
4013 * To ensure finite running time, our algorithm uses two timeout mechanisms:
4014 * sync_timeleft (a timer implemented by the omnipresent deadman() cyclic), and
4015 * sync_triesleft (a progress counter used by the vfs_syncall() loop below).
4016 * Together these ensure that syncing completes if our i/o paths are stuck.
4017 * The counters are declared above so they can be found easily in the debugger.
4018 *
4019 * The sync_timeleft counter is reset by bio_busy() and page_busy() using the
4020 * vfs_syncprogress() subroutine whenever we make progress through the lists of
4021 * pages and buffers.  It is decremented and expired by the deadman() cyclic.
4022 * When vfs_syncall() decides it is done, we disable the deadman() counter by
4023 * setting sync_timeleft to zero.  This timer guards against vfs_syncall()
4024 * deadlocking or hanging inside of a broken filesystem or driver routine.
4025 *
4026 * The sync_triesleft counter is updated by vfs_syncall() itself.  If we make
4027 * sync_retries consecutive calls to bio_busy() and page_busy() without
4028 * decreasing either the number of dirty buffers or dirty pages below the
4029 * lowest count we have seen so far, we give up and return from vfs_syncall().
4030 *
4031 * Each loop iteration ends with a call to delay() one second to allow time for
4032 * i/o completion and to permit the user time to read our progress messages.
4033 */
4034void
4035vfs_syncall(void)
4036{
4037	if (rootdir == NULL && !modrootloaded)
4038		return; /* panic during boot - no filesystems yet */
4039
4040	printf("syncing file systems...");
4041	vfs_syncprogress();
4042	sync();
4043
4044	vfs_syncprogress();
4045	sync_triesleft = sync_retries;
4046
4047	old_bufcnt = new_bufcnt = INT_MAX;
4048	old_pgcnt = new_pgcnt = ULONG_MAX;
4049
4050	while (sync_triesleft > 0) {
4051		old_bufcnt = MIN(old_bufcnt, new_bufcnt);
4052		old_pgcnt = MIN(old_pgcnt, new_pgcnt);
4053
4054		new_bufcnt = bio_busy(B_TRUE);
4055		new_pgcnt = page_busy(B_TRUE);
4056		vfs_syncprogress();
4057
4058		if (new_bufcnt == 0 && new_pgcnt == 0)
4059			break;
4060
4061		if (new_bufcnt < old_bufcnt || new_pgcnt < old_pgcnt)
4062			sync_triesleft = sync_retries;
4063		else
4064			sync_triesleft--;
4065
4066		if (new_bufcnt)
4067			printf(" [%d]", new_bufcnt);
4068		if (new_pgcnt)
4069			printf(" %lu", new_pgcnt);
4070
4071		delay(hz);
4072	}
4073
4074	if (new_bufcnt != 0 || new_pgcnt != 0)
4075		printf(" done (not all i/o completed)\n");
4076	else
4077		printf(" done\n");
4078
4079	sync_timeleft = 0;
4080	delay(hz);
4081}
4082
/*
 * If we are in the middle of the sync phase of panic, reset sync_timeleft to
 * sync_timeout to indicate that we are making progress and the deadman()
 * omnipresent cyclic should not yet time us out.  Note that it is safe to
 * store to sync_timeleft here since the deadman() is firing at high-level
 * on top of us.  If we are racing with the deadman(), either the deadman()
 * will decrement the old value and then we will reset it, or we will
 * reset it and then the deadman() will immediately decrement it.  In either
 * case, correct behavior results.
 *
 * When panicstr is not set this routine does nothing.
 */
void
vfs_syncprogress(void)
{
	/* Only a panic-time sync is subject to the deadman() timeout. */
	if (panicstr)
		sync_timeleft = sync_timeout;
}
4099
4100/*
4101 * Map VFS flags to statvfs flags.  These shouldn't really be separate
4102 * flags at all.
4103 */
4104uint_t
4105vf_to_stf(uint_t vf)
4106{
4107	uint_t stf = 0;
4108
4109	if (vf & VFS_RDONLY)
4110		stf |= ST_RDONLY;
4111	if (vf & VFS_NOSETUID)
4112		stf |= ST_NOSUID;
4113	if (vf & VFS_NOTRUNC)
4114		stf |= ST_NOTRUNC;
4115
4116	return (stf);
4117}
4118
/*
 * Entries for (illegal) fstype 0.
 *
 * Sync entry point of the "stray" operations template that vfsinit()
 * installs for fstype 0.  Being called at all is an error, so this raises
 * a CE_PANIC; the arguments are ignored.
 */
/* ARGSUSED */
int
vfsstray_sync(struct vfs *vfsp, short arg, struct cred *cr)
{
	cmn_err(CE_PANIC, "stray vfs operation");
	return (0);
}
4129
/*
 * Entries for (illegal) fstype 0.
 *
 * Used for every entry except sync in the "stray" operations template that
 * vfsinit() installs for fstype 0.  Being called at all is an error, so
 * this raises a CE_PANIC.
 */
int
vfsstray(void)
{
	cmn_err(CE_PANIC, "stray vfs operation");
	return (0);
}
4139
/*
 * Support for dealing with forced UFS unmount and its interaction with
 * LOFS. Could be used by any filesystem.
 * See bug 1203132.
 *
 * Always returns EIO.  vfsinit() uses it for every entry except sync in
 * the EIO operations template.
 */
int
vfs_EIO(void)
{
	return (EIO);
}
4150
/*
 * We've gotta define the op for sync separately, since the compiler gets
 * confused if we mix and match ANSI and normal style prototypes when
 * a "short" argument is present and spits out a warning.
 *
 * Always returns EIO; the arguments are ignored.  vfsinit() uses it as the
 * sync entry of the EIO operations template.
 */
/*ARGSUSED*/
int
vfs_EIO_sync(struct vfs *vfsp, short arg, struct cred *cr)
{
	return (EIO);
}
4162
/*
 * The dummy "EIO" vfs and its operations vector.  Both are set up by
 * vfsinit() from a template whose entries are vfs_EIO()/vfs_EIO_sync().
 */
vfs_t EIO_vfs;
vfsops_t *EIO_vfsops;
4165
4166/*
4167 * Called from startup() to initialize all loaded vfs's
4168 */
4169void
4170vfsinit(void)
4171{
4172	struct vfssw *vswp;
4173	int error;
4174	extern int vopstats_enabled;
4175	extern void vopstats_startup();
4176
4177	static const fs_operation_def_t EIO_vfsops_template[] = {
4178		VFSNAME_MOUNT,		{ .error = vfs_EIO },
4179		VFSNAME_UNMOUNT,	{ .error = vfs_EIO },
4180		VFSNAME_ROOT,		{ .error = vfs_EIO },
4181		VFSNAME_STATVFS,	{ .error = vfs_EIO },
4182		VFSNAME_SYNC, 		{ .vfs_sync = vfs_EIO_sync },
4183		VFSNAME_VGET,		{ .error = vfs_EIO },
4184		VFSNAME_MOUNTROOT,	{ .error = vfs_EIO },
4185		VFSNAME_FREEVFS,	{ .error = vfs_EIO },
4186		VFSNAME_VNSTATE,	{ .error = vfs_EIO },
4187		NULL, NULL
4188	};
4189
4190	static const fs_operation_def_t stray_vfsops_template[] = {
4191		VFSNAME_MOUNT,		{ .error = vfsstray },
4192		VFSNAME_UNMOUNT,	{ .error = vfsstray },
4193		VFSNAME_ROOT,		{ .error = vfsstray },
4194		VFSNAME_STATVFS,	{ .error = vfsstray },
4195		VFSNAME_SYNC, 		{ .vfs_sync = vfsstray_sync },
4196		VFSNAME_VGET,		{ .error = vfsstray },
4197		VFSNAME_MOUNTROOT,	{ .error = vfsstray },
4198		VFSNAME_FREEVFS,	{ .error = vfsstray },
4199		VFSNAME_VNSTATE,	{ .error = vfsstray },
4200		NULL, NULL
4201	};
4202
4203	/* Create vfs cache */
4204	vfs_cache = kmem_cache_create("vfs_cache", sizeof (struct vfs),
4205	    sizeof (uintptr_t), NULL, NULL, NULL, NULL, NULL, 0);
4206
4207	/* Initialize the vnode cache (file systems may use it during init). */
4208	vn_create_cache();
4209
4210	/* Setup event monitor framework */
4211	fem_init();
4212
4213	/* Initialize the dummy stray file system type. */
4214	error = vfs_setfsops(0, stray_vfsops_template, NULL);
4215
4216	/* Initialize the dummy EIO file system. */
4217	error = vfs_makefsops(EIO_vfsops_template, &EIO_vfsops);
4218	if (error != 0) {
4219		cmn_err(CE_WARN, "vfsinit: bad EIO vfs ops template");
4220		/* Shouldn't happen, but not bad enough to panic */
4221	}
4222
4223	VFS_INIT(&EIO_vfs, EIO_vfsops, (caddr_t)NULL);
4224
4225	/*
4226	 * Default EIO_vfs.vfs_flag to VFS_UNMOUNTED so a lookup
4227	 * on this vfs can immediately notice it's invalid.
4228	 */
4229	EIO_vfs.vfs_flag |= VFS_UNMOUNTED;
4230
4231	/*
4232	 * Call the init routines of non-loadable filesystems only.
4233	 * Filesystems which are loaded as separate modules will be
4234	 * initialized by the module loading code instead.
4235	 */
4236
4237	for (vswp = &vfssw[1]; vswp < &vfssw[nfstype]; vswp++) {
4238		RLOCK_VFSSW();
4239		if (vswp->vsw_init != NULL)
4240			(*vswp->vsw_init)(vswp - vfssw, vswp->vsw_name);
4241		RUNLOCK_VFSSW();
4242	}
4243
4244	vopstats_startup();
4245
4246	if (vopstats_enabled) {
4247		/* EIO_vfs can collect stats, but we don't retrieve them */
4248		initialize_vopstats(&EIO_vfs.vfs_vopstats);
4249		EIO_vfs.vfs_fstypevsp = NULL;
4250		EIO_vfs.vfs_vskap = NULL;
4251		EIO_vfs.vfs_flag |= VFS_STATS;
4252	}
4253
4254	xattr_init();
4255
4256	reparse_point_init();
4257}
4258
4259vfs_t *
4260vfs_alloc(int kmflag)
4261{
4262	vfs_t *vfsp;
4263
4264	vfsp = kmem_cache_alloc(vfs_cache, kmflag);
4265
4266	/*
4267	 * Do the simplest initialization here.
4268	 * Everything else gets done in vfs_init()
4269	 */
4270	bzero(vfsp, sizeof (vfs_t));
4271	return (vfsp);
4272}
4273
4274void
4275vfs_free(vfs_t *vfsp)
4276{
4277	/*
4278	 * One would be tempted to assert that "vfsp->vfs_count == 0".
4279	 * The problem is that this gets called out of domount() with
4280	 * a partially initialized vfs and a vfs_count of 1.  This is
4281	 * also called from vfs_rele() with a vfs_count of 0.  We can't
4282	 * call VFS_RELE() from domount() if VFS_MOUNT() hasn't successfully
4283	 * returned.  This is because VFS_MOUNT() fully initializes the
4284	 * vfs structure and its associated data.  VFS_RELE() will call
4285	 * VFS_FREEVFS() which may panic the system if the data structures
4286	 * aren't fully initialized from a successful VFS_MOUNT()).
4287	 */
4288
4289	/* If FEM was in use, make sure everything gets cleaned up */
4290	if (vfsp->vfs_femhead) {
4291		ASSERT(vfsp->vfs_femhead->femh_list == NULL);
4292		mutex_destroy(&vfsp->vfs_femhead->femh_lock);
4293		kmem_free(vfsp->vfs_femhead, sizeof (*(vfsp->vfs_femhead)));
4294		vfsp->vfs_femhead = NULL;
4295	}
4296
4297	if (vfsp->vfs_implp)
4298		vfsimpl_teardown(vfsp);
4299	sema_destroy(&vfsp->vfs_reflock);
4300	kmem_cache_free(vfs_cache, vfsp);
4301}
4302
/*
 * Increments the vfs reference count by one atomically.
 * Each hold is dropped with vfs_rele().
 */
void
vfs_hold(vfs_t *vfsp)
{
	atomic_add_32(&vfsp->vfs_count, 1);
	/* A zero count right after the increment means the counter wrapped. */
	ASSERT(vfsp->vfs_count != 0);
}
4312
4313/*
4314 * Decrements the vfs reference count by one atomically. When
4315 * vfs reference count becomes zero, it calls the file system
4316 * specific vfs_freevfs() to free up the resources.
4317 */
4318void
4319vfs_rele(vfs_t *vfsp)
4320{
4321	ASSERT(vfsp->vfs_count != 0);
4322	if (atomic_add_32_nv(&vfsp->vfs_count, -1) == 0) {
4323		VFS_FREEVFS(vfsp);
4324		lofi_remove(vfsp);
4325		if (vfsp->vfs_zone)
4326			zone_rele(vfsp->vfs_zone);
4327		vfs_freemnttab(vfsp);
4328		vfs_free(vfsp);
4329	}
4330}
4331
4332/*
4333 * Generic operations vector support.
4334 *
4335 * This is used to build operations vectors for both the vfs and vnode.
4336 * It's normally called only when a file system is loaded.
4337 *
4338 * There are many possible algorithms for this, including the following:
4339 *
4340 *   (1) scan the list of known operations; for each, see if the file system
4341 *       includes an entry for it, and fill it in as appropriate.
4342 *
4343 *   (2) set up defaults for all known operations.  scan the list of ops
4344 *       supplied by the file system; for each which is both supplied and
4345 *       known, fill it in.
4346 *
4347 *   (3) sort the lists of known ops & supplied ops; scan the list, filling
4348 *       in entries as we go.
4349 *
4350 * we choose (1) for simplicity, and because performance isn't critical here.
4351 * note that (2) could be sped up using a precomputed hash table on known ops.
4352 * (3) could be faster than either, but only if the lists were very large or
4353 * supplied in sorted order.
4354 *
4355 */
4356
4357int
4358fs_build_vector(void *vector, int *unused_ops,
4359    const fs_operation_trans_def_t *translation,
4360    const fs_operation_def_t *operations)
4361{
4362	int i, num_trans, num_ops, used;
4363
4364	/*
4365	 * Count the number of translations and the number of supplied
4366	 * operations.
4367	 */
4368
4369	{
4370		const fs_operation_trans_def_t *p;
4371
4372		for (num_trans = 0, p = translation;
4373		    p->name != NULL;
4374		    num_trans++, p++)
4375			;
4376	}
4377
4378	{
4379		const fs_operation_def_t *p;
4380
4381		for (num_ops = 0, p = operations;
4382		    p->name != NULL;
4383		    num_ops++, p++)
4384			;
4385	}
4386
4387	/* Walk through each operation known to our caller.  There will be */
4388	/* one entry in the supplied "translation table" for each. */
4389
4390	used = 0;
4391
4392	for (i = 0; i < num_trans; i++) {
4393		int j, found;
4394		char *curname;
4395		fs_generic_func_p result;
4396		fs_generic_func_p *location;
4397
4398		curname = translation[i].name;
4399
4400		/* Look for a matching operation in the list supplied by the */
4401		/* file system. */
4402
4403		found = 0;
4404
4405		for (j = 0; j < num_ops; j++) {
4406			if (strcmp(operations[j].name, curname) == 0) {
4407				used++;
4408				found = 1;
4409				break;
4410			}
4411		}
4412
4413		/*
4414		 * If the file system is using a "placeholder" for default
4415		 * or error functions, grab the appropriate function out of
4416		 * the translation table.  If the file system didn't supply
4417		 * this operation at all, use the default function.
4418		 */
4419
4420		if (found) {
4421			result = operations[j].func.fs_generic;
4422			if (result == fs_default) {
4423				result = translation[i].defaultFunc;
4424			} else if (result == fs_error) {
4425				result = translation[i].errorFunc;
4426			} else if (result == NULL) {
4427				/* Null values are PROHIBITED */
4428				return (EINVAL);
4429			}
4430		} else {
4431			result = translation[i].defaultFunc;
4432		}
4433
4434		/* Now store the function into the operations vector. */
4435
4436		location = (fs_generic_func_p *)
4437		    (((char *)vector) + translation[i].offset);
4438
4439		*location = result;
4440	}
4441
4442	*unused_ops = num_ops - used;
4443
4444	return (0);
4445}
4446
4447/* Placeholder functions, should never be called. */
4448
/*
 * Placeholder a file system may name in its operations list to request the
 * "error" function for an operation.  fs_build_vector() recognizes it by
 * address and substitutes the translation table's errorFunc, so reaching
 * this body raises a CE_PANIC.
 */
int
fs_error(void)
{
	cmn_err(CE_PANIC, "fs_error called");
	return (0);
}
4455
/*
 * Placeholder a file system may name in its operations list to request the
 * "default" function for an operation.  fs_build_vector() recognizes it by
 * address and substitutes the translation table's defaultFunc, so reaching
 * this body raises a CE_PANIC.
 */
int
fs_default(void)
{
	cmn_err(CE_PANIC, "fs_default called");
	return (0);
}
4462
4463#ifdef __sparc
4464
/*
 * Part of the implementation of booting off a mirrored root
 * involves a change of dev_t for the root device.  To
 * accomplish this, first remove the existing hash table
 * entry for the root device, convert to the new dev_t,
 * then re-insert in the hash table at the head of the list.
 *
 * vfsp is the vfs to update, ndev its new device, and fstype is passed to
 * vfs_make_fsid() together with ndev to build the new fsid.  The whole
 * update is done under the vfs list write lock.
 */
void
vfs_root_redev(vfs_t *vfsp, dev_t ndev, int fstype)
{
	vfs_list_lock();

	/* Must come first: the hash bucket is derived from the current fsid. */
	vfs_hash_remove(vfsp);

	vfsp->vfs_dev = ndev;
	vfs_make_fsid(&vfsp->vfs_fsid, ndev, fstype);

	/* Re-hash under the new fsid, at the head of its chain. */
	vfs_hash_add(vfsp, 1);

	vfs_list_unlock();
}
4486
4487#else /* x86 NEWBOOT */
4488
4489#if defined(__x86)
4490extern int hvmboot_rootconf();
4491#endif /* __x86 */
4492
4493extern ib_boot_prop_t *iscsiboot_prop;
4494extern void iscsi_boot_prop_free();
4495
4496int
4497rootconf()
4498{
4499	int error;
4500	struct vfssw *vsw;
4501	extern void pm_init();
4502	char *fstyp, *fsmod;
4503	int ret = -1;
4504
4505	getrootfs(&fstyp, &fsmod);
4506
4507#if defined(__x86)
4508	/*
4509	 * hvmboot_rootconf() is defined in the hvm_bootstrap misc module,
4510	 * which lives in /platform/i86hvm, and hence is only available when
4511	 * booted in an x86 hvm environment.  If the hvm_bootstrap misc module
4512	 * is not available then the modstub for this function will return 0.
4513	 * If the hvm_bootstrap misc module is available it will be loaded
4514	 * and hvmboot_rootconf() will be invoked.
4515	 */
4516	if (error = hvmboot_rootconf())
4517		return (error);
4518#endif /* __x86 */
4519
4520	if (error = clboot_rootconf())
4521		return (error);
4522
4523	if (modload("fs", fsmod) == -1)
4524		panic("Cannot _init %s module", fsmod);
4525
4526	RLOCK_VFSSW();
4527	vsw = vfs_getvfsswbyname(fstyp);
4528	RUNLOCK_VFSSW();
4529	if (vsw == NULL) {
4530		cmn_err(CE_CONT, "Cannot find %s filesystem\n", fstyp);
4531		return (ENXIO);
4532	}
4533	VFS_INIT(rootvfs, &vsw->vsw_vfsops, 0);
4534	VFS_HOLD(rootvfs);
4535
4536	/* always mount readonly first */
4537	rootvfs->vfs_flag |= VFS_RDONLY;
4538
4539	pm_init();
4540
4541	if (netboot && iscsiboot_prop) {
4542		cmn_err(CE_WARN, "NFS boot and iSCSI boot"
4543		    " shouldn't happen in the same time");
4544		return (EINVAL);
4545	}
4546
4547	if (netboot || iscsiboot_prop)
4548		ret = strplumb();
4549
4550	if ((ret == 0) && iscsiboot_prop) {
4551		ret = modload("drv", "iscsi");
4552		/* -1 indicates fail */
4553		if (ret == -1) {
4554			cmn_err(CE_WARN, "Failed to load iscsi module");
4555			iscsi_boot_prop_free();
4556			return (EINVAL);
4557		} else {
4558			if (!i_ddi_attach_pseudo_node("iscsi")) {
4559				cmn_err(CE_WARN,
4560				    "Failed to attach iscsi driver");
4561				iscsi_boot_prop_free();
4562				return (ENODEV);
4563			}
4564		}
4565	}
4566
4567	error = VFS_MOUNTROOT(rootvfs, ROOT_INIT);
4568	vfs_unrefvfssw(vsw);
4569	rootdev = rootvfs->vfs_dev;
4570
4571	if (error)
4572		cmn_err(CE_CONT, "Cannot mount root on %s fstype %s\n",
4573		    rootfs.bo_name, fstyp);
4574	else
4575		cmn_err(CE_CONT, "?root on %s fstype %s\n",
4576		    rootfs.bo_name, fstyp);
4577	return (error);
4578}
4579
4580/*
4581 * XXX this is called by nfs only and should probably be removed
4582 * If booted with ASKNAME, prompt on the console for a filesystem
4583 * name and return it.
4584 */
4585void
4586getfsname(char *askfor, char *name, size_t namelen)
4587{
4588	if (boothowto & RB_ASKNAME) {
4589		printf("%s name: ", askfor);
4590		console_gets(name, namelen);
4591	}
4592}
4593
4594/*
4595 * Init the root filesystem type (rootfs.bo_fstype) from the "fstype"
4596 * property.
4597 *
4598 * Filesystem types starting with the prefix "nfs" are diskless clients;
4599 * init the root filename name (rootfs.bo_name), too.
4600 *
4601 * If we are booting via NFS we currently have these options:
4602 *	nfs -	dynamically choose NFS V2, V3, or V4 (default)
4603 *	nfs2 -	force NFS V2
4604 *	nfs3 -	force NFS V3
4605 *	nfs4 -	force NFS V4
4606 * Because we need to maintain backward compatibility with the naming
4607 * convention that the NFS V2 filesystem name is "nfs" (see vfs_conf.c)
4608 * we need to map "nfs" => "nfsdyn" and "nfs2" => "nfs".  The dynamic
4609 * nfs module will map the type back to either "nfs", "nfs3", or "nfs4".
4610 * This is only for root filesystems, all other uses such as cachefs
4611 * will expect that "nfs" == NFS V2.
4612 */
4613static void
4614getrootfs(char **fstypp, char **fsmodp)
4615{
4616	extern char *strplumb_get_netdev_path(void);
4617	char *propstr = NULL;
4618
4619	/*
4620	 * Check fstype property; for diskless it should be one of "nfs",
4621	 * "nfs2", "nfs3" or "nfs4".
4622	 */
4623	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4624	    DDI_PROP_DONTPASS, "fstype", &propstr)
4625	    == DDI_SUCCESS) {
4626		(void) strncpy(rootfs.bo_fstype, propstr, BO_MAXFSNAME);
4627		ddi_prop_free(propstr);
4628
4629	/*
4630	 * if the boot property 'fstype' is not set, but 'zfs-bootfs' is set,
4631	 * assume the type of this root filesystem is 'zfs'.
4632	 */
4633	} else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4634	    DDI_PROP_DONTPASS, "zfs-bootfs", &propstr)
4635	    == DDI_SUCCESS) {
4636		(void) strncpy(rootfs.bo_fstype, "zfs", BO_MAXFSNAME);
4637		ddi_prop_free(propstr);
4638	}
4639
4640	if (strncmp(rootfs.bo_fstype, "nfs", 3) != 0) {
4641		*fstypp = *fsmodp = rootfs.bo_fstype;
4642		return;
4643	}
4644
4645	++netboot;
4646
4647	if (strcmp(rootfs.bo_fstype, "nfs2") == 0)
4648		(void) strcpy(rootfs.bo_fstype, "nfs");
4649	else if (strcmp(rootfs.bo_fstype, "nfs") == 0)
4650		(void) strcpy(rootfs.bo_fstype, "nfsdyn");
4651
4652	/*
4653	 * check if path to network interface is specified in bootpath
4654	 * or by a hypervisor domain configuration file.
4655	 * XXPV - enable strlumb_get_netdev_path()
4656	 */
4657	if (ddi_prop_exists(DDI_DEV_T_ANY, ddi_root_node(), DDI_PROP_DONTPASS,
4658	    "xpv-nfsroot")) {
4659		(void) strcpy(rootfs.bo_name, "/xpvd/xnf@0");
4660	} else if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
4661	    DDI_PROP_DONTPASS, "bootpath", &propstr)
4662	    == DDI_SUCCESS) {
4663		(void) strncpy(rootfs.bo_name, propstr, BO_MAXOBJNAME);
4664		ddi_prop_free(propstr);
4665	} else {
4666		/* attempt to determine netdev_path via boot_mac address */
4667		netdev_path = strplumb_get_netdev_path();
4668		if (netdev_path == NULL)
4669			panic("cannot find boot network interface");
4670		(void) strncpy(rootfs.bo_name, netdev_path, BO_MAXOBJNAME);
4671	}
4672	*fstypp = rootfs.bo_fstype;
4673	*fsmodp = "nfs";
4674}
4675#endif
4676
4677/*
4678 * VFS feature routines
4679 */
4680
/*
 * A feature value is split into two 32-bit halves:
 *	VFTINDEX() yields the upper half (bits 32-63), which
 *	vfs_set_feature()/vfs_has_feature() use as the index into
 *	vfs_featureset[];
 *	VFTBITS() yields the lower half (bits 0-31), which they use as the
 *	bit mask within that element.
 */
#define	VFTINDEX(feature)	(((feature) >> 32) & 0xFFFFFFFF)
#define	VFTBITS(feature)	((feature) & 0xFFFFFFFFLL)
4683
4684/* Register a feature in the vfs */
4685void
4686vfs_set_feature(vfs_t *vfsp, vfs_feature_t feature)
4687{
4688	/* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4689	if (vfsp->vfs_implp == NULL)
4690		return;
4691
4692	vfsp->vfs_featureset[VFTINDEX(feature)] |= VFTBITS(feature);
4693}
4694
4695/*
4696 * Query a vfs for a feature.
4697 * Returns 1 if feature is present, 0 if not
4698 */
4699int
4700vfs_has_feature(vfs_t *vfsp, vfs_feature_t feature)
4701{
4702	int	ret = 0;
4703
4704	/* Note that vfs_featureset[] is found in *vfsp->vfs_implp */
4705	if (vfsp->vfs_implp == NULL)
4706		return (ret);
4707
4708	if (vfsp->vfs_featureset[VFTINDEX(feature)] & VFTBITS(feature))
4709		ret = 1;
4710
4711	return (ret);
4712}
4713
4714/*
4715 * Propagate feature set from one vfs to another
4716 */
4717void
4718vfs_propagate_features(vfs_t *from, vfs_t *to)
4719{
4720	int i;
4721
4722	if (to->vfs_implp == NULL || from->vfs_implp == NULL)
4723		return;
4724
4725	for (i = 1; i <= to->vfs_featureset[0]; i++) {
4726		to->vfs_featureset[i] = from->vfs_featureset[i];
4727	}
4728}
4729
4730#define	LOFICTL_PATH "/devices/pseudo/lofi@0:%d"
4731
4732/*
4733 * Return the vnode for the lofi node if there's a lofi mount in place.
4734 * Returns -1 when there's no lofi node, 0 on success, and > 0 on
4735 * failure.
4736 */
4737int
4738vfs_get_lofi(vfs_t *vfsp, vnode_t **vpp)
4739{
4740	char *path = NULL;
4741	int strsize;
4742	int err;
4743
4744	if (vfsp->vfs_lofi_minor == 0) {
4745		*vpp = NULL;
4746		return (-1);
4747	}
4748
4749	strsize = snprintf(NULL, 0, LOFICTL_PATH, vfsp->vfs_lofi_minor);
4750	path = kmem_alloc(strsize + 1, KM_SLEEP);
4751	(void) snprintf(path, strsize + 1, LOFICTL_PATH, vfsp->vfs_lofi_minor);
4752
4753	err = lookupname(path, UIO_SYSSPACE, FOLLOW, NULLVPP, vpp);
4754
4755	if (err)
4756		*vpp = NULL;
4757
4758	kmem_free(path, strsize + 1);
4759	return (err);
4760}
4761