/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 * $Id: vfs_subr.c,v 1.92 1997/08/21 20:33:39 bde Exp $
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"
#include "opt_devfs.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/dirent.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <sys/sysctl.h>

#include <miscfs/specfs/specdev.h>

#ifdef DDB
extern void	printlockedvnodes __P((void));
#endif
static void	vclean __P((struct vnode *vp, int flags, struct proc *p));
static void	vgonel __P((struct vnode *vp, struct proc *p));
unsigned long	numvnodes;
SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
static void	vputrele __P((struct vnode *vp, int put));

enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};
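
#ifdef notdef
/*
 * Illustrative sketch, not compiled: the tables above back the IFTOVT()
 * and VTTOIF() macros (assumed here to be the <sys/vnode.h> definitions),
 * which convert between stat(2) S_IFMT file type bits and vnode types,
 * e.g. IFTOVT(S_IFDIR) == VDIR and VTTOIF(VDIR) == S_IFDIR.  The helper
 * name below is hypothetical.
 */
static enum vtype
example_mode_to_vtype(mode_t mode)
{
	/* IFTOVT() indexes iftovt_tab by the high four type bits. */
	return (iftovt_tab[(mode & S_IFMT) >> 12]);
}
#endif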

/*
 * Insq/Remq for the vnode usage lists.
 */
#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define	bufremvn(bp) {							\
	LIST_REMOVE(bp, b_vnbufs);					\
	(bp)->b_vnbufs.le_next = NOLIST;				\
}
TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */
static u_long freevnodes = 0;

struct mntlist mountlist;	/* mounted filesystem list */
struct simplelock mountlist_slock;
static struct simplelock mntid_slock;
struct simplelock mntvnode_slock;
struct simplelock vnode_free_list_slock;
static struct simplelock spechash_slock;
struct nfs_public nfs_pub;	/* publicly exported FS */

int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "");

static void	vfs_free_addrlist __P((struct netexport *nep));
static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
static int	vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
				       struct export_args *argp));

/*
 * Initialize the vnode management data structures.
 */
void
vntblinit()
{

	desiredvnodes = maxproc + vm_object_cache_max;
	simple_lock_init(&mntvnode_slock);
	simple_lock_init(&mntid_slock);
	simple_lock_init(&spechash_slock);
	TAILQ_INIT(&vnode_free_list);
	simple_lock_init(&vnode_free_list_slock);
	CIRCLEQ_INIT(&mountlist);
}

/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting. Interlock is not released on failure.
 */
int
vfs_busy(mp, flags, interlkp, p)
	struct mount *mp;
	int flags;
	struct simplelock *interlkp;
	struct proc *p;
{
	int lkflags;

	if (mp->mnt_flag & MNT_UNMOUNT) {
		if (flags & LK_NOWAIT)
			return (ENOENT);
		mp->mnt_flag |= MNT_MWAIT;
		if (interlkp) {
			simple_unlock(interlkp);
		}
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
		if (interlkp) {
			simple_lock(interlkp);
		}
		return (ENOENT);
	}
	lkflags = LK_SHARED;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(mp, p)
	struct mount *mp;
	struct proc *p;
{

	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
}
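
#ifdef notdef
/*
 * Illustrative sketch, not compiled: the usual way to walk the mount
 * list, taking a busy reference on each mount so that it cannot be
 * unmounted while it is examined (compare printlockedvnodes() and
 * sysctl_vnode() below).  The function name is hypothetical.
 */
static void
example_walk_mounts(struct proc *p)
{
	struct mount *mp, *nmp;

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			/* Busy failure keeps the interlock held. */
			nmp = mp->mnt_list.cqe_next;
			continue;
		}
		/* ... examine mp here, interlock released ... */
		simple_lock(&mountlist_slock);
		nmp = mp->mnt_list.cqe_next;
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);
}
#endif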

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(fstypename, devname, mpp)
	char *fstypename;
	char *devname;
	struct mount **mpp;
{
	struct proc *p = curproc;	/* XXX */
	struct vfsconf *vfsp;
	struct mount *mp;

	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	bzero((char *)mp, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
	LIST_INIT(&mp->mnt_vnodelist);
	mp->mnt_vfc = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfc_refcount++;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = 0;
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Find an appropriate filesystem to use for the root. If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 */
#ifdef notdef	/* XXX JH */
int
lite2_vfs_mountroot(void)
{
	struct vfsconf *vfsp;
	extern int (*lite2_mountroot)(void);
	int error;

	if (lite2_mountroot != NULL)
		return ((*lite2_mountroot)());
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL)
			continue;
		if ((error = (*vfsp->vfc_mountroot)()) == 0)
			return (0);
		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	}
	return (ENODEV);
}
#endif

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
	    mp = mp->mnt_list.cqe_next) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			simple_unlock(&mountlist_slock);
			return (mp);
		}
	}
	simple_unlock(&mountlist_slock);
	return ((struct mount *) 0);
}

/*
 * Get a new unique fsid
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_short xxxfs_mntid;

	fsid_t tfsid;
	int mtype;

	simple_lock(&mntid_slock);
	mtype = mp->mnt_vfc->vfc_typenum;
	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
	mp->mnt_stat.f_fsid.val[1] = mtype;
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
	tfsid.val[1] = mtype;
	if (mountlist.cqh_first != (void *)&mountlist) {
		while (vfs_getvfs(&tfsid)) {
			tfsid.val[0]++;
			xxxfs_mntid++;
		}
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	simple_unlock(&mntid_slock);
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
	register struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
	    vap->va_fsid = vap->va_fileid =
	    vap->va_blocksize = vap->va_rdev =
	    vap->va_atime.tv_sec = vap->va_atime.tv_nsec =
	    vap->va_mtime.tv_sec = vap->va_mtime.tv_nsec =
	    vap->va_ctime.tv_sec = vap->va_ctime.tv_nsec =
	    vap->va_flags = vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}
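
#ifdef notdef
/*
 * Illustrative sketch, not compiled: the usual vattr_null() idiom for an
 * attribute update.  Every field is first set to VNOVAL ("do not change")
 * and only the fields of interest are then filled in before VOP_SETATTR()
 * (assuming the VOP_SETATTR() interface of this era).  The surrounding
 * function name is hypothetical.
 */
static int
example_set_size(struct vnode *vp, u_quad_t len, struct ucred *cred,
	struct proc *p)
{
	struct vattr va;

	vattr_null(&va);
	va.va_size = len;	/* the only attribute actually changed */
	return (VOP_SETATTR(vp, &va, cred, p));
}
#endif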

/*
 * Routines having to do with the management of the vnode table.
 */
extern vop_t **dead_vnodeop_p;

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp;

	/*
	 * We take the least recently used vnode from the freelist
	 * if we can get it and it has no cached pages and no
	 * namecache entries relative to it.
	 * Otherwise we allocate a new vnode.
	 */

	simple_lock(&vnode_free_list_slock);

	if (freevnodes >= desiredvnodes) {
		TAILQ_FOREACH(vp, &vnode_free_list, v_freelist) {
			if (!simple_lock_try(&vp->v_interlock))
				continue;
			if (vp->v_usecount)
				panic("free vnode isn't");

			if (vp->v_object && vp->v_object->resident_page_count) {
				/* Don't recycle if it's caching some pages */
				simple_unlock(&vp->v_interlock);
				continue;
			} else if (LIST_FIRST(&vp->v_cache_src)) {
				/* Don't recycle if active in the namecache */
				simple_unlock(&vp->v_interlock);
				continue;
			} else {
				break;
			}
		}
	} else {
		vp = NULL;
	}

	if (vp) {
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		freevnodes--;
		/* see comment on why 0xdeadb is set at end of vgone (below) */
		vp->v_freelist.tqe_prev = (struct vnode **) 0xdeadb;
		simple_unlock(&vnode_free_list_slock);
		vp->v_lease = NULL;
		if (vp->v_type != VBAD)
			vgonel(vp, p);
		else {
			simple_unlock(&vp->v_interlock);
		}

#ifdef DIAGNOSTIC
		{
			int s;

			if (vp->v_data)
				panic("cleaned vnode isn't");
			s = splbio();
			if (vp->v_numoutput)
				panic("Clean vnode has pending I/O's");
			splx(s);
		}
#endif
		vp->v_flag = 0;
		vp->v_lastr = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
		vp->v_writecount = 0;	/* XXX */
	} else {
		simple_unlock(&vnode_free_list_slock);
		vp = (struct vnode *) malloc((u_long) sizeof *vp,
		    M_VNODE, M_WAITOK);
		bzero((char *) vp, sizeof *vp);
		vp->v_dd = vp;
		LIST_INIT(&vp->v_cache_src);
		TAILQ_INIT(&vp->v_cache_dst);
		numvnodes++;
	}

	vp->v_type = VNON;
	cache_purge(vp);
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	return (0);
}

/*
 * Move a vnode from one mount queue to another.
 */
void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL) {
		simple_unlock(&mntvnode_slock);
		return;
	}
	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp)) {
		vp->v_numoutput--;
		if (vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t) &vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;
	vm_object_t object;

	if (flags & V_SAVE) {
		if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)))
			return (error);
		if (vp->v_dirtyblkhd.lh_first != NULL)
			panic("vinvalbuf: dirty bufs");
	}

	s = splbio();
	for (;;) {
		if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA))
			while (blist && blist->b_lblkno < 0)
				blist = blist->b_vnbufs.le_next;
		if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
		    (flags & V_SAVEMETA))
			while (blist && blist->b_lblkno < 0)
				blist = blist->b_vnbufs.le_next;
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = bp->b_vnbufs.le_next;
			if ((flags & V_SAVEMETA) && bp->b_lblkno < 0)
				continue;
			if (bp->b_flags & B_BUSY) {
				bp->b_flags |= B_WANTED;
				error = tsleep((caddr_t) bp,
				    slpflag | (PRIBIO + 1), "vinvalbuf",
				    slptimeo);
				if (error) {
					splx(s);
					return (error);
				}
				break;
			}
			bremfree(bp);
			bp->b_flags |= B_BUSY;
			/*
			 * XXX Since there are no node locks for NFS, I
			 * believe there is a slight chance that a delayed
			 * write will occur while sleeping just above, so
			 * check for it.
			 */
			if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
				(void) VOP_BWRITE(bp);
				break;
			}
			bp->b_flags |= (B_INVAL|B_NOCACHE|B_RELBUF);
			brelse(bp);
		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
	}

	splx(s);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	object = vp->v_object;
	if (object != NULL) {
		vm_object_page_remove(object, 0, object->size,
		    (flags & V_SAVE) ? TRUE : FALSE);
	}
	if (!(flags & V_SAVEMETA) &&
	    (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first))
		panic("vinvalbuf: flush failed");
	return (0);
}
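
#ifdef notdef
/*
 * Illustrative sketch, not compiled: a typical vinvalbuf() call, flushing
 * every buffer of a vnode before the underlying object is reused.  V_SAVE
 * asks that dirty buffers be written back rather than discarded; a zero
 * slpflag and slptimeo mean an uninterruptible sleep with no timeout.
 * The function name is hypothetical.
 */
static int
example_flush_vnode(struct vnode *vp, struct ucred *cred, struct proc *p)
{
	return (vinvalbuf(vp, V_SAVE, cred, p, 0, 0));
}
#endif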

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
	int s;

	if (bp->b_vp)
		panic("bgetvp: not free");
	VHOLD(vp);
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	s = splbio();
	bufinsvn(bp, &vp->v_cleanblkhd);
	splx(s);
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;
	int s;

	if (bp->b_vp == (struct vnode *) 0)
		panic("brelvp: NULL");
	/*
	 * Delete from old vnode list, if on one.
	 */
	s = splbio();
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
	splx(s);

	vp = bp->b_vp;
	bp->b_vp = (struct vnode *) 0;
	HOLDRELE(vp);
}

/*
 * Associate a p-buffer with a vnode.
 */
void
pbgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
#if defined(DIAGNOSTIC)
	if (bp->b_vp)
		panic("pbgetvp: not free");
#endif
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
}

/*
 * Disassociate a p-buffer from a vnode.
 */
void
pbrelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;

#if defined(DIAGNOSTIC)
	if (bp->b_vp == (struct vnode *) 0)
		panic("pbrelvp: NULL");
#endif

	bp->b_vp = (struct vnode *) 0;
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
void
reassignbuf(bp, newvp)
	register struct buf *bp;
	register struct vnode *newvp;
{
	int s;

	if (newvp == NULL) {
		printf("reassignbuf: NULL");
		return;
	}

	s = splbio();
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
	/*
	 * If dirty, put on list of dirty buffers; otherwise insert onto list
	 * of clean buffers.
	 */
	if (bp->b_flags & B_DELWRI) {
		struct buf *tbp;

		tbp = newvp->v_dirtyblkhd.lh_first;
		if (!tbp || (tbp->b_lblkno > bp->b_lblkno)) {
			bufinsvn(bp, &newvp->v_dirtyblkhd);
		} else {
			while (tbp->b_vnbufs.le_next &&
				(tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) {
				tbp = tbp->b_vnbufs.le_next;
			}
			LIST_INSERT_AFTER(tbp, bp, b_vnbufs);
		}
	} else {
		bufinsvn(bp, &newvp->v_cleanblkhd);
	}
	splx(s);
}

#ifndef DEVFS_ROOT
/*
 * Create a vnode for a block device.
 * Used for root filesystem, argdev, and swap areas.
 * Also used for memory file system special devices.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV)
		return (0);
	error = getnewvnode(VT_NON, (struct mount *) 0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = 0;
		return (error);
	}
	vp = nvp;
	vp->v_type = VBLK;
	if ((nvp = checkalias(vp, dev, (struct mount *) 0))) {
		vput(vp);
		vp = nvp;
	}
	*vpp = vp;
	return (0);
}
#endif /* !DEVFS_ROOT */

/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device). If such an alias exists, deallocate
 * the existing contents and return the aliased vnode. The
 * caller is responsible for filling it with its new contents.
 */
struct vnode *
checkalias(nvp, nvp_rdev, mp)
	register struct vnode *nvp;
	dev_t nvp_rdev;
	struct mount *mp;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp;
	struct vnode **vpp;

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		return (NULLVP);

	vpp = &speclisth[SPECHASH(nvp_rdev)];
loop:
	simple_lock(&spechash_slock);
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		simple_lock(&vp->v_interlock);
		if (vp->v_usecount == 0) {
			simple_unlock(&spechash_slock);
			vgonel(vp, p);
			goto loop;
		}
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
			simple_unlock(&spechash_slock);
			goto loop;
		}
		break;
	}
	if (vp == NULL || vp->v_tag != VT_NON) {
		MALLOC(nvp->v_specinfo, struct specinfo *,
		    sizeof(struct specinfo), M_VNODE, M_WAITOK);
		nvp->v_rdev = nvp_rdev;
		nvp->v_hashchain = vpp;
		nvp->v_specnext = *vpp;
		nvp->v_specflags = 0;
		simple_unlock(&spechash_slock);
		*vpp = nvp;
		if (vp != NULLVP) {
			nvp->v_flag |= VALIASED;
			vp->v_flag |= VALIASED;
			vput(vp);
		}
		return (NULLVP);
	}
	simple_unlock(&spechash_slock);
	VOP_UNLOCK(vp, 0, p);
	simple_lock(&vp->v_interlock);
	vclean(vp, 0, p);
	vp->v_op = nvp->v_op;
	vp->v_tag = nvp->v_tag;
	nvp->v_type = VNON;
	insmntque(vp, mp);
	return (vp);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. The vnode lock bit is set if the
 * vnode is being eliminated in vgone. The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, flags, p)
	register struct vnode *vp;
	int flags;
	struct proc *p;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure. Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */
	if ((flags & LK_INTERLOCK) == 0) {
		simple_lock(&vp->v_interlock);
	}
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vget", 0);
		return (ENOENT);
	}
	if (vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
		freevnodes--;
	}
	vp->v_usecount++;
	/*
	 * Create the VM object, if needed
	 */
	if ((vp->v_type == VREG) &&
		((vp->v_object == NULL) ||
			(vp->v_object->flags & OBJ_VFS_REF) == 0 ||
			(vp->v_object->flags & OBJ_DEAD))) {
		/*
		 * XXX vfs_object_create probably needs the interlock.
		 */
		simple_unlock(&vp->v_interlock);
		vfs_object_create(vp, curproc, curproc->p_ucred, 0);
		simple_lock(&vp->v_interlock);
	}
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0)
			vrele(vp);
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}
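
#ifdef notdef
/*
 * Illustrative sketch, not compiled: the canonical vget()/vput() pairing
 * for a caller that finds a vnode on a shared list and wants to operate
 * on it.  vget() both references and, with LK_EXCLUSIVE, locks the vnode,
 * failing if the vnode is being cleaned out.  The function name is
 * hypothetical; vget(), VOP_FSYNC() and vput() are as defined here.
 */
static int
example_use_vnode(struct vnode *vp, struct ucred *cred, struct proc *p)
{
	int error;

	if ((error = vget(vp, LK_EXCLUSIVE, p)) != 0)
		return (error);
	error = VOP_FSYNC(vp, cred, MNT_WAIT, p);
	vput(vp);		/* unlock and release in one call */
	return (error);
}
#endif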

/*
 * Stubs to use when there is no locking to be done on the underlying object.
 * A minimal shared lock is necessary to ensure that the underlying object
 * is not revoked while an operation is in progress. So, an active shared
 * count is maintained in an auxiliary vnode lock structure.
 */
int
vop_sharedlock(ap)
	struct vop_lock_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct proc *a_p;
	} */ *ap;
{
	/*
	 * This code cannot be used until all the non-locking filesystems
	 * (notably NFS) are converted to properly lock and release nodes.
	 * Also, certain vnode operations change the locking state within
	 * the operation (create, mknod, remove, link, rename, mkdir, rmdir,
	 * and symlink). Ideally these operations should not change the
	 * lock state, but should be changed to let the caller of the
	 * function unlock them. Otherwise all intermediate vnode layers
	 * (such as union, umapfs, etc) must catch these functions to do
	 * the necessary locking at their layer. Note that the inactive
	 * and lookup operations also change their lock state, but this
	 * cannot be avoided, so these two operations will always need
	 * to be handled in intermediate layers.
	 */
	struct vnode *vp = ap->a_vp;
	int vnflags, flags = ap->a_flags;

	if (vp->v_vnlock == NULL) {
		if ((flags & LK_TYPE_MASK) == LK_DRAIN)
			return (0);
		MALLOC(vp->v_vnlock, struct lock *, sizeof(struct lock),
		    M_VNODE, M_WAITOK);
		lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
	}
	switch (flags & LK_TYPE_MASK) {
	case LK_DRAIN:
		vnflags = LK_DRAIN;
		break;
	case LK_EXCLUSIVE:
#ifdef DEBUG_VFS_LOCKS
		/*
		 * Normally, we use shared locks here, but that confuses
		 * the locking assertions.
		 */
		vnflags = LK_EXCLUSIVE;
		break;
#endif
	case LK_SHARED:
		vnflags = LK_SHARED;
		break;
	case LK_UPGRADE:
	case LK_EXCLUPGRADE:
	case LK_DOWNGRADE:
		return (0);
	case LK_RELEASE:
	default:
		panic("vop_sharedlock: bad operation %d", flags & LK_TYPE_MASK);
	}
	if (flags & LK_INTERLOCK)
		vnflags |= LK_INTERLOCK;
	return (lockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p));
}

/*
 * Stubs to use when there is no locking to be done on the underlying object.
 * A minimal shared lock is necessary to ensure that the underlying object
 * is not revoked while an operation is in progress. So, an active shared
 * count is maintained in an auxiliary vnode lock structure.
 */
int
vop_nolock(ap)
	struct vop_lock_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct proc *a_p;
	} */ *ap;
{
#ifdef notyet
	/*
	 * This code cannot be used until all the non-locking filesystems
	 * (notably NFS) are converted to properly lock and release nodes.
	 * Also, certain vnode operations change the locking state within
	 * the operation (create, mknod, remove, link, rename, mkdir, rmdir,
	 * and symlink). Ideally these operations should not change the
	 * lock state, but should be changed to let the caller of the
	 * function unlock them. Otherwise all intermediate vnode layers
	 * (such as union, umapfs, etc) must catch these functions to do
	 * the necessary locking at their layer. Note that the inactive
	 * and lookup operations also change their lock state, but this
	 * cannot be avoided, so these two operations will always need
	 * to be handled in intermediate layers.
	 */
	struct vnode *vp = ap->a_vp;
	int vnflags, flags = ap->a_flags;

	if (vp->v_vnlock == NULL) {
		if ((flags & LK_TYPE_MASK) == LK_DRAIN)
			return (0);
		MALLOC(vp->v_vnlock, struct lock *, sizeof(struct lock),
		    M_VNODE, M_WAITOK);
		lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
	}
	switch (flags & LK_TYPE_MASK) {
	case LK_DRAIN:
		vnflags = LK_DRAIN;
		break;
	case LK_EXCLUSIVE:
	case LK_SHARED:
		vnflags = LK_SHARED;
		break;
	case LK_UPGRADE:
	case LK_EXCLUPGRADE:
	case LK_DOWNGRADE:
		return (0);
	case LK_RELEASE:
	default:
		panic("vop_nolock: bad operation %d", flags & LK_TYPE_MASK);
	}
	if (flags & LK_INTERLOCK)
		vnflags |= LK_INTERLOCK;
	return (lockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p));
#else /* for now */
	/*
	 * Since we are not using the lock manager, we must clear
	 * the interlock here.
	 */
	if (ap->a_flags & LK_INTERLOCK) {
		simple_unlock(&ap->a_vp->v_interlock);
	}
	return (0);
#endif
}

/*
 * Do the inverse of vop_nolock, handling the interlock in a compatible way.
 */
int
vop_nounlock(ap)
	struct vop_unlock_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;

	if (vp->v_vnlock == NULL) {
		if (ap->a_flags & LK_INTERLOCK)
			simple_unlock(&ap->a_vp->v_interlock);
		return (0);
	}
	return (lockmgr(vp->v_vnlock, LK_RELEASE | ap->a_flags,
		&ap->a_vp->v_interlock, ap->a_p));
}

/*
1030 */
1031int
1032vop_noislocked(ap)
1033	struct vop_islocked_args /* {
1034		struct vnode *a_vp;
1035	} */ *ap;
1036{
1037	struct vnode *vp = ap->a_vp;
1038
1039	if (vp->v_vnlock == NULL)
1040		return (0);
1041	return (lockstatus(vp->v_vnlock));
1042}
1043
1044/* #ifdef DIAGNOSTIC */
1045/*
1046 * Vnode reference, just increment the count
1047 */
1048void
1049vref(vp)
1050	struct vnode *vp;
1051{
1052	simple_lock(&vp->v_interlock);
1053	if (vp->v_usecount <= 0)
1054		panic("vref used where vget required");
1055
1056	vp->v_usecount++;
1057
1058	if ((vp->v_type == VREG) &&
1059		((vp->v_object == NULL) ||
1060			((vp->v_object->flags & OBJ_VFS_REF) == 0) ||
1061			(vp->v_object->flags & OBJ_DEAD))) {
		/*
		 * We need to keep the vnode locked while the object
		 * is created.  This is necessary to keep the system
		 * from re-entrantly doing it multiple times.
		 * XXX vfs_object_create probably needs the interlock?
		 */
		simple_unlock(&vp->v_interlock);
		vfs_object_create(vp, curproc, curproc->p_ucred, 0);
		return;
	}
	simple_unlock(&vp->v_interlock);
}

/*
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
static void
vputrele(vp, put)
	struct vnode *vp;
	int put;
{
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vputrele: null vp");
#endif
	simple_lock(&vp->v_interlock);

	if ((vp->v_usecount == 2) &&
		vp->v_object &&
		(vp->v_object->flags & OBJ_VFS_REF)) {
		vp->v_usecount--;
		vp->v_object->flags &= ~OBJ_VFS_REF;
		if (put) {
			VOP_UNLOCK(vp, LK_INTERLOCK, p);
		} else {
			simple_unlock(&vp->v_interlock);
		}
		vm_object_deallocate(vp->v_object);
		return;
	}

	if (vp->v_usecount > 1) {
		vp->v_usecount--;
		if (put) {
			VOP_UNLOCK(vp, LK_INTERLOCK, p);
		} else {
			simple_unlock(&vp->v_interlock);
		}
		return;
	}

	if (vp->v_usecount < 1) {
#ifdef DIAGNOSTIC
		vprint("vputrele: negative ref count", vp);
#endif
		panic("vputrele: negative ref cnt");
	}

	/*
	 * If we are doing a vput, the node is already locked, and we must
	 * call VOP_INACTIVE with the node locked.  So, in the case of
	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
	 */
	if (put) {
		simple_unlock(&vp->v_interlock);
		VOP_INACTIVE(vp, p);
	} else if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
		VOP_INACTIVE(vp, p);
	}

	vp->v_usecount--;
	simple_lock(&vnode_free_list_slock);
	if (vp->v_flag & VAGE) {
		vp->v_flag &= ~VAGE;
		if (vp->v_tag != VT_TFS)
			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	} else {
		if (vp->v_tag != VT_TFS)
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	}
	freevnodes++;
	simple_unlock(&vnode_free_list_slock);
}

/*
 * vput(), just unlock and vrele()
 */
void
vput(vp)
	struct vnode *vp;
{
	vputrele(vp, 1);
}

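/*
 * vrele(), just release a reference to the vnode.
 */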
void
vrele(vp)
	struct vnode *vp;
{
	vputrele(vp, 0);
}

#ifdef DIAGNOSTIC
/*
 * Page or buffer structure gets a reference.
 */
void
vhold(vp)
	register struct vnode *vp;
{

	simple_lock(&vp->v_interlock);
	vp->v_holdcnt++;
	simple_unlock(&vp->v_interlock);
}

/*
 * Page or buffer structure frees a reference.
 */
void
holdrele(vp)
	register struct vnode *vp;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_holdcnt <= 0)
		panic("holdrele: holdcnt");
	vp->v_holdcnt--;
	simple_unlock(&vp->v_interlock);
}
#endif /* DIAGNOSTIC */

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If MNT_FORCE is specified, detach any active vnodes
 * that are found.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (vp->v_mount != mp)
			goto loop;
		nvp = vp->v_mntvnodes.le_next;
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;

		simple_lock(&vp->v_interlock);
		/*
		 * Skip over any vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file vnodes
		 * open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}

		/*
		 * With v_usecount == 0, all we need to do is clear out the
		 * vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}

		/*
		 * If FORCECLOSE is set, forcibly close the vnode. For block
		 * or character devices, revert to an anonymous device. For
		 * all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *) 0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DIAGNOSTIC
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
 */
static void
vclean(struct vnode *vp, int flags, struct proc *p)
{
	int active, irefed;
	vm_object_t object;

	/*
	 * Check to see if the vnode is in use. If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	if ((active = vp->v_usecount))
		vp->v_usecount++;
	/*
	 * Prevent the vnode from being recycled or brought into use while we
	 * clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock");
	vp->v_flag |= VXLOCK;
	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);

	object = vp->v_object;
	irefed = 0;
	if (object && ((object->flags & OBJ_DEAD) == 0)) {
		if (object->ref_count == 0) {
			vm_object_reference(object);
			irefed = 1;
		}
		++object->ref_count;
		pager_cache(object, FALSE);
	}

	/*
	 * Clean out any buffers associated with the vnode.
	 */
	if (flags & DOCLOSE)
		vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);

	if (irefed) {
		vm_object_deallocate(object);
	}

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, IO_NDELAY, NOCRED, p);
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0, p);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim");
	if (active)
		vrele(vp);
	cache_purge(vp);
	if (vp->v_vnlock) {
#ifdef DIAGNOSTIC
		if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
			vprint("vclean: lock not drained", vp);
#endif
		FREE(vp->v_vnlock, M_VNODE);
		vp->v_vnlock = NULL;
	}

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	vp->v_flag &= ~VXLOCK;
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		wakeup((caddr_t) vp);
	}
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
int
vop_revoke(ap)
	struct vop_revoke_args /* {
		struct vnode *a_vp;
		int a_flags;
	} */ *ap;
{
	struct vnode *vp, *vq;
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if ((ap->a_flags & REVOKEALL) == 0)
		panic("vop_revoke");
#endif

	vp = ap->a_vp;
	simple_lock(&vp->v_interlock);

	if (vp->v_flag & VALIASED) {
		/*
		 * If a vgone (or vclean) is already in progress,
		 * wait until it is done and return.
		 */
		if (vp->v_flag & VXLOCK) {
			vp->v_flag |= VXWANT;
			simple_unlock(&vp->v_interlock);
			tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
			return (0);
		}
		/*
		 * Ensure that vp will not be vgone'd while we
		 * are eliminating its aliases.
		 */
		vp->v_flag |= VXLOCK;
		simple_unlock(&vp->v_interlock);
		while (vp->v_flag & VALIASED) {
			simple_lock(&spechash_slock);
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type || vp == vq)
					continue;
				simple_unlock(&spechash_slock);
				vgone(vq);
				break;
			}
			if (vq == NULLVP) {
				simple_unlock(&spechash_slock);
			}
		}
		/*
		 * Remove the lock so that vgone below will
		 * really eliminate the vnode after which time
		 * vgone will awaken any sleepers.
		 */
		simple_lock(&vp->v_interlock);
		vp->v_flag &= ~VXLOCK;
	}
	vgonel(vp, p);
	return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct simplelock *inter_lkp;
	struct proc *p;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		if (inter_lkp) {
			simple_unlock(inter_lkp);
		}
		vgonel(vp, p);
		return (1);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	simple_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 */
static void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	struct vnode *vq;
	struct vnode *vx;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vgone", 0);
		return;
	}

	if (vp->v_object) {
		vp->v_object->flags |= OBJ_VNODE_GONE;
	}

	/*
	 * Clean out the filesystem specific data.
	 */
	vclean(vp, DOCLOSE, p);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
		simple_lock(&spechash_slock);
		if (*vp->v_hashchain == vp) {
			*vp->v_hashchain = vp->v_specnext;
		} else {
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_specnext != vp)
					continue;
				vq->v_specnext = vp->v_specnext;
				break;
			}
			if (vq == NULL)
				panic("missing bdev");
		}
		if (vp->v_flag & VALIASED) {
			vx = NULL;
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type)
					continue;
				if (vx)
					break;
				vx = vq;
			}
			if (vx == NULL)
				panic("missing alias");
			if (vq == NULL)
				vx->v_flag &= ~VALIASED;
			vp->v_flag &= ~VALIASED;
		}
		simple_unlock(&spechash_slock);
		FREE(vp->v_specinfo, M_VNODE);
		vp->v_specinfo = NULL;
	}

	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list. The test of the back
	 * pointer and the reference count of zero is because
	 * it will be removed from the free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from the freelist to ensure
	 * that we do not try to move it here.
	 */
	if (vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
			vnode_free_list.tqh_first != vp) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		}
		simple_unlock(&vnode_free_list_slock);
	}

	vp->v_type = VBAD;
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	register struct vnode *vp;
	int rc = 0;

	simple_lock(&spechash_slock);
	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		*vpp = vp;
		rc = 1;
		break;
	}
	simple_unlock(&spechash_slock);
	return (rc);
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vp)
	register struct vnode *vp;
{
	struct vnode *vq, *vnext;
	int count;

loop:
	if ((vp->v_flag & VALIASED) == 0)
		return (vp->v_usecount);
	simple_lock(&spechash_slock);
	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
		vnext = vq->v_specnext;
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vq->v_usecount == 0 && vq != vp) {
			simple_unlock(&spechash_slock);
			vgone(vq);
			goto loop;
		}
		count += vq->v_usecount;
	}
	simple_unlock(&spechash_slock);
	return (count);
}

/*
 * Print out a description of a vnode.
 */
static char *typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};

void
vprint(label, vp)
	char *label;
	register struct vnode *vp;
{
	char buf[64];

	if (label != NULL)
		printf("%s: %x: ", label, vp);
	else
		printf("%x: ", vp);
	printf("type %s, usecount %d, writecount %d, refcount %ld,",
	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
	    vp->v_holdcnt);
	buf[0] = '\0';
	if (vp->v_flag & VROOT)
		strcat(buf, "|VROOT");
	if (vp->v_flag & VTEXT)
		strcat(buf, "|VTEXT");
	if (vp->v_flag & VSYSTEM)
		strcat(buf, "|VSYSTEM");
	if (vp->v_flag & VXLOCK)
		strcat(buf, "|VXLOCK");
	if (vp->v_flag & VXWANT)
		strcat(buf, "|VXWANT");
	if (vp->v_flag & VBWAIT)
		strcat(buf, "|VBWAIT");
	if (vp->v_flag & VALIASED)
		strcat(buf, "|VALIASED");
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DDB
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
void
printlockedvnodes()
{
	struct proc *p = curproc;	/* XXX */
	struct mount *mp, *nmp;
	struct vnode *vp;

	printf("Locked vnodes\n");
	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = mp->mnt_list.cqe_next;
			continue;
		}
		for (vp = mp->mnt_vnodelist.lh_first;
		     vp != NULL;
		     vp = vp->v_mntvnodes.le_next) {
			if (VOP_ISLOCKED(vp))
				vprint((char *)0, vp);
		}
		simple_lock(&mountlist_slock);
		nmp = mp->mnt_list.cqe_next;
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);
}
#endif
1728
1729/*
1730 * Top level filesystem related information gathering.
1731 */
1732static int	sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS);
1733
1734static int
1735vfs_sysctl SYSCTL_HANDLER_ARGS
1736{
1737	int *name = (int *)arg1 - 1;	/* XXX */
1738	u_int namelen = arg2 + 1;	/* XXX */
1739	struct vfsconf *vfsp;
1740
1741#ifndef NO_COMPAT_PRELITE2
1742	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
1743	if (namelen == 1)
1744		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
1745#endif
1746
1747#ifdef notyet
1748	/* all sysctl names at this level are at least name and field */
1749	if (namelen < 2)
1750		return (ENOTDIR);		/* overloaded */
1751	if (name[0] != VFS_GENERIC) {
1752		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
1753			if (vfsp->vfc_typenum == name[0])
1754				break;
1755		if (vfsp == NULL)
1756			return (EOPNOTSUPP);
1757		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
1758		    oldp, oldlenp, newp, newlen, p));
1759	}
1760#endif
1761	switch (name[1]) {
1762	case VFS_MAXTYPENUM:
1763		if (namelen != 2)
1764			return (ENOTDIR);
1765		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
1766	case VFS_CONF:
1767		if (namelen != 3)
1768			return (ENOTDIR);	/* overloaded */
1769		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
1770			if (vfsp->vfc_typenum == name[2])
1771				break;
1772		if (vfsp == NULL)
1773			return (EOPNOTSUPP);
1774		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
1775	}
1776	return (EOPNOTSUPP);
1777}
1778
1779SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
1780	"Generic filesystem");
1781
1782#ifndef NO_COMPAT_PRELITE2
1783
1784static int
1785sysctl_ovfs_conf SYSCTL_HANDLER_ARGS
1786{
1787	int error;
1788	struct vfsconf *vfsp;
1789	struct ovfsconf ovfs;
1790
1791	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1792		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
1793		strcpy(ovfs.vfc_name, vfsp->vfc_name);
1794		ovfs.vfc_index = vfsp->vfc_typenum;
1795		ovfs.vfc_refcount = vfsp->vfc_refcount;
1796		ovfs.vfc_flags = vfsp->vfc_flags;
1797		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
1798		if (error)
1799			return error;
1800	}
1801	return 0;
1802}
1803
1804#endif /* !NO_COMPAT_PRELITE2 */
1805
1806int kinfo_vdebug = 1;
1807int kinfo_vgetfailed;
1808
1809#define KINFO_VNODESLOP	10
1810/*
1811 * Dump vnode list (via sysctl).
1812 * Copyout address of vnode followed by vnode.
1813 */
1814/* ARGSUSED */
1815static int
1816sysctl_vnode SYSCTL_HANDLER_ARGS
1817{
1818	struct proc *p = curproc;	/* XXX */
1819	struct mount *mp, *nmp;
1820	struct vnode *nvp, *vp;
1821	int error;
1822
1823#define VPTRSZ	sizeof (struct vnode *)
1824#define VNODESZ	sizeof (struct vnode)
1825
	req->lock = 0;
	if (!req->oldptr) /* Make an estimate */
		return (SYSCTL_OUT(req, 0,
			(numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = mp->mnt_list.cqe_next;
			continue;
		}
again:
		simple_lock(&mntvnode_slock);
		for (vp = mp->mnt_vnodelist.lh_first;
		     vp != NULL;
		     vp = nvp) {
			/*
			 * Check that the vp is still associated with
			 * this filesystem.  RACE: could have been
			 * recycled onto the same filesystem.
			 */
			if (vp->v_mount != mp) {
				simple_unlock(&mntvnode_slock);
				if (kinfo_vdebug)
					printf("kinfo: vp changed\n");
				goto again;
			}
			nvp = vp->v_mntvnodes.le_next;
			simple_unlock(&mntvnode_slock);
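			/*
			 * SYSCTL_OUT() may fault on user memory, so it
			 * must be called with mntvnode_slock released.
			 */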
			if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
			    (error = SYSCTL_OUT(req, vp, VNODESZ))) {
				vfs_unbusy(mp, p);
				return (error);
			}
			simple_lock(&mntvnode_slock);
		}
		simple_unlock(&mntvnode_slock);
		simple_lock(&mountlist_slock);
		nmp = mp->mnt_list.cqe_next;
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);

	return (0);
}

/*
 * XXX
 * Exporting the vnode list on large systems causes them to crash.
 * Exporting the vnode list on medium systems causes sysctl to coredump.
 */
#if 0
SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
	0, 0, sysctl_vnode, "S,vnode", "");
#endif

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(vp)
	struct vnode *vp;
{
	struct vnode *vq;
	int error = 0;

	if (vp->v_specflags & SI_MOUNTEDON)
		return (EBUSY);
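	/*
	 * The device may have aliases (other vnodes for the same
	 * rdev); a mount on any alias counts as mounted on.
	 */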
	if (vp->v_flag & VALIASED) {
		simple_lock(&spechash_slock);
		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
			if (vq->v_rdev != vp->v_rdev ||
			    vq->v_type != vp->v_type)
				continue;
			if (vq->v_specflags & SI_MOUNTEDON) {
				error = EBUSY;
				break;
			}
		}
		simple_unlock(&spechash_slock);
	}
	return (error);
}

/*
 * Unmount all filesystems. The list is traversed in reverse order
 * of mounting to avoid dependencies.
 */
void
vfs_unmountall()
{
	struct mount *mp, *nmp;
	struct proc *p = initproc;	/* XXX XXX should this be proc0? */
	int error;

	/*
	 * Since this only runs when rebooting, it is not interlocked.
	 */
	for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
		nmp = mp->mnt_list.cqe_prev;
		error = dounmount(mp, MNT_FORCE, p);
		if (error) {
			printf("unmount of %s failed (",
			    mp->mnt_stat.f_mntonname);
			if (error == EBUSY)
				printf("BUSY)\n");
			else
				printf("%d)\n", error);
		}
	}
}

/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by vfs_export() to set up the lists of export addresses.
 */
static int
vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
	struct export_args *argp)
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	register int i;
	struct radix_node *rn;
	struct sockaddr *saddr, *smask = 0;
	struct domain *dom;
	int error;

	if (argp->ex_addrlen == 0) {
		if (mp->mnt_flag & MNT_DEFEXPORTED)
			return (EPERM);
		np = &nep->ne_defexported;
		np->netc_exflags = argp->ex_flags;
		np->netc_anon = argp->ex_anon;
		np->netc_anon.cr_ref = 1;
		mp->mnt_flag |= MNT_DEFEXPORTED;
		return (0);
	}
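	/*
	 * The netcred, the address, and the (optional) mask are
	 * allocated as a single block; saddr and smask point into
	 * the tail of the allocation.
	 */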
	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
	bzero((caddr_t) np, i);
	saddr = (struct sockaddr *) (np + 1);
	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
		goto out;
	if (saddr->sa_len > argp->ex_addrlen)
		saddr->sa_len = argp->ex_addrlen;
	if (argp->ex_masklen) {
		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
		error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
		if (error)
			goto out;
		if (smask->sa_len > argp->ex_masklen)
			smask->sa_len = argp->ex_masklen;
	}
	i = saddr->sa_family;
	if ((rnh = nep->ne_rtable[i]) == 0) {
		/*
		 * Seems silly to initialize every AF when most are not used;
		 * do so on demand here.
		 */
		for (dom = domains; dom; dom = dom->dom_next)
			if (dom->dom_family == i && dom->dom_rtattach) {
				dom->dom_rtattach((void **) &nep->ne_rtable[i],
				    dom->dom_rtoffset);
				break;
			}
		if ((rnh = nep->ne_rtable[i]) == 0) {
			error = ENOBUFS;
			goto out;
		}
	}
	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
	    np->netc_rnodes);
	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
		error = EPERM;
		goto out;
	}
	np->netc_exflags = argp->ex_flags;
	np->netc_anon = argp->ex_anon;
	np->netc_anon.cr_ref = 1;
	return (0);
out:
	free(np, M_NETADDR);
	return (error);
}

/* ARGSUSED */
static int
vfs_free_netcred(struct radix_node *rn, void *w)
{
	register struct radix_node_head *rnh = (struct radix_node_head *) w;

	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
	free((caddr_t) rn, M_NETADDR);
	return (0);
}

/*
 * Free the net address hash lists that are hanging off the mount points.
 */
static void
vfs_free_addrlist(struct netexport *nep)
{
	register int i;
	register struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if ((rnh = nep->ne_rtable[i])) {
			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
			    (caddr_t) rnh);
			free((caddr_t) rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

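/*
 * Update the export list of a mount point: MNT_DELEXPORT tears
 * down the old exports before MNT_EXPORTED (and MNT_EXPUBLIC)
 * set up the new ones.
 */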
int
vfs_export(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		if (mp->mnt_flag & MNT_EXPUBLIC) {
			vfs_setpublicfs(NULL, NULL, NULL);
			mp->mnt_flag &= ~MNT_EXPUBLIC;
		}
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if (argp->ex_flags & MNT_EXPUBLIC) {
			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
				return (error);
			mp->mnt_flag |= MNT_EXPUBLIC;
		}
		if ((error = vfs_hang_addrlist(mp, nep, argp)))
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}


/*
 * Set the publicly exported filesystem (WebNFS).  Currently, only
 * one public filesystem is possible per the spec (RFC 2054 and 2055).
 */
int
vfs_setpublicfs(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;
	struct vnode *rvp;
	char *cp;

	/*
	 * mp == NULL -> invalidate the current info; the FS is
	 * no longer exported.  May be called from either vfs_export
	 * or unmount, so check that it hasn't already been done.
	 */
	if (mp == NULL) {
		if (nfs_pub.np_valid) {
			nfs_pub.np_valid = 0;
			if (nfs_pub.np_index != NULL) {
				FREE(nfs_pub.np_index, M_TEMP);
				nfs_pub.np_index = NULL;
			}
		}
		return (0);
	}

	/*
	 * Only one allowed at a time.
	 */
	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
		return (EBUSY);

	/*
	 * Get real filehandle for root of exported FS.
	 */
	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;

	if ((error = VFS_ROOT(mp, &rvp)))
		return (error);

	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) {
		vput(rvp);
		return (error);
	}
	vput(rvp);

	/*
	 * If an indexfile was specified, pull it in.
	 */
	if (argp->ex_indexfile != NULL) {
		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
		    M_WAITOK);
		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
		    MAXNAMLEN, (size_t *)0);
		if (!error) {
			/*
			 * Check for illegal filenames.
			 */
			for (cp = nfs_pub.np_index; *cp; cp++) {
				if (*cp == '/') {
					error = EINVAL;
					break;
				}
			}
		}
		if (error) {
			FREE(nfs_pub.np_index, M_TEMP);
			nfs_pub.np_index = NULL;
			return (error);
		}
	}

	nfs_pub.np_mount = mp;
	nfs_pub.np_valid = 1;
	return (0);
}

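/*
 * Look up an address in the export list of a mount point.  Returns
 * the matching netcred, or the default export if the address does
 * not match and a default was set.
 */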
struct netcred *
vfs_export_lookup(mp, nep, nam)
	register struct mount *mp;
	struct netexport *nep;
	struct sockaddr *nam;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
		 */
		if (nam != NULL) {
			saddr = nam;
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
					(*rnh->rnh_matchaddr)((caddr_t)saddr,
							      rnh);
				if (np && (np->netc_rnodes->rn_flags & RNF_ROOT))
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && (mp->mnt_flag & MNT_DEFEXPORTED))
			np = &nep->ne_defexported;
	}
	return (np);
}

/*
 * Perform msync on all vnodes under a mount point.
 * The mount point must be locked.
 */
void
vfs_msync(struct mount *mp, int flags)
{
	struct vnode *vp, *nvp;

loop:
	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {
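		/*
		 * If the vnode was recycled onto another filesystem
		 * behind our back, its list linkage is useless; rescan
		 * from the head of the list.
		 */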
		if (vp->v_mount != mp)
			goto loop;
		nvp = vp->v_mntvnodes.le_next;
		if (VOP_ISLOCKED(vp) && (flags != MNT_WAIT))
			continue;
		if (vp->v_object &&
		   (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
			vm_object_page_clean(vp->v_object, 0, 0, TRUE, TRUE);
		}
	}
}

/*
 * Create the VM object needed for VMIO and mmap support.  This
 * is done for all VREG files in the system.  Some filesystems may
 * also take advantage of the extra metadata buffering VMIO provides
 * by putting their device nodes into VMIO mode.
 */
int
vfs_object_create(vp, p, cred, waslocked)
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
	int waslocked;
{
	struct vattr vat;
	vm_object_t object;
	int error = 0;

retry:
	if ((object = vp->v_object) == NULL) {
		if (vp->v_type == VREG) {
			if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
				goto retn;
			(void) vnode_pager_alloc(vp,
				OFF_TO_IDX(round_page(vat.va_size)), 0, 0);
		} else {
			/*
			 * This simply allocates the biggest object possible
			 * for a VBLK vnode.  This should be fixed, but doesn't
			 * cause any problems (yet).
			 */
			(void) vnode_pager_alloc(vp, INT_MAX, 0, 0);
		}
		vp->v_object->flags |= OBJ_VFS_REF;
	} else {
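		/*
		 * An object being terminated (OBJ_DEAD) cannot be
		 * reused; drop the vnode lock if we hold it, wait for
		 * the termination to finish, and start over.
		 */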
		if (object->flags & OBJ_DEAD) {
			if (waslocked)
				VOP_UNLOCK(vp, 0, p);
			tsleep(object, PVM, "vodead", 0);
			if (waslocked)
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			goto retry;
		}
		if ((object->flags & OBJ_VFS_REF) == 0) {
			object->flags |= OBJ_VFS_REF;
			vm_object_reference(object);
		}
	}
	if (vp->v_object)
		vp->v_flag |= VVMIO;

retn:
	return (error);
}

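/*
 * If the vnode is inactive (zero usecount), move it to the tail of
 * the free list so that it is reclaimed as late as possible (LRU).
 */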
void
vtouch(vp)
	struct vnode *vp;
{
	simple_lock(&vp->v_interlock);
	if (vp->v_usecount) {
		simple_unlock(&vp->v_interlock);
		return;
	}
	if (simple_lock_try(&vnode_free_list_slock)) {
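		/*
		 * A tqe_prev of 0xdeadb marks a vnode that has been
		 * taken off the free list; only requeue the vnode if
		 * it is still on the list.
		 */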
		if (vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		}
		simple_unlock(&vnode_free_list_slock);
	}
	simple_unlock(&vp->v_interlock);
}
