vfs_export.c revision 23254
1/*
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
39 * $Id: vfs_subr.c,v 1.75 1997/02/27 16:08:43 bde Exp $
40 */
41
42/*
43 * External virtual filesystem routines
44 */
45#include "opt_ddb.h"
46#include "opt_devfs.h"
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/kernel.h>
51#include <sys/file.h>
52#include <sys/proc.h>
53#include <sys/mount.h>
54#include <sys/time.h>
55#include <sys/vnode.h>
56#include <sys/stat.h>
57#include <sys/namei.h>
58#include <sys/ucred.h>
59#include <sys/buf.h>
60#include <sys/errno.h>
61#include <sys/malloc.h>
62#include <sys/domain.h>
63#include <sys/mbuf.h>
64
65#include <vm/vm.h>
66#include <vm/vm_param.h>
67#include <vm/vm_object.h>
68#include <vm/vm_extern.h>
69#include <vm/vm_pager.h>
70#include <vm/vnode_pager.h>
71#include <sys/sysctl.h>
72
73#include <miscfs/specfs/specdev.h>
74
75#ifdef DDB
76extern void	printlockedvnodes __P((void));
77#endif
78static void	vclean __P((struct vnode *vp, int flags, struct proc *p));
79extern void	vgonel __P((struct vnode *vp, struct proc *p));
80unsigned long	numvnodes;
81extern void	vfs_unmountroot __P((struct mount *rootfs));
82extern void	vputrele __P((struct vnode *vp, int put));
83
84enum vtype iftovt_tab[16] = {
85	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
86	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
87};
88int vttoif_tab[9] = {
89	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
90	S_IFSOCK, S_IFIFO, S_IFMT,
91};
92
93/*
94 * Insq/Remq for the vnode usage lists.
95 */
96#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
97#define	bufremvn(bp) {							\
98	LIST_REMOVE(bp, b_vnbufs);					\
99	(bp)->b_vnbufs.le_next = NOLIST;				\
100}
101TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */
102static u_long freevnodes = 0;
103
104struct mntlist mountlist;	/* mounted filesystem list */
105struct simplelock mountlist_slock;
106static struct simplelock mntid_slock;
107struct simplelock mntvnode_slock;
108struct simplelock vnode_free_list_slock;
109static struct simplelock spechash_slock;
110
111int desiredvnodes;
112SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "");
113
114static void	vfs_free_addrlist __P((struct netexport *nep));
115static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
116static int	vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
117				       struct export_args *argp));
118
119/*
120 * Initialize the vnode management data structures.
121 */
122void
123vntblinit()
124{
125
126	desiredvnodes = maxproc + vm_object_cache_max;
127	simple_lock_init(&mntvnode_slock);
128	simple_lock_init(&mntid_slock);
129	simple_lock_init(&spechash_slock);
130	TAILQ_INIT(&vnode_free_list);
131	simple_lock_init(&vnode_free_list_slock);
132	CIRCLEQ_INIT(&mountlist);
133}
134
135/*
136 * Mark a mount point as busy. Used to synchronize access and to delay
137 * unmounting. Interlock is not released on failure.
138 */
139int
140vfs_busy(mp, flags, interlkp, p)
141	struct mount *mp;
142	int flags;
143	struct simplelock *interlkp;
144	struct proc *p;
145{
146	int lkflags;
147
148	if (mp->mnt_flag & MNT_UNMOUNT) {
149		if (flags & LK_NOWAIT)
150			return (ENOENT);
151		mp->mnt_flag |= MNT_MWAIT;
152		if (interlkp) {
153			simple_unlock(interlkp);
154		}
155		/*
156		 * Since all busy locks are shared except the exclusive
157		 * lock granted when unmounting, the only place that a
158		 * wakeup needs to be done is at the release of the
159		 * exclusive lock at the end of dounmount.
160		 */
161		tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
162		if (interlkp) {
163			simple_lock(interlkp);
164		}
165		return (ENOENT);
166	}
167	lkflags = LK_SHARED;
168	if (interlkp)
169		lkflags |= LK_INTERLOCK;
170	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
171		panic("vfs_busy: unexpected lock failure");
172	return (0);
173}
174
175/*
176 * Free a busy filesystem.
177 */
178void
179vfs_unbusy(mp, p)
180	struct mount *mp;
181	struct proc *p;
182{
183
184	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
185}
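
/*
 * Example use of vfs_busy()/vfs_unbusy(): walk the mount list, busying
 * each mount point so that it cannot be unmounted out from under us
 * while it is examined.  A minimal sketch; the function name and the
 * per-mount work are illustrative only.
 */
static void
example_foreach_mount(p)
	struct proc *p;
{
	struct mount *mp, *nmp;

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = mp->mnt_list.cqe_next;
			continue;
		}
		/* ... examine mp (e.g. its mnt_vnodelist) here ... */
		simple_lock(&mountlist_slock);
		nmp = mp->mnt_list.cqe_next;
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);
}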
186
187/*
188 * Lookup a filesystem type, and if found allocate and initialize
189 * a mount structure for it.
190 *
191 * Devname is usually updated by mount(8) after booting.
192 */
193int
194vfs_rootmountalloc(fstypename, devname, mpp)
195	char *fstypename;
196	char *devname;
197	struct mount **mpp;
198{
199	struct proc *p = curproc;	/* XXX */
200	struct vfsconf *vfsp;
201	struct mount *mp;
202
203	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
204		if (!strcmp(vfsp->vfc_name, fstypename))
205			break;
206	if (vfsp == NULL)
207		return (ENODEV);
208	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
209	bzero((char *)mp, (u_long)sizeof(struct mount));
210	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
211	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
212	LIST_INIT(&mp->mnt_vnodelist);
213	mp->mnt_vfc = vfsp;
214	mp->mnt_op = vfsp->vfc_vfsops;
215	mp->mnt_flag = MNT_RDONLY;
216	mp->mnt_vnodecovered = NULLVP;
217	vfsp->vfc_refcount++;
218	mp->mnt_stat.f_type = vfsp->vfc_typenum;
219	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
220	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
221	mp->mnt_stat.f_mntonname[0] = '/';
222	mp->mnt_stat.f_mntonname[1] = 0;
223	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
224	*mpp = mp;
225	return (0);
226}
227
228/*
229 * Find an appropriate filesystem to use for the root. If a filesystem
230 * has not been preselected, walk through the list of known filesystems
231 * that have mountroot routines, trying each until one
232 * works or we have tried them all.
233 */
234#ifdef notdef	/* XXX JH */
235int
236lite2_vfs_mountroot(void)
237{
238	struct vfsconf *vfsp;
239	extern int (*lite2_mountroot)(void);
240	int error;
241
242	if (lite2_mountroot != NULL)
243		return ((*lite2_mountroot)());
244	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
245		if (vfsp->vfc_mountroot == NULL)
246			continue;
247		if ((error = (*vfsp->vfc_mountroot)()) == 0)
248			return (0);
249		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
250	}
251	return (ENODEV);
252}
253#endif
254
255/*
256 * Lookup a mount point by filesystem identifier.
257 */
258struct mount *
259vfs_getvfs(fsid)
260	fsid_t *fsid;
261{
262	register struct mount *mp;
263
264	simple_lock(&mountlist_slock);
265	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
266	    mp = mp->mnt_list.cqe_next) {
267		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
268		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
269			simple_unlock(&mountlist_slock);
270			return (mp);
271		}
272	}
273	simple_unlock(&mountlist_slock);
274	return ((struct mount *) 0);
275}
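
/*
 * Example use of vfs_getvfs(): map a filesystem identifier, such as the
 * fsid stored in an NFS-style file handle, back to its mount point.  A
 * minimal sketch for illustration; fhandle_t comes from <sys/mount.h>.
 */
static struct mount *
example_fhandle_to_mount(fhp)
	fhandle_t *fhp;
{

	/* Returns NULL if the fsid no longer names a mounted filesystem. */
	return (vfs_getvfs(&fhp->fh_fsid));
}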
276
277/*
278 * Get a new unique fsid
279 */
280void
281vfs_getnewfsid(mp)
282	struct mount *mp;
283{
284	static u_short xxxfs_mntid;
285
286	fsid_t tfsid;
287	int mtype;
288
289	simple_lock(&mntid_slock);
290	mtype = mp->mnt_vfc->vfc_typenum;
291	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
292	mp->mnt_stat.f_fsid.val[1] = mtype;
293	if (xxxfs_mntid == 0)
294		++xxxfs_mntid;
295	tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
296	tfsid.val[1] = mtype;
297	if (mountlist.cqh_first != (void *)&mountlist) {
298		while (vfs_getvfs(&tfsid)) {
299			tfsid.val[0]++;
300			xxxfs_mntid++;
301		}
302	}
303	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
304	simple_unlock(&mntid_slock);
305}
306
307/*
308 * Set vnode attributes to VNOVAL
309 */
310void
311vattr_null(vap)
312	register struct vattr *vap;
313{
314
315	vap->va_type = VNON;
316	vap->va_size = VNOVAL;
317	vap->va_bytes = VNOVAL;
318	vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
319	    vap->va_fsid = vap->va_fileid =
320	    vap->va_blocksize = vap->va_rdev =
321	    vap->va_atime.tv_sec = vap->va_atime.tv_nsec =
322	    vap->va_mtime.tv_sec = vap->va_mtime.tv_nsec =
323	    vap->va_ctime.tv_sec = vap->va_ctime.tv_nsec =
324	    vap->va_flags = vap->va_gen = VNOVAL;
325	vap->va_vaflags = 0;
326}
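
/*
 * Example use of vattr_null(): callers of VOP_SETATTR() initialize the
 * vattr to VNOVAL and then fill in only the attributes they intend to
 * change, so the filesystem can tell "not set" apart from "set to zero".
 * A minimal chmod-style sketch for illustration; the vnode is assumed
 * to be locked by the caller.
 */
static int
example_set_mode(vp, mode, cred, p)
	struct vnode *vp;
	mode_t mode;
	struct ucred *cred;
	struct proc *p;
{
	struct vattr va;

	vattr_null(&va);
	va.va_mode = mode;
	return (VOP_SETATTR(vp, &va, cred, p));
}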
327
328/*
329 * Routines having to do with the management of the vnode table.
330 */
331extern vop_t **dead_vnodeop_p;
332
333/*
334 * Return the next vnode from the free list.
335 */
336int
337getnewvnode(tag, mp, vops, vpp)
338	enum vtagtype tag;
339	struct mount *mp;
340	vop_t **vops;
341	struct vnode **vpp;
342{
343	struct proc *p = curproc;	/* XXX */
344	struct vnode *vp;
345
346	simple_lock(&vnode_free_list_slock);
347retry:
348	/*
349	 * we allocate a new vnode if
350	 * 	1. we don't have any free
351	 *		Pretty obvious, we actually used to panic, but that
352	 *		is a silly thing to do.
353	 *	2. we haven't filled our pool yet
354	 *		We don't want to trash the incore (VM-)vnodecache.
355	 *	3. if less than 1/4th of our vnodes are free.
356	 *		We don't want to trash the namei cache either.
357	 */
358	if (freevnodes < (numvnodes >> 2) ||
359	    numvnodes < desiredvnodes ||
360	    vnode_free_list.tqh_first == NULL) {
361		simple_unlock(&vnode_free_list_slock);
362		vp = (struct vnode *) malloc((u_long) sizeof *vp,
363		    M_VNODE, M_WAITOK);
364		bzero((char *) vp, sizeof *vp);
365		numvnodes++;
366	} else {
367		for (vp = vnode_free_list.tqh_first;
368				vp != NULLVP; vp = vp->v_freelist.tqe_next) {
369			if (simple_lock_try(&vp->v_interlock))
370				break;
371		}
372		/*
373		 * Unless this is a bad time of the month, at most
374		 * the first NCPUS items on the free list are
375		 * locked, so this is close enough to being empty.
376		 */
377		if (vp == NULLVP) {
378			simple_unlock(&vnode_free_list_slock);
379			tablefull("vnode");
380			*vpp = 0;
381			return (ENFILE);
382		}
383		if (vp->v_usecount)
384			panic("free vnode isn't");
385		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
386		if (vp->v_usage > 0) {
387			simple_unlock(&vp->v_interlock);
388			--vp->v_usage;
389			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
390			goto retry;
391		}
392		freevnodes--;
393
394		/* see comment on why 0xdeadb is set at end of vgone (below) */
395		vp->v_freelist.tqe_prev = (struct vnode **) 0xdeadb;
396		simple_unlock(&vnode_free_list_slock);
397		vp->v_lease = NULL;
398		if (vp->v_type != VBAD)
399			vgonel(vp, p);
400		else {
401			simple_unlock(&vp->v_interlock);
402		}
403
404#ifdef DIAGNOSTIC
405		{
406			int s;
407
408			if (vp->v_data)
409				panic("cleaned vnode isn't");
410			s = splbio();
411			if (vp->v_numoutput)
412				panic("Clean vnode has pending I/O's");
413			splx(s);
414		}
415#endif
416		vp->v_flag = 0;
417		vp->v_lastr = 0;
418		vp->v_lastw = 0;
419		vp->v_lasta = 0;
420		vp->v_cstart = 0;
421		vp->v_clen = 0;
422		vp->v_socket = 0;
423		vp->v_writecount = 0;	/* XXX */
424		vp->v_usage = 0;
425	}
426	vp->v_type = VNON;
427	cache_purge(vp);
428	vp->v_tag = tag;
429	vp->v_op = vops;
430	insmntque(vp, mp);
431	*vpp = vp;
432	vp->v_usecount = 1;
433	vp->v_data = 0;
434	return (0);
435}
436
437/*
438 * Move a vnode from one mount queue to another.
439 */
440void
441insmntque(vp, mp)
442	register struct vnode *vp;
443	register struct mount *mp;
444{
445
446	simple_lock(&mntvnode_slock);
447	/*
448	 * Delete from old mount point vnode list, if on one.
449	 */
450	if (vp->v_mount != NULL)
451		LIST_REMOVE(vp, v_mntvnodes);
452	/*
453	 * Insert into list of vnodes for the new mount point, if available.
454	 */
455	if ((vp->v_mount = mp) == NULL) {
456		simple_unlock(&mntvnode_slock);
457		return;
458	}
459	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
460	simple_unlock(&mntvnode_slock);
461}
462
463/*
464 * Update outstanding I/O count and do wakeup if requested.
465 */
466void
467vwakeup(bp)
468	register struct buf *bp;
469{
470	register struct vnode *vp;
471
472	bp->b_flags &= ~B_WRITEINPROG;
473	if ((vp = bp->b_vp)) {
474		vp->v_numoutput--;
475		if (vp->v_numoutput < 0)
476			panic("vwakeup: neg numoutput");
477		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
478			vp->v_flag &= ~VBWAIT;
479			wakeup((caddr_t) &vp->v_numoutput);
480		}
481	}
482}
483
484/*
485 * Flush out and invalidate all buffers associated with a vnode.
486 * Called with the underlying object locked.
487 */
488int
489vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
490	register struct vnode *vp;
491	int flags;
492	struct ucred *cred;
493	struct proc *p;
494	int slpflag, slptimeo;
495{
496	register struct buf *bp;
497	struct buf *nbp, *blist;
498	int s, error;
499	vm_object_t object;
500
501	if (flags & V_SAVE) {
502		if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)))
503			return (error);
504		if (vp->v_dirtyblkhd.lh_first != NULL)
505			panic("vinvalbuf: dirty bufs");
506	}
507
508	s = splbio();
509	for (;;) {
510		if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA))
511			while (blist && blist->b_lblkno < 0)
512				blist = blist->b_vnbufs.le_next;
513		if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
514		    (flags & V_SAVEMETA))
515			while (blist && blist->b_lblkno < 0)
516				blist = blist->b_vnbufs.le_next;
517		if (!blist)
518			break;
519
520		for (bp = blist; bp; bp = nbp) {
521			nbp = bp->b_vnbufs.le_next;
522			if ((flags & V_SAVEMETA) && bp->b_lblkno < 0)
523				continue;
524			if (bp->b_flags & B_BUSY) {
525				bp->b_flags |= B_WANTED;
526				error = tsleep((caddr_t) bp,
527				    slpflag | (PRIBIO + 1), "vinvalbuf",
528				    slptimeo);
529				splx(s);
530				if (error)
531					return (error);
532				break;
533			}
534			bremfree(bp);
535			bp->b_flags |= B_BUSY;
536			/*
537			 * XXX Since there are no node locks for NFS, I
538			 * believe there is a slight chance that a delayed
539			 * write will occur while sleeping just above, so
540			 * check for it.
541			 */
542			if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
543				(void) VOP_BWRITE(bp);
544				break;
545			}
546			bp->b_flags |= (B_INVAL|B_NOCACHE|B_RELBUF);
547			brelse(bp);
548		}
549	}
550	splx(s);
551
552	s = splbio();
553	while (vp->v_numoutput > 0) {
554		vp->v_flag |= VBWAIT;
555		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
556	}
557	splx(s);
558
559	/*
560	 * Destroy the copy in the VM cache, too.
561	 */
562	object = vp->v_object;
563	if (object != NULL) {
564		vm_object_page_remove(object, 0, object->size,
565		    (flags & V_SAVE) ? TRUE : FALSE);
566	}
567	if (!(flags & V_SAVEMETA) &&
568	    (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first))
569		panic("vinvalbuf: flush failed");
570	return (0);
571}
572
573/*
574 * Associate a buffer with a vnode.
575 */
576void
577bgetvp(vp, bp)
578	register struct vnode *vp;
579	register struct buf *bp;
580{
581	int s;
582
583	if (bp->b_vp)
584		panic("bgetvp: not free");
585	VHOLD(vp);
586	bp->b_vp = vp;
587	if (vp->v_type == VBLK || vp->v_type == VCHR)
588		bp->b_dev = vp->v_rdev;
589	else
590		bp->b_dev = NODEV;
591	/*
592	 * Insert onto list for new vnode.
593	 */
594	s = splbio();
595	bufinsvn(bp, &vp->v_cleanblkhd);
596	splx(s);
597}
598
599/*
600 * Disassociate a buffer from a vnode.
601 */
602void
603brelvp(bp)
604	register struct buf *bp;
605{
606	struct vnode *vp;
607	int s;
608
609	if (bp->b_vp == (struct vnode *) 0)
610		panic("brelvp: NULL");
611	/*
612	 * Delete from old vnode list, if on one.
613	 */
614	s = splbio();
615	if (bp->b_vnbufs.le_next != NOLIST)
616		bufremvn(bp);
617	splx(s);
618
619	vp = bp->b_vp;
620	bp->b_vp = (struct vnode *) 0;
621	HOLDRELE(vp);
622}
623
624/*
625 * Associate a p-buffer with a vnode.
626 */
627void
628pbgetvp(vp, bp)
629	register struct vnode *vp;
630	register struct buf *bp;
631{
632#if defined(DIAGNOSTIC)
633	if (bp->b_vp)
634		panic("pbgetvp: not free");
635#endif
636	bp->b_vp = vp;
637	if (vp->v_type == VBLK || vp->v_type == VCHR)
638		bp->b_dev = vp->v_rdev;
639	else
640		bp->b_dev = NODEV;
641}
642
643/*
644 * Disassociate a p-buffer from a vnode.
645 */
646void
647pbrelvp(bp)
648	register struct buf *bp;
649{
650	struct vnode *vp;
651
652#if defined(DIAGNOSTIC)
653	if (bp->b_vp == (struct vnode *) 0)
654		panic("pbrelvp: NULL");
655#endif
656
657	bp->b_vp = (struct vnode *) 0;
658}
659
660/*
661 * Reassign a buffer from one vnode to another.
662 * Used to assign file specific control information
663 * (indirect blocks) to the vnode to which they belong.
664 */
665void
666reassignbuf(bp, newvp)
667	register struct buf *bp;
668	register struct vnode *newvp;
669{
670	int s;
671
672	if (newvp == NULL) {
673		printf("reassignbuf: NULL");
674		return;
675	}
676
677	s = splbio();
678	/*
679	 * Delete from old vnode list, if on one.
680	 */
681	if (bp->b_vnbufs.le_next != NOLIST)
682		bufremvn(bp);
683	/*
684	 * If dirty, put on list of dirty buffers; otherwise insert onto list
685	 * of clean buffers.
686	 */
687	if (bp->b_flags & B_DELWRI) {
688		struct buf *tbp;
689
690		tbp = newvp->v_dirtyblkhd.lh_first;
691		if (!tbp || (tbp->b_lblkno > bp->b_lblkno)) {
692			bufinsvn(bp, &newvp->v_dirtyblkhd);
693		} else {
694			while (tbp->b_vnbufs.le_next &&
695				(tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) {
696				tbp = tbp->b_vnbufs.le_next;
697			}
698			LIST_INSERT_AFTER(tbp, bp, b_vnbufs);
699		}
700	} else {
701		bufinsvn(bp, &newvp->v_cleanblkhd);
702	}
703	splx(s);
704}
705
706#ifndef DEVFS_ROOT
707/*
708 * Create a vnode for a block device.
709 * Used for root filesystem, argdev, and swap areas.
710 * Also used for memory file system special devices.
711 */
712int
713bdevvp(dev, vpp)
714	dev_t dev;
715	struct vnode **vpp;
716{
717	register struct vnode *vp;
718	struct vnode *nvp;
719	int error;
720
721	if (dev == NODEV)
722		return (0);
723	error = getnewvnode(VT_NON, (struct mount *) 0, spec_vnodeop_p, &nvp);
724	if (error) {
725		*vpp = 0;
726		return (error);
727	}
728	vp = nvp;
729	vp->v_type = VBLK;
730	if ((nvp = checkalias(vp, dev, (struct mount *) 0))) {
731		vput(vp);
732		vp = nvp;
733	}
734	*vpp = vp;
735	return (0);
736}
737#endif /* !DEVFS_ROOT */
738
739/*
740 * Check to see if the new vnode represents a special device
741 * for which we already have a vnode (either because of
742 * bdevvp() or because of a different vnode representing
743 * the same block device). If such an alias exists, deallocate
744 * the existing contents and return the aliased vnode. The
745 * caller is responsible for filling it with its new contents.
746 */
747struct vnode *
748checkalias(nvp, nvp_rdev, mp)
749	register struct vnode *nvp;
750	dev_t nvp_rdev;
751	struct mount *mp;
752{
753	struct proc *p = curproc;	/* XXX */
754	struct vnode *vp;
755	struct vnode **vpp;
756
757	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
758		return (NULLVP);
759
760	vpp = &speclisth[SPECHASH(nvp_rdev)];
761loop:
762	simple_lock(&spechash_slock);
763	for (vp = *vpp; vp; vp = vp->v_specnext) {
764		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
765			continue;
766		/*
767		 * Alias, but not in use, so flush it out.
768		 */
769		simple_lock(&vp->v_interlock);
770		if (vp->v_usecount == 0) {
771			simple_unlock(&spechash_slock);
772			vgonel(vp, p);
773			goto loop;
774		}
775		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
776			simple_unlock(&spechash_slock);
777			goto loop;
778		}
779		break;
780	}
781	if (vp == NULL || vp->v_tag != VT_NON) {
782		MALLOC(nvp->v_specinfo, struct specinfo *,
783		    sizeof(struct specinfo), M_VNODE, M_WAITOK);
784		nvp->v_rdev = nvp_rdev;
785		nvp->v_hashchain = vpp;
786		nvp->v_specnext = *vpp;
787		nvp->v_specflags = 0;
788		simple_unlock(&spechash_slock);
789		*vpp = nvp;
790		if (vp != NULLVP) {
791			nvp->v_flag |= VALIASED;
792			vp->v_flag |= VALIASED;
793			vput(vp);
794		}
795		return (NULLVP);
796	}
797	simple_unlock(&spechash_slock);
798	VOP_UNLOCK(vp, 0, p);
799	simple_lock(&vp->v_interlock);
800	vclean(vp, 0, p);
801	vp->v_op = nvp->v_op;
802	vp->v_tag = nvp->v_tag;
803	nvp->v_type = VNON;
804	insmntque(vp, mp);
805	return (vp);
806}
807
808/*
809 * Grab a particular vnode from the free list, increment its
810 * reference count and lock it. The vnode lock bit is set the
811 * vnode is being eliminated in vgone. The process is awakened
812 * when the transition is completed, and an error returned to
813 * indicate that the vnode is no longer usable (possibly having
814 * been changed to a new file system type).
815 */
816int
817vget(vp, flags, p)
818	register struct vnode *vp;
819	int flags;
820	struct proc *p;
821{
822	int error;
823
824	/*
825	 * If the vnode is in the process of being cleaned out for
826	 * another use, we wait for the cleaning to finish and then
827	 * return failure. Cleaning is determined by checking that
828	 * the VXLOCK flag is set.
829	 */
830	if ((flags & LK_INTERLOCK) == 0) {
831		simple_lock(&vp->v_interlock);
832	}
833	if (vp->v_flag & VXLOCK) {
834		vp->v_flag |= VXWANT;
835		simple_unlock(&vp->v_interlock);
836		tsleep((caddr_t)vp, PINOD, "vget", 0);
837		return (ENOENT);
838	}
839	if (vp->v_usecount == 0) {
840		simple_lock(&vnode_free_list_slock);
841		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
842		simple_unlock(&vnode_free_list_slock);
843		freevnodes--;
844	}
845	vp->v_usecount++;
846	/*
847	 * Create the VM object, if needed
848	 */
849	if ((vp->v_type == VREG) &&
850		((vp->v_object == NULL) ||
851			(vp->v_object->flags & OBJ_VFS_REF) == 0)) {
852		/*
853		 * XXX vfs_object_create probably needs the interlock.
854		 */
855		simple_unlock(&vp->v_interlock);
856		vfs_object_create(vp, curproc, curproc->p_ucred, 0);
857		simple_lock(&vp->v_interlock);
858	}
859	if (flags & LK_TYPE_MASK) {
860		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0)
861			vrele(vp);
862		return (error);
863	}
864	simple_unlock(&vp->v_interlock);
865	return (0);
866}
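
/*
 * Example use of vget()/vput(): take a reference on a vnode, lock it
 * while it is examined, then unlock and release it.  vrele() is the
 * counterpart used when the vnode is not locked.  A minimal sketch;
 * the per-vnode work is illustrative only.
 */
static int
example_use_vnode(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	int error;

	if ((error = vget(vp, LK_EXCLUSIVE, p)) != 0)
		return (error);		/* being cleaned or reclaimed */
	/* ... operate on the locked, referenced vnode here ... */
	vput(vp);			/* unlock and drop the reference */
	return (0);
}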
867
868/*
869 * Stubs to use when there is no locking to be done on the underlying object.
870 * A minimal shared lock is necessary to ensure that the underlying object
871 * is not revoked while an operation is in progress. So, an active shared
872 * count is maintained in an auxiliary vnode lock structure.
873 */
874int
875vop_nolock(ap)
876	struct vop_lock_args /* {
877		struct vnode *a_vp;
878		int a_flags;
879		struct proc *a_p;
880	} */ *ap;
881{
882#ifdef notyet
883	/*
884	 * This code cannot be used until all the non-locking filesystems
885	 * (notably NFS) are converted to properly lock and release nodes.
886	 * Also, certain vnode operations change the locking state within
887	 * the operation (create, mknod, remove, link, rename, mkdir, rmdir,
888	 * and symlink). Ideally these operations should not change the
889	 * lock state, but should be changed to let the caller of the
890	 * function unlock them. Otherwise all intermediate vnode layers
891	 * (such as union, umapfs, etc) must catch these functions to do
892	 * the necessary locking at their layer. Note that the inactive
893	 * and lookup operations also change their lock state, but this
894	 * cannot be avoided, so these two operations will always need
895	 * to be handled in intermediate layers.
896	 */
897	struct vnode *vp = ap->a_vp;
898	int vnflags, flags = ap->a_flags;
899
900	if (vp->v_vnlock == NULL) {
901		if ((flags & LK_TYPE_MASK) == LK_DRAIN)
902			return (0);
903		MALLOC(vp->v_vnlock, struct lock *, sizeof(struct lock),
904		    M_VNODE, M_WAITOK);
905		lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
906	}
907	switch (flags & LK_TYPE_MASK) {
908	case LK_DRAIN:
909		vnflags = LK_DRAIN;
910		break;
911	case LK_EXCLUSIVE:
912	case LK_SHARED:
913		vnflags = LK_SHARED;
914		break;
915	case LK_UPGRADE:
916	case LK_EXCLUPGRADE:
917	case LK_DOWNGRADE:
918		return (0);
919	case LK_RELEASE:
920	default:
921		panic("vop_nolock: bad operation %d", flags & LK_TYPE_MASK);
922	}
923	if (flags & LK_INTERLOCK)
924		vnflags |= LK_INTERLOCK;
925	return(lockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p));
926#else /* for now */
927	/*
928	 * Since we are not using the lock manager, we must clear
929	 * the interlock here.
930	 */
931	if (ap->a_flags & LK_INTERLOCK) {
932		simple_unlock(&ap->a_vp->v_interlock);
933	}
934	return (0);
935#endif
936}
937
938/*
939 * Do the inverse of vop_nolock, handling the interlock in a compatible way.
940 */
941int
942vop_nounlock(ap)
943	struct vop_unlock_args /* {
944		struct vnode *a_vp;
945		int a_flags;
946		struct proc *a_p;
947	} */ *ap;
948{
949	struct vnode *vp = ap->a_vp;
950
951	if (vp->v_vnlock == NULL) {
952		if (ap->a_flags & LK_INTERLOCK)
953			simple_unlock(&ap->a_vp->v_interlock);
954		return (0);
955	}
956	return (lockmgr(vp->v_vnlock, LK_RELEASE | ap->a_flags,
957		&ap->a_vp->v_interlock, ap->a_p));
958}
959
960/*
961 * Return whether or not the node is locked.
962 */
963int
964vop_noislocked(ap)
965	struct vop_islocked_args /* {
966		struct vnode *a_vp;
967	} */ *ap;
968{
969	struct vnode *vp = ap->a_vp;
970
971	if (vp->v_vnlock == NULL)
972		return (0);
973	return (lockstatus(vp->v_vnlock));
974}
975
976/* #ifdef DIAGNOSTIC */
977/*
978 * Vnode reference, just increment the count
979 */
980void
981vref(vp)
982	struct vnode *vp;
983{
984	simple_lock(&vp->v_interlock);
985	if (vp->v_usecount <= 0)
986		panic("vref used where vget required");
987
988	vp->v_usecount++;
989
990	if ((vp->v_type == VREG) &&
991		((vp->v_object == NULL) ||
992			((vp->v_object->flags & OBJ_VFS_REF) == 0)) ) {
993		/*
994		 * We need to lock the VP during the time that
995		 * the object is created.  This is necessary to
996		 * keep the system from re-entrantly doing it
997		 * multiple times.
998		 * XXX vfs_object_create probably needs the interlock?
999		 */
1000		simple_unlock(&vp->v_interlock);
1001		vfs_object_create(vp, curproc, curproc->p_ucred, 0);
1002		return;
1003	}
1004	simple_unlock(&vp->v_interlock);
1005}
1006
1007/*
1008 * Vnode put/release.
1009 * If count drops to zero, call inactive routine and return to freelist.
1010 */
1011void
1012vputrele(vp, put)
1013	struct vnode *vp;
1014	int put;
1015{
1016	struct proc *p = curproc;	/* XXX */
1017
1018#ifdef DIAGNOSTIC
1019	if (vp == NULL)
1020		panic("vputrele: null vp");
1021#endif
1022	simple_lock(&vp->v_interlock);
1023	vp->v_usecount--;
1024
1025	if ((vp->v_usecount == 1) &&
1026		vp->v_object &&
1027		(vp->v_object->flags & OBJ_VFS_REF)) {
1028		vp->v_object->flags &= ~OBJ_VFS_REF;
1029		if (put) {
1030			VOP_UNLOCK(vp, LK_INTERLOCK, p);
1031		} else {
1032			simple_unlock(&vp->v_interlock);
1033		}
1034		vm_object_deallocate(vp->v_object);
1035		return;
1036	}
1037
1038	if (vp->v_usecount > 0) {
1039		if (put) {
1040			VOP_UNLOCK(vp, LK_INTERLOCK, p);
1041		} else {
1042			simple_unlock(&vp->v_interlock);
1043		}
1044		return;
1045	}
1046
1047	if (vp->v_usecount < 0) {
1048#ifdef DIAGNOSTIC
1049		vprint("vputrele: negative ref count", vp);
1050#endif
1051		panic("vputrele: negative ref cnt");
1052	}
1053	simple_lock(&vnode_free_list_slock);
1054	if (vp->v_flag & VAGE) {
1055		vp->v_flag &= ~VAGE;
1056		vp->v_usage = 0;
1057		if(vp->v_tag != VT_TFS)
1058			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1059	} else {
1060		if(vp->v_tag != VT_TFS)
1061			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
1062	}
1063	freevnodes++;
1064	simple_unlock(&vnode_free_list_slock);
1065
1066	/*
1067	 * If we are doing a vput, the node is already locked, and we must
1068	 * call VOP_INACTIVE with the node locked.  So, in the case of
1069	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1070	 */
1071	if (put) {
1072		simple_unlock(&vp->v_interlock);
1073		VOP_INACTIVE(vp, p);
1074	} else if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
1075		VOP_INACTIVE(vp, p);
1076	}
1077}
1078
1079/*
1080 * vput(), just unlock and vrele()
1081 */
1082void
1083vput(vp)
1084	struct vnode *vp;
1085{
1086	vputrele(vp, 1);
1087}
1088
1089void
1090vrele(vp)
1091	struct vnode *vp;
1092{
1093	vputrele(vp, 0);
1094}
1095
1096#ifdef DIAGNOSTIC
1097/*
1098 * Page or buffer structure gets a reference.
1099 */
1100void
1101vhold(vp)
1102	register struct vnode *vp;
1103{
1104
1105	simple_lock(&vp->v_interlock);
1106	vp->v_holdcnt++;
1107	simple_unlock(&vp->v_interlock);
1108}
1109
1110/*
1111 * Page or buffer structure frees a reference.
1112 */
1113void
1114holdrele(vp)
1115	register struct vnode *vp;
1116{
1117
1118	simple_lock(&vp->v_interlock);
1119	if (vp->v_holdcnt <= 0)
1120		panic("holdrele: holdcnt");
1121	vp->v_holdcnt--;
1122	simple_unlock(&vp->v_interlock);
1123}
1124#endif /* DIAGNOSTIC */
1125
1126/*
1127 * Remove any vnodes in the vnode table belonging to mount point mp.
1128 *
1129 * If MNT_NOFORCE is specified, there should not be any active ones;
1130 * return an error if any are found (nb: this is a user error, not a
1131 * system error). If MNT_FORCE is specified, detach any active vnodes
1132 * that are found.
1133 */
1134#ifdef DIAGNOSTIC
1135static int busyprt = 0;		/* print out busy vnodes */
1136SYSCTL_INT(_debug, 1, busyprt, CTLFLAG_RW, &busyprt, 0, "");
1137#endif
1138
1139int
1140vflush(mp, skipvp, flags)
1141	struct mount *mp;
1142	struct vnode *skipvp;
1143	int flags;
1144{
1145	struct proc *p = curproc;	/* XXX */
1146	struct vnode *vp, *nvp;
1147	int busy = 0;
1148
1149	simple_lock(&mntvnode_slock);
1150loop:
1151	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
1152		/*
1153		 * Make sure this vnode wasn't reclaimed in getnewvnode().
1154		 * Start over if it has (it won't be on the list anymore).
1155		 */
1156		if (vp->v_mount != mp)
1157			goto loop;
1158		nvp = vp->v_mntvnodes.le_next;
1159		/*
1160		 * Skip over a selected vnode.
1161		 */
1162		if (vp == skipvp)
1163			continue;
1164
1165		simple_lock(&vp->v_interlock);
1166		/*
1167		 * Skip over vnodes marked VSYSTEM.
1168		 */
1169		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
1170			simple_unlock(&vp->v_interlock);
1171			continue;
1172		}
1173		/*
1174		 * If WRITECLOSE is set, only flush out regular file vnodes
1175		 * open for writing.
1176		 */
1177		if ((flags & WRITECLOSE) &&
1178		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
1179			simple_unlock(&vp->v_interlock);
1180			continue;
1181		}
1182
1183		if (vp->v_object && (vp->v_object->flags & OBJ_VFS_REF)) {
1184			simple_unlock(&vp->v_interlock);
1185			simple_unlock(&mntvnode_slock);
1186			vm_object_reference(vp->v_object);
1187			pager_cache(vp->v_object, FALSE);
1188			vp->v_object->flags &= ~OBJ_VFS_REF;
1189			vm_object_deallocate(vp->v_object);
1190			simple_lock(&mntvnode_slock);
1191			simple_lock(&vp->v_interlock);
1192		}
1193
1194		/*
1195		 * With v_usecount == 0, all we need to do is clear out the
1196		 * vnode data structures and we are done.
1197		 */
1198		if (vp->v_usecount == 0) {
1199			simple_unlock(&mntvnode_slock);
1200			vgonel(vp, p);
1201			simple_lock(&mntvnode_slock);
1202			continue;
1203		}
1204
1205		/*
1206		 * If FORCECLOSE is set, forcibly close the vnode. For block
1207		 * or character devices, revert to an anonymous device. For
1208		 * all other files, just kill them.
1209		 */
1210		if (flags & FORCECLOSE) {
1211			simple_unlock(&mntvnode_slock);
1212			if (vp->v_type != VBLK && vp->v_type != VCHR) {
1213				vgonel(vp, p);
1214			} else {
1215				vclean(vp, 0, p);
1216				vp->v_op = spec_vnodeop_p;
1217				insmntque(vp, (struct mount *) 0);
1218			}
1219			simple_lock(&mntvnode_slock);
1220			continue;
1221		}
1222#ifdef DIAGNOSTIC
1223		if (busyprt)
1224			vprint("vflush: busy vnode", vp);
1225#endif
1226		simple_unlock(&vp->v_interlock);
1227		busy++;
1228	}
1229	simple_unlock(&mntvnode_slock);
1230	if (busy)
1231		return (EBUSY);
1232	return (0);
1233}
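
/*
 * Example use of vflush(): a filesystem's unmount routine flushes all
 * of its vnodes, forcing active ones only when MNT_FORCE was given.
 * A minimal sketch of the common pattern, for illustration.
 */
static int
example_unmount_flush(mp, mntflags)
	struct mount *mp;
	int mntflags;
{
	int flags = 0;

	if (mntflags & MNT_FORCE)
		flags |= FORCECLOSE;
	return (vflush(mp, NULLVP, flags));
}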
1234
1235/*
1236 * Disassociate the underlying file system from a vnode.
1237 */
1238static void
1239vclean(struct vnode *vp, int flags, struct proc *p)
1240{
1241	int active;
1242
1243	/*
1244	 * Check to see if the vnode is in use. If so we have to reference it
1245	 * before we clean it out so that its count cannot fall to zero and
1246	 * generate a race against ourselves to recycle it.
1247	 */
1248	if ((active = vp->v_usecount))
1249		vp->v_usecount++;
1250	/*
1251	 * Prevent the vnode from being recycled or brought into use while we
1252	 * clean it out.
1253	 */
1254	if (vp->v_flag & VXLOCK)
1255		panic("vclean: deadlock");
1256	vp->v_flag |= VXLOCK;
1257	/*
1258	 * Even if the count is zero, the VOP_INACTIVE routine may still
1259	 * have the object locked while it cleans it out. The VOP_LOCK
1260	 * ensures that the VOP_INACTIVE routine is done with its work.
1261	 * For active vnodes, it ensures that no other activity can
1262	 * occur while the underlying object is being cleaned out.
1263	 */
1264	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
1265	/*
1266	 * Clean out any buffers associated with the vnode.
1267	 */
1268	if (flags & DOCLOSE)
1269		vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
1270	/*
1271	 * If purging an active vnode, it must be closed and
1272	 * deactivated before being reclaimed. Note that the
1273	 * VOP_INACTIVE will unlock the vnode.
1274	 */
1275	if (active) {
1276		if (flags & DOCLOSE)
1277			VOP_CLOSE(vp, IO_NDELAY, NOCRED, p);
1278		VOP_INACTIVE(vp, p);
1279	} else {
1280		/*
1281		 * Any other processes trying to obtain this lock must first
1282		 * wait for VXLOCK to clear, then call the new lock operation.
1283		 */
1284		VOP_UNLOCK(vp, 0, p);
1285	}
1286	/*
1287	 * Reclaim the vnode.
1288	 */
1289	if (VOP_RECLAIM(vp, p))
1290		panic("vclean: cannot reclaim");
1291	if (active)
1292		vrele(vp);
1293	cache_purge(vp);
1294	if (vp->v_vnlock) {
1295		if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
1296			vprint("vclean: lock not drained", vp);
1297		FREE(vp->v_vnlock, M_VNODE);
1298		vp->v_vnlock = NULL;
1299	}
1300
1301	/*
1302	 * Done with purge, notify sleepers of the grim news.
1303	 */
1304	vp->v_op = dead_vnodeop_p;
1305	vp->v_tag = VT_NON;
1306	vp->v_flag &= ~VXLOCK;
1307	if (vp->v_flag & VXWANT) {
1308		vp->v_flag &= ~VXWANT;
1309		wakeup((caddr_t) vp);
1310	}
1311}
1312
1313/*
1314 * Eliminate all activity associated with the requested vnode
1315 * and with all vnodes aliased to the requested vnode.
1316 */
1317int
1318vop_revoke(ap)
1319	struct vop_revoke_args /* {
1320		struct vnode *a_vp;
1321		int a_flags;
1322	} */ *ap;
1323{
1324	struct vnode *vp, *vq;
1325	struct proc *p = curproc;	/* XXX */
1326
1327#ifdef DIAGNOSTIC
1328	if ((ap->a_flags & REVOKEALL) == 0)
1329		panic("vop_revoke");
1330#endif
1331
1332	vp = ap->a_vp;
1333	simple_lock(&vp->v_interlock);
1334
1335	if (vp->v_flag & VALIASED) {
1336		/*
1337		 * If a vgone (or vclean) is already in progress,
1338		 * wait until it is done and return.
1339		 */
1340		if (vp->v_flag & VXLOCK) {
1341			vp->v_flag |= VXWANT;
1342			simple_unlock(&vp->v_interlock);
1343			tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
1344			return (0);
1345		}
1346		/*
1347		 * Ensure that vp will not be vgone'd while we
1348		 * are eliminating its aliases.
1349		 */
1350		vp->v_flag |= VXLOCK;
1351		simple_unlock(&vp->v_interlock);
1352		while (vp->v_flag & VALIASED) {
1353			simple_lock(&spechash_slock);
1354			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1355				if (vq->v_rdev != vp->v_rdev ||
1356				    vq->v_type != vp->v_type || vp == vq)
1357					continue;
1358				simple_unlock(&spechash_slock);
1359				vgone(vq);
1360				break;
1361			}
1362			if (vq == NULLVP) {
1363				simple_unlock(&spechash_slock);
1364			}
1365		}
1366		/*
1367		 * Remove the lock so that vgone below will
1368		 * really eliminate the vnode after which time
1369		 * vgone will awaken any sleepers.
1370		 */
1371		simple_lock(&vp->v_interlock);
1372		vp->v_flag &= ~VXLOCK;
1373	}
1374	vgonel(vp, p);
1375	return (0);
1376}
1377
1378/*
1379 * Recycle an unused vnode to the front of the free list.
1380 * Release the passed interlock if the vnode will be recycled.
1381 */
1382int
1383vrecycle(vp, inter_lkp, p)
1384	struct vnode *vp;
1385	struct simplelock *inter_lkp;
1386	struct proc *p;
1387{
1388
1389	simple_lock(&vp->v_interlock);
1390	if (vp->v_usecount == 0) {
1391		if (inter_lkp) {
1392			simple_unlock(inter_lkp);
1393		}
1394		vgonel(vp, p);
1395		return (1);
1396	}
1397	simple_unlock(&vp->v_interlock);
1398	return (0);
1399}
1400
1401/*
1402 * Eliminate all activity associated with a vnode
1403 * in preparation for reuse.
1404 */
1405void
1406vgone(vp)
1407	register struct vnode *vp;
1408{
1409	struct proc *p = curproc;	/* XXX */
1410
1411	simple_lock(&vp->v_interlock);
1412	vgonel(vp, p);
1413}
1414
1415/*
1416 * vgone, with the vp interlock held.
1417 */
1418void
1419vgonel(vp, p)
1420	struct vnode *vp;
1421	struct proc *p;
1422{
1423	struct vnode *vq;
1424	struct vnode *vx;
1425
1426	/*
1427	 * If a vgone (or vclean) is already in progress,
1428	 * wait until it is done and return.
1429	 */
1430	if (vp->v_flag & VXLOCK) {
1431		vp->v_flag |= VXWANT;
1432		simple_unlock(&vp->v_interlock);
1433		tsleep((caddr_t)vp, PINOD, "vgone", 0);
1434		return;
1435	}
1436
1437	if (vp->v_object) {
1438		vp->v_object->flags |= OBJ_VNODE_GONE;
1439	}
1440
1441	/*
1442	 * Clean out the filesystem specific data.
1443	 */
1444	vclean(vp, DOCLOSE, p);
1445	/*
1446	 * Delete from old mount point vnode list, if on one.
1447	 */
1448	if (vp->v_mount != NULL)
1449		insmntque(vp, (struct mount *)0);
1450	/*
1451	 * If special device, remove it from special device alias list
1452	 * if it is on one.
1453	 */
1454	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
1455		simple_lock(&spechash_slock);
1456		if (*vp->v_hashchain == vp) {
1457			*vp->v_hashchain = vp->v_specnext;
1458		} else {
1459			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1460				if (vq->v_specnext != vp)
1461					continue;
1462				vq->v_specnext = vp->v_specnext;
1463				break;
1464			}
1465			if (vq == NULL)
1466				panic("missing bdev");
1467		}
1468		if (vp->v_flag & VALIASED) {
1469			vx = NULL;
1470			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1471				if (vq->v_rdev != vp->v_rdev ||
1472				    vq->v_type != vp->v_type)
1473					continue;
1474				if (vx)
1475					break;
1476				vx = vq;
1477			}
1478			if (vx == NULL)
1479				panic("missing alias");
1480			if (vq == NULL)
1481				vx->v_flag &= ~VALIASED;
1482			vp->v_flag &= ~VALIASED;
1483		}
1484		simple_unlock(&spechash_slock);
1485		FREE(vp->v_specinfo, M_VNODE);
1486		vp->v_specinfo = NULL;
1487	}
1488
1489	/*
1490	 * If it is on the freelist and not already at the head,
1491	 * move it to the head of the list. The test of the back
1492	 * pointer and the reference count of zero is because
1493	 * it will be removed from the free list by getnewvnode,
1494	 * but will not have its reference count incremented until
1495	 * after calling vgone. If the reference count were
1496	 * incremented first, vgone would (incorrectly) try to
1497	 * close the previous instance of the underlying object.
1498	 * So, the back pointer is explicitly set to `0xdeadb' in
1499	 * getnewvnode after removing it from the freelist to ensure
1500	 * that we do not try to move it here.
1501	 */
1502	if (vp->v_usecount == 0) {
1503		simple_lock(&vnode_free_list_slock);
1504		if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
1505			vnode_free_list.tqh_first != vp) {
1506			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1507			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1508		}
1509		simple_unlock(&vnode_free_list_slock);
1510	}
1511
1512	vp->v_type = VBAD;
1513}
1514
1515/*
1516 * Lookup a vnode by device number.
1517 */
1518int
1519vfinddev(dev, type, vpp)
1520	dev_t dev;
1521	enum vtype type;
1522	struct vnode **vpp;
1523{
1524	register struct vnode *vp;
1525	int rc = 0;
1526
1527	simple_lock(&spechash_slock);
1528	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
1529		if (dev != vp->v_rdev || type != vp->v_type)
1530			continue;
1531		*vpp = vp;
1532		rc = 1;
1533		break;
1534	}
1535	simple_unlock(&spechash_slock);
1536	return (rc);
1537}
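
/*
 * Example use of vfinddev(): check whether a block device already has a
 * vnode with active references, e.g. before handing the device to swap
 * or a new mount.  A minimal sketch for illustration.
 */
static int
example_blkdev_in_use(dev)
	dev_t dev;
{
	struct vnode *vp;

	if (vfinddev(dev, VBLK, &vp))
		return (vp->v_usecount > 0);
	return (0);
}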
1538
1539/*
1540 * Calculate the total number of references to a special device.
1541 */
1542int
1543vcount(vp)
1544	register struct vnode *vp;
1545{
1546	struct vnode *vq, *vnext;
1547	int count;
1548
1549loop:
1550	if ((vp->v_flag & VALIASED) == 0)
1551		return (vp->v_usecount);
1552	simple_lock(&spechash_slock);
1553	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
1554		vnext = vq->v_specnext;
1555		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
1556			continue;
1557		/*
1558		 * Alias, but not in use, so flush it out.
1559		 */
1560		if (vq->v_usecount == 0 && vq != vp) {
1561			simple_unlock(&spechash_slock);
1562			vgone(vq);
1563			goto loop;
1564		}
1565		count += vq->v_usecount;
1566	}
1567	simple_unlock(&spechash_slock);
1568	return (count);
1569}
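
/*
 * Example use of vcount(): a device driver's close routine usually wants
 * to know whether this is the last close across all aliases of the
 * device, not just on the vnode being closed.  A minimal sketch; the
 * driver-specific teardown is illustrative only.
 */
static int
example_spec_lastclose(vp)
	struct vnode *vp;
{

	if (vcount(vp) > 1)
		return (0);		/* other aliases are still open */
	/* ... perform the real device shutdown here ... */
	return (1);
}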
1570
1571/*
1572 * Print out a description of a vnode.
1573 */
1574static char *typename[] =
1575{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
1576
1577void
1578vprint(label, vp)
1579	char *label;
1580	register struct vnode *vp;
1581{
1582	char buf[64];
1583
1584	if (label != NULL)
1585		printf("%s: ", label);
1586	printf("type %s, usecount %d, writecount %d, refcount %ld,",
1587	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
1588	    vp->v_holdcnt);
1589	buf[0] = '\0';
1590	if (vp->v_flag & VROOT)
1591		strcat(buf, "|VROOT");
1592	if (vp->v_flag & VTEXT)
1593		strcat(buf, "|VTEXT");
1594	if (vp->v_flag & VSYSTEM)
1595		strcat(buf, "|VSYSTEM");
1596	if (vp->v_flag & VXLOCK)
1597		strcat(buf, "|VXLOCK");
1598	if (vp->v_flag & VXWANT)
1599		strcat(buf, "|VXWANT");
1600	if (vp->v_flag & VBWAIT)
1601		strcat(buf, "|VBWAIT");
1602	if (vp->v_flag & VALIASED)
1603		strcat(buf, "|VALIASED");
1604	if (buf[0] != '\0')
1605		printf(" flags (%s)", &buf[1]);
1606	if (vp->v_data == NULL) {
1607		printf("\n");
1608	} else {
1609		printf("\n\t");
1610		VOP_PRINT(vp);
1611	}
1612}
1613
1614#ifdef DDB
1615/*
1616 * List all of the locked vnodes in the system.
1617 * Called when debugging the kernel.
1618 */
1619void
1620printlockedvnodes()
1621{
1622	struct proc *p = curproc;	/* XXX */
1623	struct mount *mp, *nmp;
1624	struct vnode *vp;
1625
1626	printf("Locked vnodes\n");
1627	simple_lock(&mountlist_slock);
1628	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
1629		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
1630			nmp = mp->mnt_list.cqe_next;
1631			continue;
1632		}
1633		for (vp = mp->mnt_vnodelist.lh_first;
1634		     vp != NULL;
1635		     vp = vp->v_mntvnodes.le_next) {
1636			if (VOP_ISLOCKED(vp))
1637				vprint((char *)0, vp);
1638		}
1639		simple_lock(&mountlist_slock);
1640		nmp = mp->mnt_list.cqe_next;
1641		vfs_unbusy(mp, p);
1642	}
1643	simple_unlock(&mountlist_slock);
1644}
1645#endif
1646
1647static int
1648sysctl_vfs_conf SYSCTL_HANDLER_ARGS
1649{
1650	int error;
1651	struct vfsconf *vfsp;
1652
1653	if (req->newptr)
1654		return EINVAL;
1655	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1656		error = SYSCTL_OUT(req, vfsp, sizeof *vfsp);
1657		if (error)
1658			return error;
1659	}
1660	return 0;
1661}
1662
1663SYSCTL_PROC(_vfs, VFS_VFSCONF, vfsconf, CTLTYPE_OPAQUE|CTLFLAG_RD,
1664	0, 0, sysctl_vfs_conf, "S,vfsconf", "");
1665
1666#ifndef NO_COMPAT_PRELITE2
1667
1668#define OVFS_MAXNAMELEN 32
1669struct ovfsconf {
1670	void *vfc_vfsops;
1671	char vfc_name[OVFS_MAXNAMELEN];
1672	int vfc_index;
1673	int vfc_refcount;
1674	int vfc_flags;
1675};
1676
1677static int
1678sysctl_ovfs_conf SYSCTL_HANDLER_ARGS
1679{
1680	int error;
1681	struct vfsconf *vfsp;
1682
1683	if (req->newptr)
1684		return EINVAL;
1685	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1686		struct ovfsconf ovfs;
1687		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
1688		strcpy(ovfs.vfc_name, vfsp->vfc_name);
1689		ovfs.vfc_index = vfsp->vfc_typenum;
1690		ovfs.vfc_refcount = vfsp->vfc_refcount;
1691		ovfs.vfc_flags = vfsp->vfc_flags;
1692		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
1693		if (error)
1694			return error;
1695	}
1696	return 0;
1697}
1698
1699SYSCTL_PROC(_vfs, VFS_OVFSCONF, ovfsconf, CTLTYPE_OPAQUE|CTLFLAG_RD,
1700	0, 0, sysctl_ovfs_conf, "S,ovfsconf", "");
1701
1702#endif /* !NO_COMPAT_PRELITE2 */
1703
1704int kinfo_vdebug = 1;
1705int kinfo_vgetfailed;
1706
1707#define KINFO_VNODESLOP	10
1708/*
1709 * Dump vnode list (via sysctl).
1710 * Copyout address of vnode followed by vnode.
1711 */
1712/* ARGSUSED */
1713static int
1714sysctl_vnode SYSCTL_HANDLER_ARGS
1715{
1716	struct proc *p = curproc;	/* XXX */
1717	struct mount *mp, *nmp;
1718	struct vnode *nvp, *vp;
1719	int error;
1720
1721#define VPTRSZ	sizeof (struct vnode *)
1722#define VNODESZ	sizeof (struct vnode)
1723
1724	req->lock = 0;
1725	if (!req->oldptr) /* Make an estimate */
1726		return (SYSCTL_OUT(req, 0,
1727			(numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
1728
1729	simple_lock(&mountlist_slock);
1730	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
1731		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
1732			nmp = mp->mnt_list.cqe_next;
1733			continue;
1734		}
1735again:
1736		simple_lock(&mntvnode_slock);
1737		for (vp = mp->mnt_vnodelist.lh_first;
1738		     vp != NULL;
1739		     vp = nvp) {
1740			/*
1741			 * Check that the vp is still associated with
1742			 * this filesystem.  RACE: could have been
1743			 * recycled onto the same filesystem.
1744			 */
1745			if (vp->v_mount != mp) {
1746				simple_unlock(&mntvnode_slock);
1747				if (kinfo_vdebug)
1748					printf("kinfo: vp changed\n");
1749				goto again;
1750			}
1751			nvp = vp->v_mntvnodes.le_next;
1752			simple_unlock(&mntvnode_slock);
1753			if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
1754			    (error = SYSCTL_OUT(req, vp, VNODESZ)))
1755				return (error);
1756			simple_lock(&mntvnode_slock);
1757		}
1758		simple_unlock(&mntvnode_slock);
1759		simple_lock(&mountlist_slock);
1760		nmp = mp->mnt_list.cqe_next;
1761		vfs_unbusy(mp, p);
1762	}
1763	simple_unlock(&mountlist_slock);
1764
1765	return (0);
1766}
1767
1768SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
1769	0, 0, sysctl_vnode, "S,vnode", "");
1770
1771/*
1772 * Check to see if a filesystem is mounted on a block device.
1773 */
1774int
1775vfs_mountedon(vp)
1776	struct vnode *vp;
1777{
1778	struct vnode *vq;
1779	int error = 0;
1780
1781	if (vp->v_specflags & SI_MOUNTEDON)
1782		return (EBUSY);
1783	if (vp->v_flag & VALIASED) {
1784		simple_lock(&spechash_slock);
1785		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1786			if (vq->v_rdev != vp->v_rdev ||
1787			    vq->v_type != vp->v_type)
1788				continue;
1789			if (vq->v_specflags & SI_MOUNTEDON) {
1790				error = EBUSY;
1791				break;
1792			}
1793		}
1794		simple_unlock(&spechash_slock);
1795	}
1796	return (error);
1797}
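
/*
 * Example use of vfs_mountedon(): refuse to use a block device for swap
 * or a new mount while a filesystem is mounted on it.  A minimal sketch
 * for illustration.
 */
static int
example_check_devvp(devvp)
	struct vnode *devvp;
{
	int error;

	if ((error = vfs_mountedon(devvp)) != 0)
		return (error);		/* EBUSY: device carries a filesystem */
	return (0);
}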
1798
1799/*
1800 * Unmount all filesystems. The list is traversed in reverse order
1801 * of mounting to avoid dependencies.
1802 */
1803void
1804vfs_unmountall()
1805{
1806	struct mount *mp, *nmp;
1807	struct proc *p = initproc;	/* XXX XXX should this be proc0? */
1808	int error;
1809
1810	/*
1811	 * Since this only runs when rebooting, it is not interlocked.
1812	 */
1813	for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
1814		nmp = mp->mnt_list.cqe_prev;
1815		error = dounmount(mp, MNT_FORCE, p);
1816		if (error) {
1817			printf("unmount of %s failed (",
1818			    mp->mnt_stat.f_mntonname);
1819			if (error == EBUSY)
1820				printf("BUSY)\n");
1821			else
1822				printf("%d)\n", error);
1823		}
1824	}
1825}
1826
1827/*
1828 * Build hash lists of net addresses and hang them off the mount point.
1829 * Called by ufs_mount() to set up the lists of export addresses.
1830 */
1831static int
1832vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
1833	struct export_args *argp)
1834{
1835	register struct netcred *np;
1836	register struct radix_node_head *rnh;
1837	register int i;
1838	struct radix_node *rn;
1839	struct sockaddr *saddr, *smask = 0;
1840	struct domain *dom;
1841	int error;
1842
1843	if (argp->ex_addrlen == 0) {
1844		if (mp->mnt_flag & MNT_DEFEXPORTED)
1845			return (EPERM);
1846		np = &nep->ne_defexported;
1847		np->netc_exflags = argp->ex_flags;
1848		np->netc_anon = argp->ex_anon;
1849		np->netc_anon.cr_ref = 1;
1850		mp->mnt_flag |= MNT_DEFEXPORTED;
1851		return (0);
1852	}
1853	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
1854	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
1855	bzero((caddr_t) np, i);
1856	saddr = (struct sockaddr *) (np + 1);
1857	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
1858		goto out;
1859	if (saddr->sa_len > argp->ex_addrlen)
1860		saddr->sa_len = argp->ex_addrlen;
1861	if (argp->ex_masklen) {
1862		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
1863		error = copyin(argp->ex_addr, (caddr_t) smask, argp->ex_masklen);
1864		if (error)
1865			goto out;
1866		if (smask->sa_len > argp->ex_masklen)
1867			smask->sa_len = argp->ex_masklen;
1868	}
1869	i = saddr->sa_family;
1870	if ((rnh = nep->ne_rtable[i]) == 0) {
1871		/*
1872		 * Seems silly to initialize every AF when most are not used;
1873		 * do so on demand here.
1874		 */
1875		for (dom = domains; dom; dom = dom->dom_next)
1876			if (dom->dom_family == i && dom->dom_rtattach) {
1877				dom->dom_rtattach((void **) &nep->ne_rtable[i],
1878				    dom->dom_rtoffset);
1879				break;
1880			}
1881		if ((rnh = nep->ne_rtable[i]) == 0) {
1882			error = ENOBUFS;
1883			goto out;
1884		}
1885	}
1886	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
1887	    np->netc_rnodes);
1888	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
1889		error = EPERM;
1890		goto out;
1891	}
1892	np->netc_exflags = argp->ex_flags;
1893	np->netc_anon = argp->ex_anon;
1894	np->netc_anon.cr_ref = 1;
1895	return (0);
1896out:
1897	free(np, M_NETADDR);
1898	return (error);
1899}
1900
1901/* ARGSUSED */
1902static int
1903vfs_free_netcred(struct radix_node *rn, void *w)
1904{
1905	register struct radix_node_head *rnh = (struct radix_node_head *) w;
1906
1907	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
1908	free((caddr_t) rn, M_NETADDR);
1909	return (0);
1910}
1911
1912/*
1913 * Free the net address hash lists that are hanging off the mount points.
1914 */
1915static void
1916vfs_free_addrlist(struct netexport *nep)
1917{
1918	register int i;
1919	register struct radix_node_head *rnh;
1920
1921	for (i = 0; i <= AF_MAX; i++)
1922		if ((rnh = nep->ne_rtable[i])) {
1923			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
1924			    (caddr_t) rnh);
1925			free((caddr_t) rnh, M_RTABLE);
1926			nep->ne_rtable[i] = 0;
1927		}
1928}
1929
1930int
1931vfs_export(mp, nep, argp)
1932	struct mount *mp;
1933	struct netexport *nep;
1934	struct export_args *argp;
1935{
1936	int error;
1937
1938	if (argp->ex_flags & MNT_DELEXPORT) {
1939		vfs_free_addrlist(nep);
1940		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
1941	}
1942	if (argp->ex_flags & MNT_EXPORTED) {
1943		if ((error = vfs_hang_addrlist(mp, nep, argp)))
1944			return (error);
1945		mp->mnt_flag |= MNT_EXPORTED;
1946	}
1947	return (0);
1948}
1949
1950struct netcred *
1951vfs_export_lookup(mp, nep, nam)
1952	register struct mount *mp;
1953	struct netexport *nep;
1954	struct mbuf *nam;
1955{
1956	register struct netcred *np;
1957	register struct radix_node_head *rnh;
1958	struct sockaddr *saddr;
1959
1960	np = NULL;
1961	if (mp->mnt_flag & MNT_EXPORTED) {
1962		/*
1963		 * Lookup in the export list first.
1964		 */
1965		if (nam != NULL) {
1966			saddr = mtod(nam, struct sockaddr *);
1967			rnh = nep->ne_rtable[saddr->sa_family];
1968			if (rnh != NULL) {
1969				np = (struct netcred *)
1970					(*rnh->rnh_matchaddr)((caddr_t)saddr,
1971							      rnh);
1972				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
1973					np = NULL;
1974			}
1975		}
1976		/*
1977		 * If no address match, use the default if it exists.
1978		 */
1979		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
1980			np = &nep->ne_defexported;
1981	}
1982	return (np);
1983}
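
/*
 * Example use of vfs_export_lookup(): an NFS-style server, given the
 * client's address in an mbuf, asks the exporting filesystem for the
 * matching export credentials before honoring a request.  A minimal
 * sketch; where the netexport structure lives is filesystem-specific.
 */
static int
example_check_export(mp, nep, nam, credp, exflagsp)
	struct mount *mp;
	struct netexport *nep;
	struct mbuf *nam;
	struct ucred **credp;
	int *exflagsp;
{
	struct netcred *np;

	np = vfs_export_lookup(mp, nep, nam);
	if (np == NULL)
		return (EACCES);	/* host is not exported to */
	*credp = &np->netc_anon;
	*exflagsp = np->netc_exflags;
	return (0);
}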
1984
1985/*
1986 * Perform msync on all vnodes under a mount point.
1987 * The mount point must be locked.
1988 */
1989void
1990vfs_msync(struct mount *mp, int flags) {
1991	struct vnode *vp, *nvp;
1992loop:
1993	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {
1994
1995		if (vp->v_mount != mp)
1996			goto loop;
1997		nvp = vp->v_mntvnodes.le_next;
1998		if (VOP_ISLOCKED(vp) && (flags != MNT_WAIT))
1999			continue;
2000		if (vp->v_object &&
2001		   (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
2002			vm_object_page_clean(vp->v_object, 0, 0, TRUE, TRUE);
2003		}
2004	}
2005}
2006
2007/*
2008 * Create the VM object needed for VMIO and mmap support.  This
2009 * is done for all VREG files in the system.  Some filesystems might
2010 * afford the additional metadata buffering capability of the
2011 * VMIO code by making the device node be VMIO mode also.
2012 */
2013int
2014vfs_object_create(vp, p, cred, waslocked)
2015	struct vnode *vp;
2016	struct proc *p;
2017	struct ucred *cred;
2018	int waslocked;
2019{
2020	struct vattr vat;
2021	vm_object_t object;
2022	int error = 0;
2023
2024retry:
2025	if ((object = vp->v_object) == NULL) {
2026		if (vp->v_type == VREG) {
2027			if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
2028				goto retn;
2029			(void) vnode_pager_alloc(vp,
2030				OFF_TO_IDX(round_page(vat.va_size)), 0, 0);
2031		} else {
2032			/*
2033			 * This simply allocates the biggest object possible
2034			 * for a VBLK vnode.  This should be fixed, but doesn't
2035			 * cause any problems (yet).
2036			 */
2037			(void) vnode_pager_alloc(vp, INT_MAX, 0, 0);
2038		}
2039		vp->v_object->flags |= OBJ_VFS_REF;
2040	} else {
2041		if (object->flags & OBJ_DEAD) {
2042			if (waslocked)
2043				VOP_UNLOCK(vp, 0, p);
2044			tsleep(object, PVM, "vodead", 0);
2045			if (waslocked)
2046				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
2047			goto retry;
2048		}
2049		if ((object->flags & OBJ_VFS_REF) == 0) {
2050			object->flags |= OBJ_VFS_REF;
2051			vm_object_reference(object);
2052		}
2053	}
2054	if (vp->v_object)
2055		vp->v_flag |= VVMIO;
2056
2057retn:
2058	return error;
2059}
2060