vfs_subr.c revision 34928
1/*
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
39 * $Id: vfs_subr.c,v 1.146 1998/03/28 12:04:32 bde Exp $
40 */
41
42/*
43 * External virtual filesystem routines
44 */
45#include "opt_ddb.h"
46#include "opt_devfs.h"
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/kernel.h>
51#include <sys/proc.h>
52#include <sys/malloc.h>
53#include <sys/mount.h>
54#include <sys/socket.h>
55#include <sys/vnode.h>
56#include <sys/stat.h>
57#include <sys/buf.h>
58#include <sys/domain.h>
59#include <sys/dirent.h>
60#include <sys/vmmeter.h>
61
62#include <machine/limits.h>
63
64#include <vm/vm.h>
65#include <vm/vm_object.h>
66#include <vm/vm_extern.h>
67#include <vm/pmap.h>
68#include <vm/vm_map.h>
69#include <vm/vm_pager.h>
70#include <vm/vnode_pager.h>
71#include <vm/vm_zone.h>
72#include <sys/sysctl.h>
73
74#include <miscfs/specfs/specdev.h>
75
76static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
77
78static void	insmntque __P((struct vnode *vp, struct mount *mp));
79#ifdef DDB
80static void	printlockedvnodes __P((void));
81#endif
82static void	vclean __P((struct vnode *vp, int flags, struct proc *p));
83static void	vfree __P((struct vnode *));
84static void	vgonel __P((struct vnode *vp, struct proc *p));
85static unsigned long	numvnodes;
86SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
87
88enum vtype iftovt_tab[16] = {
89	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
90	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
91};
92int vttoif_tab[9] = {
93	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
94	S_IFSOCK, S_IFIFO, S_IFMT,
95};
96
97/*
98 * Insq/Remq for the vnode usage lists.
99 */
100#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
101#define	bufremvn(bp) {							\
102	LIST_REMOVE(bp, b_vnbufs);					\
103	(bp)->b_vnbufs.le_next = NOLIST;				\
104}
105
106static TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */
107struct tobefreelist vnode_tobefree_list;	/* vnodes queued to be freed */
108
109static u_long wantfreevnodes = 25;
110SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
111static u_long freevnodes = 0;
112SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
113
114int vfs_ioopt = 0;
115#ifdef ENABLE_VFS_IOOPT
116SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
117#endif
118
119struct mntlist mountlist;	/* mounted filesystem list */
120struct simplelock mountlist_slock;
121static struct simplelock mntid_slock;
122struct simplelock mntvnode_slock;
123static struct simplelock vnode_free_list_slock;
124static struct simplelock spechash_slock;
125struct nfs_public nfs_pub;	/* publicly exported FS */
126static vm_zone_t vnode_zone;
127
128/*
129 * The workitem queue.
130 */
131#define SYNCER_MAXDELAY		32
132int syncer_maxdelay =		SYNCER_MAXDELAY;	/* maximum delay time */
133time_t syncdelay =		30;		/* max time to delay syncing data */
134int rushjob;				/* number of slots to run ASAP */
135
136static int syncer_delayno = 0;
137static long syncer_mask;
138LIST_HEAD(synclist, vnode);
139static struct synclist *syncer_workitem_pending;
140
141int desiredvnodes;
142SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "");
143
144static void	vfs_free_addrlist __P((struct netexport *nep));
145static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
146static int	vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
147				       struct export_args *argp));
148
149/*
150 * Initialize the vnode management data structures.
151 */
152void
153vntblinit()
154{
155
156	desiredvnodes = maxproc + cnt.v_page_count / 4;
157	simple_lock_init(&mntvnode_slock);
158	simple_lock_init(&mntid_slock);
159	simple_lock_init(&spechash_slock);
160	TAILQ_INIT(&vnode_free_list);
161	TAILQ_INIT(&vnode_tobefree_list);
162	simple_lock_init(&vnode_free_list_slock);
163	CIRCLEQ_INIT(&mountlist);
164	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
165	/*
166	 * Initialize the filesystem syncer.
167	 */
168	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
169		&syncer_mask);
170	syncer_maxdelay = syncer_mask + 1;
171}
172
173/*
174 * Mark a mount point as busy. Used to synchronize access and to delay
175 * unmounting. Interlock is not released on failure.
176 */
177int
178vfs_busy(mp, flags, interlkp, p)
179	struct mount *mp;
180	int flags;
181	struct simplelock *interlkp;
182	struct proc *p;
183{
184	int lkflags;
185
186	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
187		if (flags & LK_NOWAIT)
188			return (ENOENT);
189		mp->mnt_kern_flag |= MNTK_MWAIT;
190		if (interlkp) {
191			simple_unlock(interlkp);
192		}
193		/*
194		 * Since all busy locks are shared except the exclusive
195		 * lock granted when unmounting, the only place that a
196		 * wakeup needs to be done is at the release of the
197		 * exclusive lock at the end of dounmount.
198		 */
199		tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
200		if (interlkp) {
201			simple_lock(interlkp);
202		}
203		return (ENOENT);
204	}
205	lkflags = LK_SHARED | LK_NOPAUSE;
206	if (interlkp)
207		lkflags |= LK_INTERLOCK;
208	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
209		panic("vfs_busy: unexpected lock failure");
210	return (0);
211}
212
213/*
214 * Free a busy filesystem.
215 */
216void
217vfs_unbusy(mp, p)
218	struct mount *mp;
219	struct proc *p;
220{
221
222	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
223}
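
/*
 * Editor's sketch (not part of revision 34928): how callers typically pair
 * vfs_busy() and vfs_unbusy() while walking the mount list; the same pattern
 * appears in printlockedvnodes() later in this file.  The function name is
 * purely illustrative.
 */
#if 0
static void
example_walk_mounts(struct proc *p)
{
	struct mount *mp, *nmp;

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
		/* Skip mount points that are currently being unmounted. */
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = mp->mnt_list.cqe_next;
			continue;
		}
		/* ... mp is busied here and cannot be unmounted ... */
		simple_lock(&mountlist_slock);
		nmp = mp->mnt_list.cqe_next;
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);
}
#endif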
224
225/*
226 * Lookup a filesystem type, and if found allocate and initialize
227 * a mount structure for it.
228 *
229 * Devname is usually updated by mount(8) after booting.
230 */
231int
232vfs_rootmountalloc(fstypename, devname, mpp)
233	char *fstypename;
234	char *devname;
235	struct mount **mpp;
236{
237	struct proc *p = curproc;	/* XXX */
238	struct vfsconf *vfsp;
239	struct mount *mp;
240
241	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
242		if (!strcmp(vfsp->vfc_name, fstypename))
243			break;
244	if (vfsp == NULL)
245		return (ENODEV);
246	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
247	bzero((char *)mp, (u_long)sizeof(struct mount));
248	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
249	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
250	LIST_INIT(&mp->mnt_vnodelist);
251	mp->mnt_vfc = vfsp;
252	mp->mnt_op = vfsp->vfc_vfsops;
253	mp->mnt_flag = MNT_RDONLY;
254	mp->mnt_vnodecovered = NULLVP;
255	vfsp->vfc_refcount++;
256	mp->mnt_stat.f_type = vfsp->vfc_typenum;
257	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
258	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
259	mp->mnt_stat.f_mntonname[0] = '/';
260	mp->mnt_stat.f_mntonname[1] = 0;
261	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
262	*mpp = mp;
263	return (0);
264}
265
266/*
267 * Find an appropriate filesystem to use for the root. If a filesystem
268 * has not been preselected, walk through the list of known filesystems
269 * trying those that have mountroot routines, and try them until one
270 * works or we have tried them all.
271 */
272#ifdef notdef	/* XXX JH */
273int
274lite2_vfs_mountroot()
275{
276	struct vfsconf *vfsp;
277	extern int (*lite2_mountroot) __P((void));
278	int error;
279
280	if (lite2_mountroot != NULL)
281		return ((*lite2_mountroot)());
282	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
283		if (vfsp->vfc_mountroot == NULL)
284			continue;
285		if ((error = (*vfsp->vfc_mountroot)()) == 0)
286			return (0);
287		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
288	}
289	return (ENODEV);
290}
291#endif
292
293/*
294 * Lookup a mount point by filesystem identifier.
295 */
296struct mount *
297vfs_getvfs(fsid)
298	fsid_t *fsid;
299{
300	register struct mount *mp;
301
302	simple_lock(&mountlist_slock);
303	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
304	    mp = mp->mnt_list.cqe_next) {
305		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
306		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
307			simple_unlock(&mountlist_slock);
308			return (mp);
309		}
310	}
311	simple_unlock(&mountlist_slock);
312	return ((struct mount *) 0);
313}
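
/*
 * Editor's sketch (not part of revision 34928): vfs_getvfs() is how a file
 * handle or similar identifier is mapped back to its mount point.  fhp is
 * assumed to be a file handle pointer in scope; the ESTALE handling is the
 * conventional response when the filesystem is no longer mounted.
 */
#if 0
	struct mount *mp;

	if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL)
		return (ESTALE);
	/* ... hand fhp->fh_fid to the filesystem's fhtovp routine ... */
#endif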
314
315/*
316 * Get a new unique fsid
317 */
318void
319vfs_getnewfsid(mp)
320	struct mount *mp;
321{
322	static u_short xxxfs_mntid;
323
324	fsid_t tfsid;
325	int mtype;
326
327	simple_lock(&mntid_slock);
328	mtype = mp->mnt_vfc->vfc_typenum;
329	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
330	mp->mnt_stat.f_fsid.val[1] = mtype;
331	if (xxxfs_mntid == 0)
332		++xxxfs_mntid;
333	tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
334	tfsid.val[1] = mtype;
335	if (mountlist.cqh_first != (void *)&mountlist) {
336		while (vfs_getvfs(&tfsid)) {
337			tfsid.val[0]++;
338			xxxfs_mntid++;
339		}
340	}
341	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
342	simple_unlock(&mntid_slock);
343}
344
345/*
346 * Set vnode attributes to VNOVAL
347 */
348void
349vattr_null(vap)
350	register struct vattr *vap;
351{
352
353	vap->va_type = VNON;
354	vap->va_size = VNOVAL;
355	vap->va_bytes = VNOVAL;
356	vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
357	    vap->va_fsid = vap->va_fileid =
358	    vap->va_blocksize = vap->va_rdev =
359	    vap->va_atime.tv_sec = vap->va_atime.tv_nsec =
360	    vap->va_mtime.tv_sec = vap->va_mtime.tv_nsec =
361	    vap->va_ctime.tv_sec = vap->va_ctime.tv_nsec =
362	    vap->va_flags = vap->va_gen = VNOVAL;
363	vap->va_vaflags = 0;
364}
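
/*
 * Editor's sketch (not part of revision 34928): callers normally reach
 * vattr_null() through the VATTR_NULL() macro before setting only the
 * attributes they care about, so that VOP_SETATTR() ignores the rest.
 * vp, cred and p are assumed to be in scope.
 */
#if 0
	struct vattr va;
	int error;

	VATTR_NULL(&va);
	va.va_size = 0;			/* truncate; everything else stays VNOVAL */
	error = VOP_SETATTR(vp, &va, cred, p);
#endif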
365
366/*
367 * Routines having to do with the management of the vnode table.
368 */
369extern vop_t **dead_vnodeop_p;
370
371/*
372 * Return the next vnode from the free list.
373 */
374int
375getnewvnode(tag, mp, vops, vpp)
376	enum vtagtype tag;
377	struct mount *mp;
378	vop_t **vops;
379	struct vnode **vpp;
380{
381	int s;
382	struct proc *p = curproc;	/* XXX */
383	struct vnode *vp, *tvp, *nvp;
384	vm_object_t object;
385	TAILQ_HEAD(freelst, vnode) vnode_tmp_list;
386
387	/*
388	 * We take the least recently used vnode from the freelist
389	 * if we can get it and it has no cached pages, and no
390	 * namecache entries are relative to it.
391	 * Otherwise we allocate a new vnode
392	 */
393
394	s = splbio();
395	simple_lock(&vnode_free_list_slock);
396	TAILQ_INIT(&vnode_tmp_list);
397
398	for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) {
399		nvp = TAILQ_NEXT(vp, v_freelist);
400		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
401		if (vp->v_flag & VAGE) {
402			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
403		} else {
404			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
405		}
406		vp->v_flag &= ~(VTBFREE|VAGE);
407		vp->v_flag |= VFREE;
408		if (vp->v_usecount)
409			panic("to-be-free vnode isn't free");
410		freevnodes++;
411	}
412
413	if (wantfreevnodes && freevnodes < wantfreevnodes) {
414		vp = NULL;
415	} else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
416		/*
417		 * XXX: this is only here to be backwards compatible
418		 */
419		vp = NULL;
420	} else {
421		for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) {
422
423			nvp = TAILQ_NEXT(vp, v_freelist);
424
425			if (!simple_lock_try(&vp->v_interlock))
426				continue;
427			if (vp->v_usecount)
428				panic("free vnode isn't");
429
430			object = vp->v_object;
431			if (object && (object->resident_page_count || object->ref_count)) {
432				printf("object inconsistent state: RPC: %d, RC: %d\n",
433					object->resident_page_count, object->ref_count);
434				/* Don't recycle if it's caching some pages */
435				TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
436				TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist);
437				continue;
438			} else if (LIST_FIRST(&vp->v_cache_src)) {
439				/* Don't recycle if active in the namecache */
440				simple_unlock(&vp->v_interlock);
441				continue;
442			} else {
443				break;
444			}
445		}
446	}
447
448	for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) {
449		nvp = TAILQ_NEXT(tvp, v_freelist);
450		TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist);
451		TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist);
452		simple_unlock(&tvp->v_interlock);
453	}
454
455	if (vp) {
456		vp->v_flag |= VDOOMED;
457		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
458		freevnodes--;
459		simple_unlock(&vnode_free_list_slock);
460		cache_purge(vp);
461		vp->v_lease = NULL;
462		if (vp->v_type != VBAD) {
463			vgonel(vp, p);
464		} else {
465			simple_unlock(&vp->v_interlock);
466		}
467
468#ifdef DIAGNOSTIC
469		{
470			int s;
471
472			if (vp->v_data)
473				panic("cleaned vnode isn't");
474			s = splbio();
475			if (vp->v_numoutput)
476				panic("Clean vnode has pending I/O's");
477			splx(s);
478		}
479#endif
480		vp->v_flag = 0;
481		vp->v_lastr = 0;
482		vp->v_lastw = 0;
483		vp->v_lasta = 0;
484		vp->v_cstart = 0;
485		vp->v_clen = 0;
486		vp->v_socket = 0;
487		vp->v_writecount = 0;	/* XXX */
488		vp->v_maxio = 0;
489	} else {
490		simple_unlock(&vnode_free_list_slock);
491		vp = (struct vnode *) zalloc(vnode_zone);
492		bzero((char *) vp, sizeof *vp);
493		simple_lock_init(&vp->v_interlock);
494		vp->v_dd = vp;
495		cache_purge(vp);
496		LIST_INIT(&vp->v_cache_src);
497		TAILQ_INIT(&vp->v_cache_dst);
498		numvnodes++;
499	}
500
501	vp->v_type = VNON;
502	vp->v_tag = tag;
503	vp->v_op = vops;
504	insmntque(vp, mp);
505	*vpp = vp;
506	vp->v_usecount = 1;
507	vp->v_data = 0;
508	splx(s);
509
510	vfs_object_create(vp, p, p->p_ucred, TRUE);
511	return (0);
512}
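
/*
 * Editor's sketch (not part of revision 34928): a filesystem's inode lookup
 * path obtains a fresh vnode like this before wiring up its private data.
 * The VT_UFS tag and ffs_vnodeop_p vector are shown only as an example;
 * mp and the in-core inode ip are assumed to be in scope.
 */
#if 0
	struct vnode *vp;
	int error;

	error = getnewvnode(VT_UFS, mp, ffs_vnodeop_p, &vp);
	if (error)
		return (error);
	vp->v_data = ip;			/* hang filesystem-private data off it */
	vp->v_type = IFTOVT(ip->i_mode);
#endif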
513
514/*
515 * Move a vnode from one mount queue to another.
516 */
517static void
518insmntque(vp, mp)
519	register struct vnode *vp;
520	register struct mount *mp;
521{
522
523	simple_lock(&mntvnode_slock);
524	/*
525	 * Delete from old mount point vnode list, if on one.
526	 */
527	if (vp->v_mount != NULL)
528		LIST_REMOVE(vp, v_mntvnodes);
529	/*
530	 * Insert into list of vnodes for the new mount point, if available.
531	 */
532	if ((vp->v_mount = mp) == NULL) {
533		simple_unlock(&mntvnode_slock);
534		return;
535	}
536	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
537	simple_unlock(&mntvnode_slock);
538}
539
540/*
541 * Update outstanding I/O count and do wakeup if requested.
542 */
543void
544vwakeup(bp)
545	register struct buf *bp;
546{
547	register struct vnode *vp;
548
549	bp->b_flags &= ~B_WRITEINPROG;
550	if ((vp = bp->b_vp)) {
551		vp->v_numoutput--;
552		if (vp->v_numoutput < 0)
553			panic("vwakeup: neg numoutput");
554		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
555			vp->v_flag &= ~VBWAIT;
556			wakeup((caddr_t) &vp->v_numoutput);
557		}
558	}
559}
560
561/*
562 * Flush out and invalidate all buffers associated with a vnode.
563 * Called with the underlying object locked.
564 */
565int
566vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
567	register struct vnode *vp;
568	int flags;
569	struct ucred *cred;
570	struct proc *p;
571	int slpflag, slptimeo;
572{
573	register struct buf *bp;
574	struct buf *nbp, *blist;
575	int s, error;
576	vm_object_t object;
577
578	if ((flags & V_SAVE) && vp->v_dirtyblkhd.lh_first != NULL) {
579		if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)))
580			return (error);
581		if (vp->v_dirtyblkhd.lh_first != NULL)
582			panic("vinvalbuf: dirty bufs");
583	}
584
585	s = splbio();
586	for (;;) {
587		if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA))
588			while (blist && blist->b_lblkno < 0)
589				blist = blist->b_vnbufs.le_next;
590		if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
591		    (flags & V_SAVEMETA))
592			while (blist && blist->b_lblkno < 0)
593				blist = blist->b_vnbufs.le_next;
594		if (!blist)
595			break;
596
597		for (bp = blist; bp; bp = nbp) {
598			nbp = bp->b_vnbufs.le_next;
599			if ((flags & V_SAVEMETA) && bp->b_lblkno < 0)
600				continue;
601			if (bp->b_flags & B_BUSY) {
602				bp->b_flags |= B_WANTED;
603				error = tsleep((caddr_t) bp,
604				    slpflag | (PRIBIO + 4), "vinvalbuf",
605				    slptimeo);
606				if (error) {
607					splx(s);
608					return (error);
609				}
610				break;
611			}
612			/*
613			 * XXX Since there are no node locks for NFS, I
614			 * believe there is a slight chance that a delayed
615			 * write will occur while sleeping just above, so
616			 * check for it.  Note that vfs_bio_awrite expects
617			 * buffers to reside on a queue, while VOP_BWRITE and
618			 * brelse do not.
619			 */
620			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
621				(flags & V_SAVE)) {
622
623				if (bp->b_vp == vp) {
624					if (bp->b_flags & B_CLUSTEROK) {
625						vfs_bio_awrite(bp);
626					} else {
627						bremfree(bp);
628						bp->b_flags |= (B_BUSY | B_ASYNC);
629						VOP_BWRITE(bp);
630					}
631				} else {
632					bremfree(bp);
633					bp->b_flags |= B_BUSY;
634					(void) VOP_BWRITE(bp);
635				}
636				break;
637			}
638			bremfree(bp);
639			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF | B_BUSY);
640			bp->b_flags &= ~B_ASYNC;
641			brelse(bp);
642		}
643	}
644
645	while (vp->v_numoutput > 0) {
646		vp->v_flag |= VBWAIT;
647		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
648	}
649
650	splx(s);
651
652	/*
653	 * Destroy the copy in the VM cache, too.
654	 */
655	simple_lock(&vp->v_interlock);
656	object = vp->v_object;
657	if (object != NULL) {
658		if (flags & V_SAVEMETA)
659			vm_object_page_remove(object, 0, object->size,
660				(flags & V_SAVE) ? TRUE : FALSE);
661		else
662			vm_object_page_remove(object, 0, 0,
663				(flags & V_SAVE) ? TRUE : FALSE);
664	}
665	simple_unlock(&vp->v_interlock);
666
667	if (!(flags & V_SAVEMETA) &&
668	    (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first))
669		panic("vinvalbuf: flush failed");
670	return (0);
671}
672
673/*
674 * Truncate a file's buffer and pages to a specified length.  This
675 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
676 * sync activity.
677 */
678int
679vtruncbuf(vp, cred, p, length, blksize)
680	register struct vnode *vp;
681	struct ucred *cred;
682	struct proc *p;
683	off_t length;
684	int blksize;
685{
686	register struct buf *bp;
687	struct buf *nbp, *blist;
688	int s, error, anyfreed;
689	vm_object_t object;
690	int trunclbn;
691
692	/*
693	 * Round up to the *next* lbn.
694	 */
695	trunclbn = (length + blksize - 1) / blksize;
696
697	s = splbio();
698restart:
699	anyfreed = 1;
700	for (;anyfreed;) {
701		anyfreed = 0;
702		for ( bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
703
704			nbp = LIST_NEXT(bp, b_vnbufs);
705
706			if (bp->b_lblkno >= trunclbn) {
707				if (bp->b_flags & B_BUSY) {
708					bp->b_flags |= B_WANTED;
709					tsleep(bp, PRIBIO + 4, "vtrb1", 0);
710					goto restart;
711				} else {
712					bremfree(bp);
713					bp->b_flags |= (B_BUSY | B_INVAL | B_RELBUF);
714					bp->b_flags &= ~B_ASYNC;
715					brelse(bp);
716					anyfreed = 1;
717				}
718				if (nbp &&
719					((LIST_NEXT(nbp, b_vnbufs) == NOLIST) ||
720					 (nbp->b_vp != vp) ||
721					 (nbp->b_flags & B_DELWRI))) {
722					goto restart;
723				}
724			}
725		}
726
727		for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
728
729			nbp = LIST_NEXT(bp, b_vnbufs);
730
731			if (bp->b_lblkno >= trunclbn) {
732				if (bp->b_flags & B_BUSY) {
733					bp->b_flags |= B_WANTED;
734					tsleep(bp, PRIBIO + 4, "vtrb2", 0);
735					goto restart;
736				} else {
737					bremfree(bp);
738					bp->b_flags |= (B_BUSY | B_INVAL | B_RELBUF);
739					bp->b_flags &= ~B_ASYNC;
740					brelse(bp);
741					anyfreed = 1;
742				}
743				if (nbp &&
744					((LIST_NEXT(nbp, b_vnbufs) == NOLIST) ||
745					 (nbp->b_vp != vp) ||
746					 (nbp->b_flags & B_DELWRI) == 0)) {
747					goto restart;
748				}
749			}
750		}
751	}
752
753	if (length > 0) {
754restartsync:
755		for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
756
757			nbp = LIST_NEXT(bp, b_vnbufs);
758
759			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
760				if (bp->b_flags & B_BUSY) {
761					bp->b_flags |= B_WANTED;
762					tsleep(bp, PRIBIO, "vtrb3", 0);
763				} else {
764					bremfree(bp);
765					bp->b_flags |= B_BUSY;
766					if (bp->b_vp == vp) {
767						bp->b_flags |= B_ASYNC;
768					} else {
769						bp->b_flags &= ~B_ASYNC;
770					}
771					VOP_BWRITE(bp);
772				}
773				goto restartsync;
774			}
775
776		}
777	}
778
779	while (vp->v_numoutput > 0) {
780		vp->v_flag |= VBWAIT;
781		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
782	}
783
784	splx(s);
785
786	vnode_pager_setsize(vp, length);
787
788	return (0);
789}
790
791/*
792 * Associate a buffer with a vnode.
793 */
794void
795bgetvp(vp, bp)
796	register struct vnode *vp;
797	register struct buf *bp;
798{
799	int s;
800
801#if defined(DIAGNOSTIC)
802	if (bp->b_vp)
803		panic("bgetvp: not free");
804#endif
805	vhold(vp);
806	bp->b_vp = vp;
807	if (vp->v_type == VBLK || vp->v_type == VCHR)
808		bp->b_dev = vp->v_rdev;
809	else
810		bp->b_dev = NODEV;
811	/*
812	 * Insert onto list for new vnode.
813	 */
814	s = splbio();
815	bufinsvn(bp, &vp->v_cleanblkhd);
816	splx(s);
817}
818
819/*
820 * Disassociate a buffer from a vnode.
821 */
822void
823brelvp(bp)
824	register struct buf *bp;
825{
826	struct vnode *vp;
827	int s;
828
829#if defined(DIAGNOSTIC)
830	if (bp->b_vp == (struct vnode *) 0)
831		panic("brelvp: NULL");
832#endif
833
834	/*
835	 * Delete from old vnode list, if on one.
836	 */
837	vp = bp->b_vp;
838	s = splbio();
839	if (bp->b_vnbufs.le_next != NOLIST)
840		bufremvn(bp);
841	if ((vp->v_flag & VONWORKLST) && (LIST_FIRST(&vp->v_dirtyblkhd) == NULL)) {
842		vp->v_flag &= ~VONWORKLST;
843		LIST_REMOVE(vp, v_synclist);
844	}
845	splx(s);
846	bp->b_vp = (struct vnode *) 0;
847	vdrop(vp);
848}
849
850/*
851 * The workitem queue.
852 *
853 * It is useful to delay writes of file data and filesystem metadata
854 * for tens of seconds so that quickly created and deleted files need
855 * not waste disk bandwidth being created and removed. To realize this,
856 * we append vnodes to a "workitem" queue. When running with a soft
857 * updates implementation, most pending metadata dependencies should
858 * not wait for more than a few seconds. Thus, metadata buffers on
859 * filesystems mounted on block devices are delayed only about half the
860 * time that file data is delayed. Similarly, directory updates are
861 * more critical, so are only delayed about a third the time that file
862 * data is delayed. Thus, there are SYNCER_MAXDELAY queues that are
863 * processed round-robin, one each second (driven off the syncer process). The
864 * syncer_delayno variable indicates the next queue that is to be processed.
865 * Items that need to be processed soon are placed in this queue:
866 *
867 *	syncer_workitem_pending[syncer_delayno]
868 *
869 * A delay of fifteen seconds is done by placing the request fifteen
870 * entries later in the queue:
871 *
872 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
873 *
874 */
875
876/*
877 * Add an item to the syncer work queue.
878 */
879void
880vn_syncer_add_to_worklist(vp, delay)
881	struct vnode *vp;
882	int delay;
883{
884	int s, slot;
885
886	s = splbio();
887
888	if (vp->v_flag & VONWORKLST) {
889		LIST_REMOVE(vp, v_synclist);
890	}
891
892	if (delay > syncer_maxdelay - 2)
893		delay = syncer_maxdelay - 2;
894	slot = (syncer_delayno + delay) & syncer_mask;
895
896	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
897	vp->v_flag |= VONWORKLST;
898	splx(s);
899}
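
/*
 * Editor's sketch (not part of revision 34928): reassignbuf() below uses
 * shorter delays for metadata-heavy vnodes; a newly dirtied directory is
 * scheduled at a third of the normal delay.  With syncdelay = 30 and
 * SYNCER_MAXDELAY = 32, the call below lands in slot
 * (syncer_delayno + 10) & syncer_mask, so the vnode is fsync'ed roughly
 * 10 seconds from now.  dvp is an assumed directory vnode.
 */
#if 0
	vn_syncer_add_to_worklist(dvp, syncdelay / 3);
#endif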
900
901static void sched_sync __P((void));
902static struct	proc *updateproc;
903static struct kproc_desc up_kp = {
904	"syncer",
905	sched_sync,
906	&updateproc
907};
908SYSINIT_KT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
909
910/*
911 * System filesystem synchronizer daemon.
912 */
913void
914sched_sync(void)
915{
916	struct synclist *slp;
917	struct vnode *vp;
918	long starttime;
919	int s;
920	struct proc *p = updateproc;
921
922	for (;;) {
923		starttime = time.tv_sec;
924
925		/*
926		 * Push files whose dirty time has expired.
927		 */
928		s = splbio();
929		slp = &syncer_workitem_pending[syncer_delayno];
930		syncer_delayno += 1;
931		if (syncer_delayno == syncer_maxdelay)
932			syncer_delayno = 0;
933		splx(s);
934
935		while ((vp = LIST_FIRST(slp)) != NULL) {
936			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
937			(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
938			VOP_UNLOCK(vp, 0, p);
939			if (LIST_FIRST(slp) == vp) {
940				if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL &&
941				    vp->v_type != VBLK)
942					panic("sched_sync: fsync failed");
943				/*
944				 * Move ourselves to the back of the sync list.
945				 */
946				LIST_REMOVE(vp, v_synclist);
947				vn_syncer_add_to_worklist(vp, syncdelay);
948			}
949		}
950
951		/*
952		 * Do soft update processing.
953		 */
954		if (bioops.io_sync)
955			(*bioops.io_sync)(NULL);
956
957		/*
958		 * The variable rushjob allows the kernel to speed up the
959		 * processing of the filesystem syncer process. A rushjob
960		 * value of N tells the filesystem syncer to process the next
961		 * N seconds worth of work on its queue ASAP. Currently rushjob
962		 * is used by the soft update code to speed up the filesystem
963		 * syncer process when the incore state is getting so far
964		 * ahead of the disk that the kernel memory pool is being
965		 * threatened with exhaustion.
966		 */
967		if (rushjob > 0) {
968			rushjob -= 1;
969			continue;
970		}
971		/*
972		 * If it has taken us less than a second to process the
973		 * current work, then wait. Otherwise start right over
974		 * again. We can still lose time if any single round
975		 * takes more than two seconds, but it does not really
976		 * matter as we are just trying to generally pace the
977		 * filesystem activity.
978		 */
979		if (time.tv_sec == starttime)
980			tsleep(&lbolt, PPAUSE, "syncer", 0);
981	}
982}
983
984/*
985 * Associate a p-buffer with a vnode.
986 */
987void
988pbgetvp(vp, bp)
989	register struct vnode *vp;
990	register struct buf *bp;
991{
992#if defined(DIAGNOSTIC)
993	if (bp->b_vp)
994		panic("pbgetvp: not free");
995#endif
996	bp->b_vp = vp;
997	if (vp->v_type == VBLK || vp->v_type == VCHR)
998		bp->b_dev = vp->v_rdev;
999	else
1000		bp->b_dev = NODEV;
1001}
1002
1003/*
1004 * Disassociate a p-buffer from a vnode.
1005 */
1006void
1007pbrelvp(bp)
1008	register struct buf *bp;
1009{
1010
1011#if defined(DIAGNOSTIC)
1012	if (bp->b_vp == (struct vnode *) 0)
1013		panic("pbrelvp: NULL");
1014#endif
1015
1016	bp->b_vp = (struct vnode *) 0;
1017}
1018
1019/*
1020 * Reassign a buffer from one vnode to another.
1021 * Used to assign file specific control information
1022 * (indirect blocks) to the vnode to which they belong.
1023 */
1024void
1025reassignbuf(bp, newvp)
1026	register struct buf *bp;
1027	register struct vnode *newvp;
1028{
1029	struct buflists *listheadp;
1030	int delay;
1031	int s;
1032
1033	if (newvp == NULL) {
1034		printf("reassignbuf: NULL\n");
1035		return;
1036	}
1037
1038	s = splbio();
1039	/*
1040	 * Delete from old vnode list, if on one.
1041	 */
1042	if (bp->b_vnbufs.le_next != NOLIST) {
1043		bufremvn(bp);
1044		vdrop(bp->b_vp);
1045	}
1046	/*
1047	 * If dirty, put on list of dirty buffers; otherwise insert onto list
1048	 * of clean buffers.
1049	 */
1050	if (bp->b_flags & B_DELWRI) {
1051		struct buf *tbp;
1052
1053		listheadp = &newvp->v_dirtyblkhd;
1054		if ((newvp->v_flag & VONWORKLST) == 0) {
1055			switch (newvp->v_type) {
1056			case VDIR:
1057				delay = syncdelay / 3;
1058				break;
1059			case VBLK:
1060				if (newvp->v_specmountpoint != NULL) {
1061					delay = syncdelay / 2;
1062					break;
1063				}
1064				/* fall through */
1065			default:
1066				delay = syncdelay;
1067			}
1068			vn_syncer_add_to_worklist(newvp, delay);
1069		}
1070		tbp = listheadp->lh_first;
1071		if (!tbp || (tbp->b_lblkno > bp->b_lblkno)) {
1072			bufinsvn(bp, listheadp);
1073		} else {
1074			while (tbp->b_vnbufs.le_next &&
1075			    (tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) {
1076				tbp = tbp->b_vnbufs.le_next;
1077			}
1078			LIST_INSERT_AFTER(tbp, bp, b_vnbufs);
1079		}
1080	} else {
1081		bufinsvn(bp, &newvp->v_cleanblkhd);
1082		if ((newvp->v_flag & VONWORKLST) &&
1083			LIST_FIRST(&newvp->v_dirtyblkhd) == NULL) {
1084			newvp->v_flag &= ~VONWORKLST;
1085			LIST_REMOVE(newvp, v_synclist);
1086		}
1087	}
1088	bp->b_vp = newvp;
1089	vhold(bp->b_vp);
1090	splx(s);
1091}
1092
1093#ifndef DEVFS_ROOT
1094/*
1095 * Create a vnode for a block device.
1096 * Used for mounting the root file system.
1097 */
1098int
1099bdevvp(dev, vpp)
1100	dev_t dev;
1101	struct vnode **vpp;
1102{
1103	register struct vnode *vp;
1104	struct vnode *nvp;
1105	int error;
1106
1107	if (dev == NODEV)
1108		return (0);
1109	error = getnewvnode(VT_NON, (struct mount *) 0, spec_vnodeop_p, &nvp);
1110	if (error) {
1111		*vpp = 0;
1112		return (error);
1113	}
1114	vp = nvp;
1115	vp->v_type = VBLK;
1116	if ((nvp = checkalias(vp, dev, (struct mount *) 0))) {
1117		vput(vp);
1118		vp = nvp;
1119	}
1120	*vpp = vp;
1121	return (0);
1122}
1123#endif /* !DEVFS_ROOT */
1124
1125/*
1126 * Check to see if the new vnode represents a special device
1127 * for which we already have a vnode (either because of
1128 * bdevvp() or because of a different vnode representing
1129 * the same block device). If such an alias exists, deallocate
1130 * the existing contents and return the aliased vnode. The
1131 * caller is responsible for filling it with its new contents.
1132 */
1133struct vnode *
1134checkalias(nvp, nvp_rdev, mp)
1135	register struct vnode *nvp;
1136	dev_t nvp_rdev;
1137	struct mount *mp;
1138{
1139	struct proc *p = curproc;	/* XXX */
1140	struct vnode *vp;
1141	struct vnode **vpp;
1142
1143	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1144		return (NULLVP);
1145
1146	vpp = &speclisth[SPECHASH(nvp_rdev)];
1147loop:
1148	simple_lock(&spechash_slock);
1149	for (vp = *vpp; vp; vp = vp->v_specnext) {
1150		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
1151			continue;
1152		/*
1153		 * Alias, but not in use, so flush it out.
1154		 */
1155		simple_lock(&vp->v_interlock);
1156		if (vp->v_usecount == 0) {
1157			simple_unlock(&spechash_slock);
1158			vgonel(vp, p);
1159			goto loop;
1160		}
1161		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
1162			simple_unlock(&spechash_slock);
1163			goto loop;
1164		}
1165		break;
1166	}
1167	if (vp == NULL || vp->v_tag != VT_NON) {
1168		MALLOC(nvp->v_specinfo, struct specinfo *,
1169		    sizeof(struct specinfo), M_VNODE, M_WAITOK);
1170		nvp->v_rdev = nvp_rdev;
1171		nvp->v_hashchain = vpp;
1172		nvp->v_specnext = *vpp;
1173		nvp->v_specmountpoint = NULL;
1174		simple_unlock(&spechash_slock);
1175		*vpp = nvp;
1176		if (vp != NULLVP) {
1177			nvp->v_flag |= VALIASED;
1178			vp->v_flag |= VALIASED;
1179			vput(vp);
1180		}
1181		return (NULLVP);
1182	}
1183	simple_unlock(&spechash_slock);
1184	VOP_UNLOCK(vp, 0, p);
1185	simple_lock(&vp->v_interlock);
1186	vclean(vp, 0, p);
1187	vp->v_op = nvp->v_op;
1188	vp->v_tag = nvp->v_tag;
1189	nvp->v_type = VNON;
1190	insmntque(vp, mp);
1191	return (vp);
1192}
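
/*
 * Editor's sketch (not part of revision 34928): a filesystem that has just
 * read in a device inode runs the new vnode through checkalias() so that
 * all vnodes for the same dev_t share one set of buffers; UFS follows
 * roughly this pattern in its vnode-init code.  vp, rdev and mp are assumed
 * to be in scope.
 */
#if 0
	struct vnode *nvp;

	vp->v_op = spec_vnodeop_p;
	if ((nvp = checkalias(vp, rdev, mp)) != NULL) {
		/*
		 * An older alias existed; checkalias cleaned it out, so fill
		 * it with our contents and use it instead of vp.
		 */
		vp = nvp;
	}
#endif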
1193
1194/*
1195 * Grab a particular vnode from the free list, increment its
1196 * reference count and lock it. The vnode lock bit is set if the
1197 * vnode is being eliminated in vgone. The process is awakened
1198 * when the transition is completed, and an error returned to
1199 * indicate that the vnode is no longer usable (possibly having
1200 * been changed to a new file system type).
1201 */
1202int
1203vget(vp, flags, p)
1204	register struct vnode *vp;
1205	int flags;
1206	struct proc *p;
1207{
1208	int error;
1209
1210	/*
1211	 * If the vnode is in the process of being cleaned out for
1212	 * another use, we wait for the cleaning to finish and then
1213	 * return failure. Cleaning is determined by checking that
1214	 * the VXLOCK flag is set.
1215	 */
1216	if ((flags & LK_INTERLOCK) == 0) {
1217		simple_lock(&vp->v_interlock);
1218	}
1219	if (vp->v_flag & VXLOCK) {
1220		vp->v_flag |= VXWANT;
1221		simple_unlock(&vp->v_interlock);
1222		tsleep((caddr_t)vp, PINOD, "vget", 0);
1223		return (ENOENT);
1224	}
1225
1226	vp->v_usecount++;
1227
1228	if (VSHOULDBUSY(vp))
1229		vbusy(vp);
1230	if (flags & LK_TYPE_MASK) {
1231		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
1232			/*
1233			 * must expand vrele here because we do not want
1234			 * to call VOP_INACTIVE if the reference count
1235			 * drops back to zero since it was never really
1236			 * active. We must remove it from the free list
1237			 * before sleeping so that multiple processes do
1238			 * not try to recycle it.
1239			 */
1240			simple_lock(&vp->v_interlock);
1241			vp->v_usecount--;
1242			if (VSHOULDFREE(vp))
1243				vfree(vp);
1244			simple_unlock(&vp->v_interlock);
1245		}
1246		return (error);
1247	}
1248	simple_unlock(&vp->v_interlock);
1249	return (0);
1250}
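
/*
 * Editor's sketch (not part of revision 34928): the usual reference
 * protocol.  vget() takes a reference (and here an exclusive lock); vput(),
 * defined below, unlocks and releases in one step, while an unlocked
 * reference would instead be dropped with vrele().  vp and p are assumed
 * to be in scope.
 */
#if 0
	if (vget(vp, LK_EXCLUSIVE, p) == 0) {
		/* ... operate on the locked, referenced vnode ... */
		vput(vp);
	}
#endif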
1251
1252void
1253vref(struct vnode *vp)
1254{
1255	simple_lock(&vp->v_interlock);
1256	vp->v_usecount++;
1257	simple_unlock(&vp->v_interlock);
1258}
1259
1260/*
1261 * Vnode put/release.
1262 * If count drops to zero, call inactive routine and return to freelist.
1263 */
1264void
1265vrele(vp)
1266	struct vnode *vp;
1267{
1268	struct proc *p = curproc;	/* XXX */
1269
1270#ifdef DIAGNOSTIC
1271	if (vp == NULL)
1272		panic("vrele: null vp");
1273#endif
1274	simple_lock(&vp->v_interlock);
1275
1276	if (vp->v_usecount > 1) {
1277
1278		vp->v_usecount--;
1279		simple_unlock(&vp->v_interlock);
1280
1281		return;
1282	}
1283
1284	if (vp->v_usecount == 1) {
1285
1286		vp->v_usecount--;
1287
1288		if (VSHOULDFREE(vp))
1289			vfree(vp);
1290	/*
1291	 * If we are doing a vput, the node is already locked, and we must
1292	 * call VOP_INACTIVE with the node locked.  So, in the case of
1293	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1294	 */
1295		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
1296			VOP_INACTIVE(vp, p);
1297		}
1298
1299	} else {
1300#ifdef DIAGNOSTIC
1301		vprint("vrele: negative ref count", vp);
1302		simple_unlock(&vp->v_interlock);
1303#endif
1304		panic("vrele: negative ref cnt");
1305	}
1306}
1307
1308void
1309vput(vp)
1310	struct vnode *vp;
1311{
1312	struct proc *p = curproc;	/* XXX */
1313
1314#ifdef DIAGNOSTIC
1315	if (vp == NULL)
1316		panic("vput: null vp");
1317#endif
1318
1319	simple_lock(&vp->v_interlock);
1320
1321	if (vp->v_usecount > 1) {
1322
1323		vp->v_usecount--;
1324		VOP_UNLOCK(vp, LK_INTERLOCK, p);
1325		return;
1326
1327	}
1328
1329	if (vp->v_usecount == 1) {
1330
1331		vp->v_usecount--;
1332		if (VSHOULDFREE(vp))
1333			vfree(vp);
1334	/*
1335	 * If we are doing a vput, the node is already locked, and we must
1336	 * call VOP_INACTIVE with the node locked.  So, in the case of
1337	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1338	 */
1339		simple_unlock(&vp->v_interlock);
1340		VOP_INACTIVE(vp, p);
1341
1342	} else {
1343#ifdef DIAGNOSTIC
1344		vprint("vput: negative ref count", vp);
1345#endif
1346		panic("vput: negative ref cnt");
1347	}
1348}
1349
1350/*
1351 * Somebody doesn't want the vnode recycled.
1352 */
1353void
1354vhold(vp)
1355	register struct vnode *vp;
1356{
1357	int s;
1358
1359  	s = splbio();
1360	vp->v_holdcnt++;
1361	if (VSHOULDBUSY(vp))
1362		vbusy(vp);
1363	splx(s);
1364}
1365
1366/*
1367 * One less who cares about this vnode.
1368 */
1369void
1370vdrop(vp)
1371	register struct vnode *vp;
1372{
1373	int s;
1374
1375	s = splbio();
1376	if (vp->v_holdcnt <= 0)
1377		panic("vdrop: holdcnt");
1378	vp->v_holdcnt--;
1379	if (VSHOULDFREE(vp))
1380		vfree(vp);
1381	splx(s);
1382}
1383
1384/*
1385 * Remove any vnodes in the vnode table belonging to mount point mp.
1386 *
1387 * If MNT_NOFORCE is specified, there should not be any active ones,
1388 * return error if any are found (nb: this is a user error, not a
1389 * system error). If MNT_FORCE is specified, detach any active vnodes
1390 * that are found.
1391 */
1392#ifdef DIAGNOSTIC
1393static int busyprt = 0;		/* print out busy vnodes */
1394SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
1395#endif
1396
1397int
1398vflush(mp, skipvp, flags)
1399	struct mount *mp;
1400	struct vnode *skipvp;
1401	int flags;
1402{
1403	struct proc *p = curproc;	/* XXX */
1404	struct vnode *vp, *nvp;
1405	int busy = 0;
1406
1407	simple_lock(&mntvnode_slock);
1408loop:
1409	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
1410		/*
1411		 * Make sure this vnode wasn't reclaimed in getnewvnode().
1412		 * Start over if it has (it won't be on the list anymore).
1413		 */
1414		if (vp->v_mount != mp)
1415			goto loop;
1416		nvp = vp->v_mntvnodes.le_next;
1417		/*
1418		 * Skip over a selected vnode.
1419		 */
1420		if (vp == skipvp)
1421			continue;
1422
1423		simple_lock(&vp->v_interlock);
1424		/*
1425		 * Skip over vnodes marked VSYSTEM.
1426		 */
1427		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
1428			simple_unlock(&vp->v_interlock);
1429			continue;
1430		}
1431		/*
1432		 * If WRITECLOSE is set, only flush out regular file vnodes
1433		 * open for writing.
1434		 */
1435		if ((flags & WRITECLOSE) &&
1436		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
1437			simple_unlock(&vp->v_interlock);
1438			continue;
1439		}
1440
1441		/*
1442		 * With v_usecount == 0, all we need to do is clear out the
1443		 * vnode data structures and we are done.
1444		 */
1445		if (vp->v_usecount == 0) {
1446			simple_unlock(&mntvnode_slock);
1447			vgonel(vp, p);
1448			simple_lock(&mntvnode_slock);
1449			continue;
1450		}
1451
1452		/*
1453		 * If FORCECLOSE is set, forcibly close the vnode. For block
1454		 * or character devices, revert to an anonymous device. For
1455		 * all other files, just kill them.
1456		 */
1457		if (flags & FORCECLOSE) {
1458			simple_unlock(&mntvnode_slock);
1459			if (vp->v_type != VBLK && vp->v_type != VCHR) {
1460				vgonel(vp, p);
1461			} else {
1462				vclean(vp, 0, p);
1463				vp->v_op = spec_vnodeop_p;
1464				insmntque(vp, (struct mount *) 0);
1465			}
1466			simple_lock(&mntvnode_slock);
1467			continue;
1468		}
1469#ifdef DIAGNOSTIC
1470		if (busyprt)
1471			vprint("vflush: busy vnode", vp);
1472#endif
1473		simple_unlock(&vp->v_interlock);
1474		busy++;
1475	}
1476	simple_unlock(&mntvnode_slock);
1477	if (busy)
1478		return (EBUSY);
1479	return (0);
1480}
1481
1482/*
1483 * Disassociate the underlying file system from a vnode.
1484 */
1485static void
1486vclean(vp, flags, p)
1487	struct vnode *vp;
1488	int flags;
1489	struct proc *p;
1490{
1491	int active;
1492	vm_object_t obj;
1493
1494	/*
1495	 * Check to see if the vnode is in use. If so we have to reference it
1496	 * before we clean it out so that its count cannot fall to zero and
1497	 * generate a race against ourselves to recycle it.
1498	 */
1499	if ((active = vp->v_usecount))
1500		vp->v_usecount++;
1501
1502	/*
1503	 * Prevent the vnode from being recycled or brought into use while we
1504	 * clean it out.
1505	 */
1506	if (vp->v_flag & VXLOCK)
1507		panic("vclean: deadlock");
1508	vp->v_flag |= VXLOCK;
1509	/*
1510	 * Even if the count is zero, the VOP_INACTIVE routine may still
1511	 * have the object locked while it cleans it out. The VOP_LOCK
1512	 * ensures that the VOP_INACTIVE routine is done with its work.
1513	 * For active vnodes, it ensures that no other activity can
1514	 * occur while the underlying object is being cleaned out.
1515	 */
1516	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
1517
1518	/*
1519	 * Clean out any buffers associated with the vnode.
1520	 */
1521	vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
1522	if ((obj = vp->v_object) != NULL) {
1523		if (obj->ref_count == 0) {
1524			/*
1525			 * This is a normal way of shutting down the object/vnode
1526			 * association.
1527			 */
1528			vm_object_terminate(obj);
1529		} else {
1530			/*
1531			 * Woe to the process that tries to page now :-).
1532			 */
1533			vm_pager_deallocate(obj);
1534		}
1535	}
1536
1537	/*
1538	 * If purging an active vnode, it must be closed and
1539	 * deactivated before being reclaimed. Note that the
1540	 * VOP_INACTIVE will unlock the vnode.
1541	 */
1542	if (active) {
1543		if (flags & DOCLOSE)
1544			VOP_CLOSE(vp, IO_NDELAY, NOCRED, p);
1545		VOP_INACTIVE(vp, p);
1546	} else {
1547		/*
1548		 * Any other processes trying to obtain this lock must first
1549		 * wait for VXLOCK to clear, then call the new lock operation.
1550		 */
1551		VOP_UNLOCK(vp, 0, p);
1552	}
1553	/*
1554	 * Reclaim the vnode.
1555	 */
1556	if (VOP_RECLAIM(vp, p))
1557		panic("vclean: cannot reclaim");
1558
1559	if (active)
1560		vrele(vp);
1561
1562	cache_purge(vp);
1563	if (vp->v_vnlock) {
1564#if 0 /* This is the only place we have LK_DRAINED in the entire kernel ??? */
1565#ifdef DIAGNOSTIC
1566		if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
1567			vprint("vclean: lock not drained", vp);
1568#endif
1569#endif
1570		FREE(vp->v_vnlock, M_VNODE);
1571		vp->v_vnlock = NULL;
1572	}
1573
1574	if (VSHOULDFREE(vp))
1575		vfree(vp);
1576
1577	/*
1578	 * Done with purge, notify sleepers of the grim news.
1579	 */
1580	vp->v_op = dead_vnodeop_p;
1581	vn_pollgone(vp);
1582	vp->v_tag = VT_NON;
1583	vp->v_flag &= ~VXLOCK;
1584	if (vp->v_flag & VXWANT) {
1585		vp->v_flag &= ~VXWANT;
1586		wakeup((caddr_t) vp);
1587	}
1588}
1589
1590/*
1591 * Eliminate all activity associated with the requested vnode
1592 * and with all vnodes aliased to the requested vnode.
1593 */
1594int
1595vop_revoke(ap)
1596	struct vop_revoke_args /* {
1597		struct vnode *a_vp;
1598		int a_flags;
1599	} */ *ap;
1600{
1601	struct vnode *vp, *vq;
1602	struct proc *p = curproc;	/* XXX */
1603
1604#ifdef DIAGNOSTIC
1605	if ((ap->a_flags & REVOKEALL) == 0)
1606		panic("vop_revoke");
1607#endif
1608
1609	vp = ap->a_vp;
1610	simple_lock(&vp->v_interlock);
1611
1612	if (vp->v_flag & VALIASED) {
1613		/*
1614		 * If a vgone (or vclean) is already in progress,
1615		 * wait until it is done and return.
1616		 */
1617		if (vp->v_flag & VXLOCK) {
1618			vp->v_flag |= VXWANT;
1619			simple_unlock(&vp->v_interlock);
1620			tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
1621			return (0);
1622		}
1623		/*
1624		 * Ensure that vp will not be vgone'd while we
1625		 * are eliminating its aliases.
1626		 */
1627		vp->v_flag |= VXLOCK;
1628		simple_unlock(&vp->v_interlock);
1629		while (vp->v_flag & VALIASED) {
1630			simple_lock(&spechash_slock);
1631			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1632				if (vq->v_rdev != vp->v_rdev ||
1633				    vq->v_type != vp->v_type || vp == vq)
1634					continue;
1635				simple_unlock(&spechash_slock);
1636				vgone(vq);
1637				break;
1638			}
1639			if (vq == NULLVP) {
1640				simple_unlock(&spechash_slock);
1641			}
1642		}
1643		/*
1644		 * Remove the lock so that vgone below will
1645		 * really eliminate the vnode after which time
1646		 * vgone will awaken any sleepers.
1647		 */
1648		simple_lock(&vp->v_interlock);
1649		vp->v_flag &= ~VXLOCK;
1650		if (vp->v_flag & VXWANT) {
1651			vp->v_flag &= ~VXWANT;
1652			wakeup(vp);
1653		}
1654	}
1655	vgonel(vp, p);
1656	return (0);
1657}
1658
1659/*
1660 * Recycle an unused vnode to the front of the free list.
1661 * Release the passed interlock if the vnode will be recycled.
1662 */
1663int
1664vrecycle(vp, inter_lkp, p)
1665	struct vnode *vp;
1666	struct simplelock *inter_lkp;
1667	struct proc *p;
1668{
1669
1670	simple_lock(&vp->v_interlock);
1671	if (vp->v_usecount == 0) {
1672		if (inter_lkp) {
1673			simple_unlock(inter_lkp);
1674		}
1675		vgonel(vp, p);
1676		return (1);
1677	}
1678	simple_unlock(&vp->v_interlock);
1679	return (0);
1680}
1681
1682/*
1683 * Eliminate all activity associated with a vnode
1684 * in preparation for reuse.
1685 */
1686void
1687vgone(vp)
1688	register struct vnode *vp;
1689{
1690	struct proc *p = curproc;	/* XXX */
1691
1692	simple_lock(&vp->v_interlock);
1693	vgonel(vp, p);
1694}
1695
1696/*
1697 * vgone, with the vp interlock held.
1698 */
1699static void
1700vgonel(vp, p)
1701	struct vnode *vp;
1702	struct proc *p;
1703{
1704	int s;
1705	struct vnode *vq;
1706	struct vnode *vx;
1707
1708	/*
1709	 * If a vgone (or vclean) is already in progress,
1710	 * wait until it is done and return.
1711	 */
1712	if (vp->v_flag & VXLOCK) {
1713		vp->v_flag |= VXWANT;
1714		simple_unlock(&vp->v_interlock);
1715		tsleep((caddr_t)vp, PINOD, "vgone", 0);
1716		return;
1717	}
1718
1719	/*
1720	 * Clean out the filesystem specific data.
1721	 */
1722	vclean(vp, DOCLOSE, p);
1723	simple_lock(&vp->v_interlock);
1724
1725	/*
1726	 * Delete from old mount point vnode list, if on one.
1727	 */
1728	if (vp->v_mount != NULL)
1729		insmntque(vp, (struct mount *)0);
1730	/*
1731	 * If special device, remove it from special device alias list
1732	 * if it is on one.
1733	 */
1734	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
1735		simple_lock(&spechash_slock);
1736		if (*vp->v_hashchain == vp) {
1737			*vp->v_hashchain = vp->v_specnext;
1738		} else {
1739			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1740				if (vq->v_specnext != vp)
1741					continue;
1742				vq->v_specnext = vp->v_specnext;
1743				break;
1744			}
1745			if (vq == NULL)
1746				panic("missing bdev");
1747		}
1748		if (vp->v_flag & VALIASED) {
1749			vx = NULL;
1750			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1751				if (vq->v_rdev != vp->v_rdev ||
1752				    vq->v_type != vp->v_type)
1753					continue;
1754				if (vx)
1755					break;
1756				vx = vq;
1757			}
1758			if (vx == NULL)
1759				panic("missing alias");
1760			if (vq == NULL)
1761				vx->v_flag &= ~VALIASED;
1762			vp->v_flag &= ~VALIASED;
1763		}
1764		simple_unlock(&spechash_slock);
1765		FREE(vp->v_specinfo, M_VNODE);
1766		vp->v_specinfo = NULL;
1767	}
1768
1769	/*
1770	 * If it is on the freelist and not already at the head,
1771	 * move it to the head of the list. The test of the VDOOMED flag
1772	 * and the reference count of zero is because
1773	 * it will be removed from the free list by getnewvnode,
1774	 * but will not have its reference count incremented until
1775	 * after calling vgone. If the reference count were
1776	 * incremented first, vgone would (incorrectly) try to
1777	 * close the previous instance of the underlying object.
1778	 */
1779	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
1780		s = splbio();
1781		simple_lock(&vnode_free_list_slock);
1782		if (vp->v_flag & VFREE) {
1783			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1784		} else if (vp->v_flag & VTBFREE) {
1785			TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
1786			vp->v_flag &= ~VTBFREE;
1787			freevnodes++;
1788		} else
1789			freevnodes++;
1790		vp->v_flag |= VFREE;
1791		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1792		simple_unlock(&vnode_free_list_slock);
1793		splx(s);
1794	}
1795
1796	vp->v_type = VBAD;
1797	simple_unlock(&vp->v_interlock);
1798}
1799
1800/*
1801 * Lookup a vnode by device number.
1802 */
1803int
1804vfinddev(dev, type, vpp)
1805	dev_t dev;
1806	enum vtype type;
1807	struct vnode **vpp;
1808{
1809	register struct vnode *vp;
1810	int rc = 0;
1811
1812	simple_lock(&spechash_slock);
1813	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
1814		if (dev != vp->v_rdev || type != vp->v_type)
1815			continue;
1816		*vpp = vp;
1817		rc = 1;
1818		break;
1819	}
1820	simple_unlock(&spechash_slock);
1821	return (rc);
1822}
1823
1824/*
1825 * Calculate the total number of references to a special device.
1826 */
1827int
1828vcount(vp)
1829	register struct vnode *vp;
1830{
1831	struct vnode *vq, *vnext;
1832	int count;
1833
1834loop:
1835	if ((vp->v_flag & VALIASED) == 0)
1836		return (vp->v_usecount);
1837	simple_lock(&spechash_slock);
1838	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
1839		vnext = vq->v_specnext;
1840		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
1841			continue;
1842		/*
1843		 * Alias, but not in use, so flush it out.
1844		 */
1845		if (vq->v_usecount == 0 && vq != vp) {
1846			simple_unlock(&spechash_slock);
1847			vgone(vq);
1848			goto loop;
1849		}
1850		count += vq->v_usecount;
1851	}
1852	simple_unlock(&spechash_slock);
1853	return (count);
1854}
1855/*
1856 * Print out a description of a vnode.
1857 */
1858static char *typename[] =
1859{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
1860
1861void
1862vprint(label, vp)
1863	char *label;
1864	register struct vnode *vp;
1865{
1866	char buf[64];
1867
1868	if (label != NULL)
1869		printf("%s: %x: ", label, vp);
1870	else
1871		printf("%x: ", vp);
1872	printf("type %s, usecount %d, writecount %d, refcount %ld,",
1873	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
1874	    vp->v_holdcnt);
1875	buf[0] = '\0';
1876	if (vp->v_flag & VROOT)
1877		strcat(buf, "|VROOT");
1878	if (vp->v_flag & VTEXT)
1879		strcat(buf, "|VTEXT");
1880	if (vp->v_flag & VSYSTEM)
1881		strcat(buf, "|VSYSTEM");
1882	if (vp->v_flag & VXLOCK)
1883		strcat(buf, "|VXLOCK");
1884	if (vp->v_flag & VXWANT)
1885		strcat(buf, "|VXWANT");
1886	if (vp->v_flag & VBWAIT)
1887		strcat(buf, "|VBWAIT");
1888	if (vp->v_flag & VALIASED)
1889		strcat(buf, "|VALIASED");
1890	if (vp->v_flag & VDOOMED)
1891		strcat(buf, "|VDOOMED");
1892	if (vp->v_flag & VFREE)
1893		strcat(buf, "|VFREE");
1894	if (vp->v_flag & VOBJBUF)
1895		strcat(buf, "|VOBJBUF");
1896	if (buf[0] != '\0')
1897		printf(" flags (%s)", &buf[1]);
1898	if (vp->v_data == NULL) {
1899		printf("\n");
1900	} else {
1901		printf("\n\t");
1902		VOP_PRINT(vp);
1903	}
1904}
1905
1906#ifdef DDB
1907/*
1908 * List all of the locked vnodes in the system.
1909 * Called when debugging the kernel.
1910 */
1911static void
1912printlockedvnodes()
1913{
1914	struct proc *p = curproc;	/* XXX */
1915	struct mount *mp, *nmp;
1916	struct vnode *vp;
1917
1918	printf("Locked vnodes\n");
1919	simple_lock(&mountlist_slock);
1920	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
1921		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
1922			nmp = mp->mnt_list.cqe_next;
1923			continue;
1924		}
1925		for (vp = mp->mnt_vnodelist.lh_first;
1926		     vp != NULL;
1927		     vp = vp->v_mntvnodes.le_next) {
1928			if (VOP_ISLOCKED(vp))
1929				vprint((char *)0, vp);
1930		}
1931		simple_lock(&mountlist_slock);
1932		nmp = mp->mnt_list.cqe_next;
1933		vfs_unbusy(mp, p);
1934	}
1935	simple_unlock(&mountlist_slock);
1936}
1937#endif
1938
1939/*
1940 * Top level filesystem related information gathering.
1941 */
1942static int	sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS);
1943
1944static int
1945vfs_sysctl SYSCTL_HANDLER_ARGS
1946{
1947	int *name = (int *)arg1 - 1;	/* XXX */
1948	u_int namelen = arg2 + 1;	/* XXX */
1949	struct vfsconf *vfsp;
1950
1951#ifndef NO_COMPAT_PRELITE2
1952	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
1953	if (namelen == 1)
1954		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
1955#endif
1956
1957#ifdef notyet
1958	/* all sysctl names at this level are at least name and field */
1959	if (namelen < 2)
1960		return (ENOTDIR);		/* overloaded */
1961	if (name[0] != VFS_GENERIC) {
1962		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
1963			if (vfsp->vfc_typenum == name[0])
1964				break;
1965		if (vfsp == NULL)
1966			return (EOPNOTSUPP);
1967		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
1968		    oldp, oldlenp, newp, newlen, p));
1969	}
1970#endif
1971	switch (name[1]) {
1972	case VFS_MAXTYPENUM:
1973		if (namelen != 2)
1974			return (ENOTDIR);
1975		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
1976	case VFS_CONF:
1977		if (namelen != 3)
1978			return (ENOTDIR);	/* overloaded */
1979		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
1980			if (vfsp->vfc_typenum == name[2])
1981				break;
1982		if (vfsp == NULL)
1983			return (EOPNOTSUPP);
1984		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
1985	}
1986	return (EOPNOTSUPP);
1987}
1988
1989SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
1990	"Generic filesystem");
1991
1992#ifndef NO_COMPAT_PRELITE2
1993
1994static int
1995sysctl_ovfs_conf SYSCTL_HANDLER_ARGS
1996{
1997	int error;
1998	struct vfsconf *vfsp;
1999	struct ovfsconf ovfs;
2000
2001	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
2002		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
2003		strcpy(ovfs.vfc_name, vfsp->vfc_name);
2004		ovfs.vfc_index = vfsp->vfc_typenum;
2005		ovfs.vfc_refcount = vfsp->vfc_refcount;
2006		ovfs.vfc_flags = vfsp->vfc_flags;
2007		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2008		if (error)
2009			return error;
2010	}
2011	return 0;
2012}
2013
2014#endif /* !NO_COMPAT_PRELITE2 */
2015
2016static volatile int kinfo_vdebug = 1;
2017
2018#if 0
2019#define KINFO_VNODESLOP	10
2020/*
2021 * Dump vnode list (via sysctl).
2022 * Copyout address of vnode followed by vnode.
2023 */
2024/* ARGSUSED */
2025static int
2026sysctl_vnode SYSCTL_HANDLER_ARGS
2027{
2028	struct proc *p = curproc;	/* XXX */
2029	struct mount *mp, *nmp;
2030	struct vnode *nvp, *vp;
2031	int error;
2032
2033#define VPTRSZ	sizeof (struct vnode *)
2034#define VNODESZ	sizeof (struct vnode)
2035
2036	req->lock = 0;
2037	if (!req->oldptr) /* Make an estimate */
2038		return (SYSCTL_OUT(req, 0,
2039			(numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
2040
2041	simple_lock(&mountlist_slock);
2042	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
2043		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
2044			nmp = mp->mnt_list.cqe_next;
2045			continue;
2046		}
2047again:
2048		simple_lock(&mntvnode_slock);
2049		for (vp = mp->mnt_vnodelist.lh_first;
2050		     vp != NULL;
2051		     vp = nvp) {
2052			/*
2053			 * Check that the vp is still associated with
2054			 * this filesystem.  RACE: could have been
2055			 * recycled onto the same filesystem.
2056			 */
2057			if (vp->v_mount != mp) {
2058				simple_unlock(&mntvnode_slock);
2059				if (kinfo_vdebug)
2060					printf("kinfo: vp changed\n");
2061				goto again;
2062			}
2063			nvp = vp->v_mntvnodes.le_next;
2064			simple_unlock(&mntvnode_slock);
2065			if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
2066			    (error = SYSCTL_OUT(req, vp, VNODESZ)))
2067				return (error);
2068			simple_lock(&mntvnode_slock);
2069		}
2070		simple_unlock(&mntvnode_slock);
2071		simple_lock(&mountlist_slock);
2072		nmp = mp->mnt_list.cqe_next;
2073		vfs_unbusy(mp, p);
2074	}
2075	simple_unlock(&mountlist_slock);
2076
2077	return (0);
2078}
2079#endif
2080
2081/*
2082 * XXX
2083 * Exporting the vnode list on large systems causes them to crash.
2084 * Exporting the vnode list on medium systems causes sysctl to coredump.
2085 */
2086#if 0
2087SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
2088	0, 0, sysctl_vnode, "S,vnode", "");
2089#endif
2090
2091/*
2092 * Check to see if a filesystem is mounted on a block device.
2093 */
2094int
2095vfs_mountedon(vp)
2096	struct vnode *vp;
2097{
2098	struct vnode *vq;
2099	int error = 0;
2100
2101	if (vp->v_specmountpoint != NULL)
2102		return (EBUSY);
2103	if (vp->v_flag & VALIASED) {
2104		simple_lock(&spechash_slock);
2105		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2106			if (vq->v_rdev != vp->v_rdev ||
2107			    vq->v_type != vp->v_type)
2108				continue;
2109			if (vq->v_specmountpoint != NULL) {
2110				error = EBUSY;
2111				break;
2112			}
2113		}
2114		simple_unlock(&spechash_slock);
2115	}
2116	return (error);
2117}
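/*
 * A typical caller checks the device vnode before mounting on it; a
 * filesystem's mountfs routine, for instance, might do something like
 * the following sketch ("devvp" and "error" are placeholder names):
 *
 *	if ((error = vfs_mountedon(devvp)) != 0)
 *		return (error);
 */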
2118
2119/*
2120 * Unmount all filesystems. The list is traversed in reverse order
2121 * of mounting to avoid dependencies.
2122 */
2123void
2124vfs_unmountall()
2125{
2126	struct mount *mp, *nmp;
2127	struct proc *p = initproc;	/* XXX XXX should this be proc0? */
2128	int error;
2129
2130	/*
2131	 * Since this only runs when rebooting, it is not interlocked.
2132	 */
2133	for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
2134		nmp = mp->mnt_list.cqe_prev;
2135		error = dounmount(mp, MNT_FORCE, p);
2136		if (error) {
2137			printf("unmount of %s failed (",
2138			    mp->mnt_stat.f_mntonname);
2139			if (error == EBUSY)
2140				printf("BUSY)\n");
2141			else
2142				printf("%d)\n", error);
2143		}
2144	}
2145}
2146
2147/*
2148 * Build hash lists of net addresses and hang them off the mount point.
2149 * Called by vfs_export() to set up the lists of export addresses.
2150 */
2151static int
2152vfs_hang_addrlist(mp, nep, argp)
2153	struct mount *mp;
2154	struct netexport *nep;
2155	struct export_args *argp;
2156{
2157	register struct netcred *np;
2158	register struct radix_node_head *rnh;
2159	register int i;
2160	struct radix_node *rn;
2161	struct sockaddr *saddr, *smask = 0;
2162	struct domain *dom;
2163	int error;
2164
2165	if (argp->ex_addrlen == 0) {
2166		if (mp->mnt_flag & MNT_DEFEXPORTED)
2167			return (EPERM);
2168		np = &nep->ne_defexported;
2169		np->netc_exflags = argp->ex_flags;
2170		np->netc_anon = argp->ex_anon;
2171		np->netc_anon.cr_ref = 1;
2172		mp->mnt_flag |= MNT_DEFEXPORTED;
2173		return (0);
2174	}
2175	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
2176	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
2177	bzero((caddr_t) np, i);
2178	saddr = (struct sockaddr *) (np + 1);
2179	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
2180		goto out;
2181	if (saddr->sa_len > argp->ex_addrlen)
2182		saddr->sa_len = argp->ex_addrlen;
2183	if (argp->ex_masklen) {
2184		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
2185		error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
2186		if (error)
2187			goto out;
2188		if (smask->sa_len > argp->ex_masklen)
2189			smask->sa_len = argp->ex_masklen;
2190	}
2191	i = saddr->sa_family;
2192	if ((rnh = nep->ne_rtable[i]) == 0) {
2193		/*
2194		 * It seems silly to initialize every AF when most are not used;
2195		 * do it on demand here.
2196		 */
2197		for (dom = domains; dom; dom = dom->dom_next)
2198			if (dom->dom_family == i && dom->dom_rtattach) {
2199				dom->dom_rtattach((void **) &nep->ne_rtable[i],
2200				    dom->dom_rtoffset);
2201				break;
2202			}
2203		if ((rnh = nep->ne_rtable[i]) == 0) {
2204			error = ENOBUFS;
2205			goto out;
2206		}
2207	}
2208	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
2209	    np->netc_rnodes);
2210	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
2211		error = EPERM;
2212		goto out;
2213	}
2214	np->netc_exflags = argp->ex_flags;
2215	np->netc_anon = argp->ex_anon;
2216	np->netc_anon.cr_ref = 1;
2217	return (0);
2218out:
2219	free(np, M_NETADDR);
2220	return (error);
2221}
2222
2223/* ARGSUSED */
2224static int
2225vfs_free_netcred(rn, w)
2226	struct radix_node *rn;
2227	void *w;
2228{
2229	register struct radix_node_head *rnh = (struct radix_node_head *) w;
2230
2231	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
2232	free((caddr_t) rn, M_NETADDR);
2233	return (0);
2234}
2235
2236/*
2237 * Free the net address hash lists that are hanging off the mount points.
2238 */
2239static void
2240vfs_free_addrlist(nep)
2241	struct netexport *nep;
2242{
2243	register int i;
2244	register struct radix_node_head *rnh;
2245
2246	for (i = 0; i <= AF_MAX; i++)
2247		if ((rnh = nep->ne_rtable[i])) {
2248			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
2249			    (caddr_t) rnh);
2250			free((caddr_t) rnh, M_RTABLE);
2251			nep->ne_rtable[i] = 0;
2252		}
2253}
2254
2255int
2256vfs_export(mp, nep, argp)
2257	struct mount *mp;
2258	struct netexport *nep;
2259	struct export_args *argp;
2260{
2261	int error;
2262
2263	if (argp->ex_flags & MNT_DELEXPORT) {
2264		if (mp->mnt_flag & MNT_EXPUBLIC) {
2265			vfs_setpublicfs(NULL, NULL, NULL);
2266			mp->mnt_flag &= ~MNT_EXPUBLIC;
2267		}
2268		vfs_free_addrlist(nep);
2269		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
2270	}
2271	if (argp->ex_flags & MNT_EXPORTED) {
2272		if (argp->ex_flags & MNT_EXPUBLIC) {
2273			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
2274				return (error);
2275			mp->mnt_flag |= MNT_EXPUBLIC;
2276		}
2277		if ((error = vfs_hang_addrlist(mp, nep, argp)))
2278			return (error);
2279		mp->mnt_flag |= MNT_EXPORTED;
2280	}
2281	return (0);
2282}
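/*
 * Individual filesystems reach vfs_export() from their mount routines
 * when an MNT_UPDATE request carries export information rather than a
 * new special device.  A UFS-style caller might look roughly like this
 * sketch (names such as "args" and "ump" are placeholders):
 *
 *	if (mp->mnt_flag & MNT_UPDATE) {
 *		...
 *		if (args.fspec == 0)
 *			return (vfs_export(mp, &ump->um_export, &args.export));
 *	}
 */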
2283
2284
2285/*
2286 * Set the publicly exported filesystem (WebNFS). Currently, only
2287 * one public filesystem is allowed per the spec (RFC 2054 and RFC 2055).
2288 */
2289int
2290vfs_setpublicfs(mp, nep, argp)
2291	struct mount *mp;
2292	struct netexport *nep;
2293	struct export_args *argp;
2294{
2295	int error;
2296	struct vnode *rvp;
2297	char *cp;
2298
2299	/*
2300	 * mp == NULL -> invalidate the current info, the FS is
2301	 * no longer exported. May be called from either vfs_export
2302	 * or unmount, so check if it hasn't already been done.
2303	 */
2304	if (mp == NULL) {
2305		if (nfs_pub.np_valid) {
2306			nfs_pub.np_valid = 0;
2307			if (nfs_pub.np_index != NULL) {
2308				FREE(nfs_pub.np_index, M_TEMP);
2309				nfs_pub.np_index = NULL;
2310			}
2311		}
2312		return (0);
2313	}
2314
2315	/*
2316	 * Only one allowed at a time.
2317	 */
2318	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
2319		return (EBUSY);
2320
2321	/*
2322	 * Get real filehandle for root of exported FS.
2323	 */
2324	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
2325	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
2326
2327	if ((error = VFS_ROOT(mp, &rvp)))
2328		return (error);
2329
2330	error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid);
2331	vput(rvp);
2332	if (error)
2333		return (error);
2334
2335	/*
2336	 * If an indexfile was specified, pull it in.
2337	 */
2338	if (argp->ex_indexfile != NULL) {
2339		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
2340		    M_WAITOK);
2341		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
2342		    MAXNAMLEN, (size_t *)0);
2343		if (!error) {
2344			/*
2345			 * Check for illegal filenames.
2346			 */
2347			for (cp = nfs_pub.np_index; *cp; cp++) {
2348				if (*cp == '/') {
2349					error = EINVAL;
2350					break;
2351				}
2352			}
2353		}
2354		if (error) {
2355			FREE(nfs_pub.np_index, M_TEMP);
2356			return (error);
2357		}
2358	}
2359
2360	nfs_pub.np_mount = mp;
2361	nfs_pub.np_valid = 1;
2362	return (0);
2363}
2364
2365struct netcred *
2366vfs_export_lookup(mp, nep, nam)
2367	register struct mount *mp;
2368	struct netexport *nep;
2369	struct sockaddr *nam;
2370{
2371	register struct netcred *np;
2372	register struct radix_node_head *rnh;
2373	struct sockaddr *saddr;
2374
2375	np = NULL;
2376	if (mp->mnt_flag & MNT_EXPORTED) {
2377		/*
2378		 * Lookup in the export list first.
2379		 */
2380		if (nam != NULL) {
2381			saddr = nam;
2382			rnh = nep->ne_rtable[saddr->sa_family];
2383			if (rnh != NULL) {
2384				np = (struct netcred *)
2385					(*rnh->rnh_matchaddr)((caddr_t)saddr,
2386							      rnh);
2387				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
2388					np = NULL;
2389			}
2390		}
2391		/*
2392		 * If no address match, use the default if it exists.
2393		 */
2394		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
2395			np = &nep->ne_defexported;
2396	}
2397	return (np);
2398}
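/*
 * This is normally reached from a filesystem's fhtovp routine on behalf
 * of the NFS server: the client's socket address is passed as "nam",
 * and a NULL return typically causes the request to be refused
 * (e.g. with EACCES).
 */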
2399
2400/*
2401 * Perform msync on all vnodes under a mount point.
2402 * The mount point must be locked.
2403 */
2404void
2405vfs_msync(struct mount *mp, int flags) {
2406	struct vnode *vp, *nvp;
2407	int anyio, tries;
2408
2409	tries = 5;
2410loop:
2411	anyio = 0;
2412	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {
2413
2414		nvp = vp->v_mntvnodes.le_next;
2415
2416		if (vp->v_mount != mp) {
2417			goto loop;
2418		}
2419
2420		if ((vp->v_flag & VXLOCK) ||
2421			(VOP_ISLOCKED(vp) && (flags != MNT_WAIT))) {
2422			continue;
2423		}
2424
2425		simple_lock(&vp->v_interlock);
2426		if (vp->v_object &&
2427		   (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
2428			if (!vget(vp,
2429				LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
2430				if (vp->v_object) {
2431					vm_object_page_clean(vp->v_object, 0, 0, TRUE);
2432					anyio = 1;
2433				}
2434				vput(vp);
2435			}
2436		} else {
2437			simple_unlock(&vp->v_interlock);
2438		}
2439	}
2440	if (anyio && (--tries > 0))
2441		goto loop;
2442}
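/*
 * Note on the flags argument above: with MNT_WAIT, locked vnodes are
 * not skipped (vget() will wait for them); with any other value, a
 * vnode that is already locked is passed over so the pass does not
 * block.
 */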
2443
2444/*
2445 * Create the VM object needed for VMIO and mmap support.  This
2446 * is done for all VREG files in the system.  Some filesystems may
2447 * also take advantage of the additional metadata buffering that the
2448 * VMIO code provides by putting their device node into VMIO mode.
2449 *
2450 * If !waslocked, must be called with interlock.
2451 */
2452int
2453vfs_object_create(vp, p, cred, waslocked)
2454	struct vnode *vp;
2455	struct proc *p;
2456	struct ucred *cred;
2457	int waslocked;
2458{
2459	struct vattr vat;
2460	vm_object_t object;
2461	int error = 0;
2462
2463	if ((vp->v_type != VREG) && (vp->v_type != VBLK)) {
2464		if (!waslocked)
2465			simple_unlock(&vp->v_interlock);
2466		return 0;
2467	}
2468
2469	if (!waslocked)
2470		vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY, p);
2471
2472retry:
2473	if ((object = vp->v_object) == NULL) {
2474		if (vp->v_type == VREG) {
2475			if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
2476				goto retn;
2477			object = vnode_pager_alloc(vp,
2478				OFF_TO_IDX(round_page(vat.va_size)), 0, 0);
2479		} else if (major(vp->v_rdev) < nblkdev) {
2480			/*
2481			 * This simply allocates the biggest object possible
2482			 * for a VBLK vnode.  This should be fixed, but doesn't
2483			 * cause any problems (yet).
2484			 */
2485			object = vnode_pager_alloc(vp, INT_MAX, 0, 0);
2486		}
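		/*
		 * XXX if neither branch above ran (a VBLK vnode whose major
		 * number is out of range), "object" is still NULL here and
		 * the dereference below would panic.
		 */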
2487		object->ref_count--;
2488		vp->v_usecount--;
2489	} else {
2490		if (object->flags & OBJ_DEAD) {
2491			VOP_UNLOCK(vp, 0, p);
2492			tsleep(object, PVM, "vodead", 0);
2493			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
2494			goto retry;
2495		}
2496	}
2497
2498	if (vp->v_object) {
2499		vp->v_flag |= VOBJBUF;
2500	}
2501
2502retn:
2503	if (!waslocked) {
2504		simple_lock(&vp->v_interlock);
2505		VOP_UNLOCK(vp, LK_INTERLOCK, p);
2506	}
2507
2508	return error;
2509}
2510
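/*
 * Move a vnode onto the free list, taking it off the to-be-freed list
 * first if necessary.  Aged vnodes (VAGE) go to the head of the free
 * list so they are recycled sooner; everything else goes to the tail.
 */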
2511static void
2512vfree(vp)
2513	struct vnode *vp;
2514{
2515	int s;
2516
2517	s = splbio();
2518	simple_lock(&vnode_free_list_slock);
2519	if (vp->v_flag & VTBFREE) {
2520		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
2521		vp->v_flag &= ~VTBFREE;
2522	}
2523	if (vp->v_flag & VAGE) {
2524		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2525	} else {
2526		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
2527	}
2528	freevnodes++;
2529	simple_unlock(&vnode_free_list_slock);
2530	vp->v_flag &= ~VAGE;
2531	vp->v_flag |= VFREE;
2532	splx(s);
2533}
2534
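/*
 * Take a vnode off the free list (or the to-be-freed list) because it
 * is coming back into use, clearing VFREE and VAGE.
 */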
2535void
2536vbusy(vp)
2537	struct vnode *vp;
2538{
2539	int s;
2540
2541	s = splbio();
2542	simple_lock(&vnode_free_list_slock);
2543	if (vp->v_flag & VTBFREE) {
2544		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
2545		vp->v_flag &= ~VTBFREE;
2546	} else {
2547		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2548		freevnodes--;
2549	}
2550	simple_unlock(&vnode_free_list_slock);
2551	vp->v_flag &= ~(VFREE|VAGE);
2552	splx(s);
2553}
2554
2555/*
2556 * Record a process's interest in events which might happen to
2557 * a vnode.  Because poll uses the historic select-style interface
2558 * internally, this routine serves as both the ``check for any
2559 * pending events'' and the ``record my interest in future events''
2560 * functions.  (These are done together, while the lock is held,
2561 * to avoid race conditions.)
2562 */
2563int
2564vn_pollrecord(vp, p, events)
2565	struct vnode *vp;
2566	struct proc *p;
2567	short events;
2568{
2569	simple_lock(&vp->v_pollinfo.vpi_lock);
2570	if (vp->v_pollinfo.vpi_revents & events) {
2571		/*
2572		 * This leaves events we are not interested
2573		 * in available for the other process which
2574			 * presumably had requested them
2575		 * (otherwise they would never have been
2576		 * recorded).
2577		 */
2578		events &= vp->v_pollinfo.vpi_revents;
2579		vp->v_pollinfo.vpi_revents &= ~events;
2580
2581		simple_unlock(&vp->v_pollinfo.vpi_lock);
2582		return events;
2583	}
2584	vp->v_pollinfo.vpi_events |= events;
2585	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
2586	simple_unlock(&vp->v_pollinfo.vpi_lock);
2587	return 0;
2588}
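/*
 * A VOP_POLL implementation that keeps its poll state on the vnode can
 * usually just defer to this routine, e.g. (sketch):
 *
 *	return (vn_pollrecord(ap->a_vp, ap->a_p, ap->a_events));
 */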
2589
2590/*
2591 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
2592 * it is possible for us to miss an event due to race conditions, but
2593 * that condition is expected to be rare, so for the moment it is the
2594 * preferred interface.
2595 */
2596void
2597vn_pollevent(vp, events)
2598	struct vnode *vp;
2599	short events;
2600{
2601	simple_lock(&vp->v_pollinfo.vpi_lock);
2602	if (vp->v_pollinfo.vpi_events & events) {
2603		/*
2604		 * We clear vpi_events so that we don't
2605		 * call selwakeup() twice if two events are
2606		 * posted before the polling process(es) is
2607		 * awakened.  This also ensures that we take at
2608		 * most one selwakeup() if the polling process
2609		 * is no longer interested.  However, it does
2610		 * mean that only one event can be noticed at
2611		 * a time.  (Perhaps we should only clear those
2612		 * event bits which we note?) XXX
2613		 */
2614		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
2615		vp->v_pollinfo.vpi_revents |= events;
2616		selwakeup(&vp->v_pollinfo.vpi_selinfo);
2617	}
2618	simple_unlock(&vp->v_pollinfo.vpi_lock);
2619}
2620
2621/*
2622 * Wake up anyone polling on vp because it is being revoked.
2623 * This depends on dead_poll() returning POLLHUP for correct
2624 * behavior.
2625 */
2626void
2627vn_pollgone(vp)
2628	struct vnode *vp;
2629{
2630	simple_lock(&vp->v_pollinfo.vpi_lock);
2631	if (vp->v_pollinfo.vpi_events) {
2632		vp->v_pollinfo.vpi_events = 0;
2633		selwakeup(&vp->v_pollinfo.vpi_selinfo);
2634	}
2635	simple_unlock(&vp->v_pollinfo.vpi_lock);
2636}
2637
2638
2639
2640/*
2641 * Routine to create and manage a filesystem syncer vnode.
2642 */
2643#define sync_close ((int (*) __P((struct  vop_close_args *)))nullop)
2644int	sync_fsync __P((struct  vop_fsync_args *));
2645int	sync_inactive __P((struct  vop_inactive_args *));
2646int	sync_reclaim  __P((struct  vop_reclaim_args *));
2647#define sync_lock ((int (*) __P((struct  vop_lock_args *)))vop_nolock)
2648#define sync_unlock ((int (*) __P((struct  vop_unlock_args *)))vop_nounlock)
2649int	sync_print __P((struct vop_print_args *));
2650#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)
2651
2652vop_t **sync_vnodeop_p;
2653struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
2654	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
2655	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
2656	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
2657	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
2658	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
2659	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
2660	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
2661	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
2662	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
2663	{ NULL, NULL }
2664};
2665struct vnodeopv_desc sync_vnodeop_opv_desc =
2666	{ &sync_vnodeop_p, sync_vnodeop_entries };
2667
2668VNODEOP_SET(sync_vnodeop_opv_desc);
2669
2670/*
2671 * Create a new filesystem syncer vnode for the specified mount point.
2672 */
2673int
2674vfs_allocate_syncvnode(mp)
2675	struct mount *mp;
2676{
2677	struct vnode *vp;
2678	static long start, incr, next;
2679	int error;
2680
2681	/* Allocate a new vnode */
2682	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
2683		mp->mnt_syncer = NULL;
2684		return (error);
2685	}
2686	vp->v_type = VNON;
2687	/*
2688	 * Place the vnode onto the syncer worklist. We attempt to
2689	 * scatter them about on the list so that they will go off
2690	 * at evenly distributed times even if all the filesystems
2691	 * are mounted at once.
2692	 */
2693	next += incr;
2694	if (next == 0 || next > syncer_maxdelay) {
2695		start /= 2;
2696		incr /= 2;
2697		if (start == 0) {
2698			start = syncer_maxdelay / 2;
2699			incr = syncer_maxdelay;
2700		}
2701		next = start;
2702	}
2703	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
2704	mp->mnt_syncer = vp;
2705	return (0);
2706}
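/*
 * For example, if syncer_maxdelay were 32, successive calls above would
 * pick next = 16, 8, 24, 4, 12, 20, 28, 2, ..., repeatedly bisecting
 * the delay range so that the syncer vnodes fire at well spread-out
 * times.
 */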
2707
2708/*
2709 * Do a lazy sync of the filesystem.
2710 */
2711int
2712sync_fsync(ap)
2713	struct vop_fsync_args /* {
2714		struct vnode *a_vp;
2715		struct ucred *a_cred;
2716		int a_waitfor;
2717		struct proc *a_p;
2718	} */ *ap;
2719{
2720	struct vnode *syncvp = ap->a_vp;
2721	struct mount *mp = syncvp->v_mount;
2722	struct proc *p = ap->a_p;
2723	int asyncflag;
2724
2725	/*
2726	 * We only need to do something if this is a lazy evaluation.
2727	 */
2728	if (ap->a_waitfor != MNT_LAZY)
2729		return (0);
2730
2731	/*
2732	 * Move ourselves to the back of the sync list.
2733	 */
2734	vn_syncer_add_to_worklist(syncvp, syncdelay);
2735
2736	/*
2737	 * Walk the list of vnodes pushing all that are dirty and
2738	 * not already on the sync list.
2739	 */
2740	simple_lock(&mountlist_slock);
2741	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0)
2742		return (0);
2743	asyncflag = mp->mnt_flag & MNT_ASYNC;
2744	mp->mnt_flag &= ~MNT_ASYNC;
2745	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
2746	if (asyncflag)
2747		mp->mnt_flag |= MNT_ASYNC;
2748	vfs_unbusy(mp, p);
2749	return (0);
2750}
2751
2752/*
2753 * The syncer vnode is no longer referenced.
2754 */
2755int
2756sync_inactive(ap)
2757	struct vop_inactive_args /* {
2758		struct vnode *a_vp;
2759		struct proc *a_p;
2760	} */ *ap;
2761{
2762
2763	vgone(ap->a_vp);
2764	return (0);
2765}
2766
2767/*
2768 * The syncer vnode is no longer needed and is being decommissioned.
2769 */
2770int
2771sync_reclaim(ap)
2772	struct vop_reclaim_args /* {
2773		struct vnode *a_vp;
2774	} */ *ap;
2775{
2776	struct vnode *vp = ap->a_vp;
2777
2778	vp->v_mount->mnt_syncer = NULL;
2779	if (vp->v_flag & VONWORKLST) {
2780		LIST_REMOVE(vp, v_synclist);
2781		vp->v_flag &= ~VONWORKLST;
2782	}
2783
2784	return (0);
2785}
2786
2787/*
2788 * Print out a syncer vnode.
2789 */
2790int
2791sync_print(ap)
2792	struct vop_print_args /* {
2793		struct vnode *a_vp;
2794	} */ *ap;
2795{
2796	struct vnode *vp = ap->a_vp;
2797
2798	printf("syncer vnode");
2799	if (vp->v_vnlock != NULL)
2800		lockmgr_printinfo(vp->v_vnlock);
2801	printf("\n");
2802	return (0);
2803}
2804