vfs_export.c revision 48391
1/*
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
39 * $Id: vfs_subr.c,v 1.203 1999/06/26 02:46:10 mckusick Exp $
40 */
41
42/*
43 * External virtual filesystem routines
44 */
45#include "opt_ddb.h"
46
47#include <sys/param.h>
48#include <sys/systm.h>
49#include <sys/conf.h>
50#include <sys/fcntl.h>
51#include <sys/kernel.h>
52#include <sys/proc.h>
53#include <sys/kthread.h>
54#include <sys/malloc.h>
55#include <sys/mount.h>
56#include <sys/socket.h>
57#include <sys/vnode.h>
58#include <sys/stat.h>
59#include <sys/buf.h>
60#include <sys/domain.h>
61#include <sys/dirent.h>
62#include <sys/vmmeter.h>
63
64#include <machine/limits.h>
65
66#include <vm/vm.h>
67#include <vm/vm_param.h>
68#include <vm/vm_prot.h>
69#include <vm/vm_object.h>
70#include <vm/vm_extern.h>
71#include <vm/pmap.h>
72#include <vm/vm_map.h>
73#include <vm/vm_page.h>
74#include <vm/vm_pager.h>
75#include <vm/vnode_pager.h>
76#include <vm/vm_zone.h>
77#include <sys/sysctl.h>
78
79#include <miscfs/specfs/specdev.h>
80
81static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
82
83static void	insmntque __P((struct vnode *vp, struct mount *mp));
84static void	vclean __P((struct vnode *vp, int flags, struct proc *p));
85static void	vfree __P((struct vnode *));
86static void	vgonel __P((struct vnode *vp, struct proc *p));
87static unsigned long	numvnodes;
88SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
89
90enum vtype iftovt_tab[16] = {
91	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
92	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
93};
94int vttoif_tab[9] = {
95	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
96	S_IFSOCK, S_IFIFO, S_IFMT,
97};
98
99static TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */
100struct tobefreelist vnode_tobefree_list;	/* vnodes queued to be freed */
101
102static u_long wantfreevnodes = 25;
103SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
104static u_long freevnodes = 0;
105SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
106
107int vfs_ioopt = 0;
108#ifdef ENABLE_VFS_IOOPT
109SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
110#endif
111
112struct mntlist mountlist;	/* mounted filesystem list */
113struct simplelock mountlist_slock;
114struct simplelock mntvnode_slock;
115int	nfs_mount_type = -1;
116#ifndef NULL_SIMPLELOCKS
117static struct simplelock mntid_slock;
118static struct simplelock vnode_free_list_slock;
119static struct simplelock spechash_slock;
120#endif
121struct nfs_public nfs_pub;	/* publicly exported FS */
122static vm_zone_t vnode_zone;
123
124/*
125 * The workitem queue.
126 */
127#define SYNCER_MAXDELAY		32
128static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
129time_t syncdelay = 30;		/* max time to delay syncing data */
130time_t filedelay = 30;		/* time to delay syncing files */
131SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
132time_t dirdelay = 29;		/* time to delay syncing directories */
133SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
134time_t metadelay = 28;		/* time to delay syncing metadata */
135SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
136static int rushjob;			/* number of slots to run ASAP */
137static int stat_rush_requests;	/* number of times I/O speeded up */
138SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
139
140static int syncer_delayno = 0;
141static long syncer_mask;
142LIST_HEAD(synclist, vnode);
143static struct synclist *syncer_workitem_pending;
144
145int desiredvnodes;
146SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
147    &desiredvnodes, 0, "Maximum number of vnodes");
148
149static void	vfs_free_addrlist __P((struct netexport *nep));
150static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
151static int	vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
152				       struct export_args *argp));
153
154/*
155 * Initialize the vnode management data structures.
156 */
157void
158vntblinit()
159{
160
161	desiredvnodes = maxproc + cnt.v_page_count / 4;
162	simple_lock_init(&mntvnode_slock);
163	simple_lock_init(&mntid_slock);
164	simple_lock_init(&spechash_slock);
165	TAILQ_INIT(&vnode_free_list);
166	TAILQ_INIT(&vnode_tobefree_list);
167	simple_lock_init(&vnode_free_list_slock);
168	CIRCLEQ_INIT(&mountlist);
169	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
170	/*
171	 * Initialize the filesystem syncer.
172	 */
173	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
174		&syncer_mask);
175	syncer_maxdelay = syncer_mask + 1;
176}
177
178/*
179 * Mark a mount point as busy. Used to synchronize access and to delay
180 * unmounting. Interlock is not released on failure.
181 */
182int
183vfs_busy(mp, flags, interlkp, p)
184	struct mount *mp;
185	int flags;
186	struct simplelock *interlkp;
187	struct proc *p;
188{
189	int lkflags;
190
191	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
192		if (flags & LK_NOWAIT)
193			return (ENOENT);
194		mp->mnt_kern_flag |= MNTK_MWAIT;
195		if (interlkp) {
196			simple_unlock(interlkp);
197		}
198		/*
199		 * Since all busy locks are shared except the exclusive
200		 * lock granted when unmounting, the only place that a
201		 * wakeup needs to be done is at the release of the
202		 * exclusive lock at the end of dounmount.
203		 */
204		tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
205		if (interlkp) {
206			simple_lock(interlkp);
207		}
208		return (ENOENT);
209	}
210	lkflags = LK_SHARED | LK_NOPAUSE;
211	if (interlkp)
212		lkflags |= LK_INTERLOCK;
213	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
214		panic("vfs_busy: unexpected lock failure");
215	return (0);
216}
217
218/*
219 * Free a busy filesystem.
220 */
221void
222vfs_unbusy(mp, p)
223	struct mount *mp;
224	struct proc *p;
225{
226
227	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
228}
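
/*
 * Illustrative sketch (not part of the original file): the usual caller
 * pattern for vfs_busy()/vfs_unbusy() when walking the mount list.  It
 * mirrors the traversal used by the DDB and sysctl code further below.
 */
#if 0
	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = mp->mnt_list.cqe_next;
			continue;
		}
		/* ... work on the busied mount point ... */
		simple_lock(&mountlist_slock);
		nmp = mp->mnt_list.cqe_next;
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);
#endif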
229
230/*
231 * Lookup a filesystem type, and if found allocate and initialize
232 * a mount structure for it.
233 *
234 * Devname is usually updated by mount(8) after booting.
235 */
236int
237vfs_rootmountalloc(fstypename, devname, mpp)
238	char *fstypename;
239	char *devname;
240	struct mount **mpp;
241{
242	struct proc *p = curproc;	/* XXX */
243	struct vfsconf *vfsp;
244	struct mount *mp;
245
246	if (fstypename == NULL)
247		return (ENODEV);
248	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
249		if (!strcmp(vfsp->vfc_name, fstypename))
250			break;
251	if (vfsp == NULL)
252		return (ENODEV);
253	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
254	bzero((char *)mp, (u_long)sizeof(struct mount));
255	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
256	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
257	LIST_INIT(&mp->mnt_vnodelist);
258	mp->mnt_vfc = vfsp;
259	mp->mnt_op = vfsp->vfc_vfsops;
260	mp->mnt_flag = MNT_RDONLY;
261	mp->mnt_vnodecovered = NULLVP;
262	vfsp->vfc_refcount++;
263	mp->mnt_stat.f_type = vfsp->vfc_typenum;
264	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
265	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
266	mp->mnt_stat.f_mntonname[0] = '/';
267	mp->mnt_stat.f_mntonname[1] = 0;
268	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
269	*mpp = mp;
270	return (0);
271}
272
273/*
274 * Find an appropriate filesystem to use for the root. If a filesystem
275 * has not been preselected, walk through the list of known filesystems
276 * trying those that have mountroot routines, and try them until one
277 * works or we have tried them all.
278 */
279#ifdef notdef	/* XXX JH */
280int
281lite2_vfs_mountroot()
282{
283	struct vfsconf *vfsp;
284	extern int (*lite2_mountroot) __P((void));
285	int error;
286
287	if (lite2_mountroot != NULL)
288		return ((*lite2_mountroot)());
289	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
290		if (vfsp->vfc_mountroot == NULL)
291			continue;
292		if ((error = (*vfsp->vfc_mountroot)()) == 0)
293			return (0);
294		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
295	}
296	return (ENODEV);
297}
298#endif
299
300/*
301 * Lookup a mount point by filesystem identifier.
302 */
303struct mount *
304vfs_getvfs(fsid)
305	fsid_t *fsid;
306{
307	register struct mount *mp;
308
309	simple_lock(&mountlist_slock);
310	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
311	    mp = mp->mnt_list.cqe_next) {
312		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
313		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
314			simple_unlock(&mountlist_slock);
315			return (mp);
316		}
317	}
318	simple_unlock(&mountlist_slock);
319	return ((struct mount *) 0);
320}
321
322/*
323 * Get a new unique fsid
324 */
325void
326vfs_getnewfsid(mp)
327	struct mount *mp;
328{
329	static u_short xxxfs_mntid;
330
331	fsid_t tfsid;
332	int mtype;
333
334	simple_lock(&mntid_slock);
335	mtype = mp->mnt_vfc->vfc_typenum;
336	mp->mnt_stat.f_fsid.val[0] = (256 + mtype) * 256;
337	mp->mnt_stat.f_fsid.val[1] = mtype;
338	if (xxxfs_mntid == 0)
339		++xxxfs_mntid;
340	tfsid.val[0] = (256 + mtype) * 256 | xxxfs_mntid;
341	tfsid.val[1] = mtype;
342	if (mountlist.cqh_first != (void *)&mountlist) {
343		while (vfs_getvfs(&tfsid)) {
344			tfsid.val[0]++;
345			xxxfs_mntid++;
346		}
347	}
348	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
349	simple_unlock(&mntid_slock);
350}
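
/*
 * Worked example (illustrative, not in the original): for a filesystem
 * whose vfc_typenum is 5 and a mount id of 1, the candidate fsid is
 * val[0] = (256 + 5) * 256 | 1 = 0x10501 and val[1] = 5; val[0] is then
 * bumped until no currently mounted filesystem already uses it.
 */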
351
352/*
353 * Set vnode attributes to VNOVAL
354 */
355void
356vattr_null(vap)
357	register struct vattr *vap;
358{
359
360	vap->va_type = VNON;
361	vap->va_size = VNOVAL;
362	vap->va_bytes = VNOVAL;
363	vap->va_mode = VNOVAL;
364	vap->va_nlink = VNOVAL;
365	vap->va_uid = VNOVAL;
366	vap->va_gid = VNOVAL;
367	vap->va_fsid = VNOVAL;
368	vap->va_fileid = VNOVAL;
369	vap->va_blocksize = VNOVAL;
370	vap->va_rdev = VNOVAL;
371	vap->va_atime.tv_sec = VNOVAL;
372	vap->va_atime.tv_nsec = VNOVAL;
373	vap->va_mtime.tv_sec = VNOVAL;
374	vap->va_mtime.tv_nsec = VNOVAL;
375	vap->va_ctime.tv_sec = VNOVAL;
376	vap->va_ctime.tv_nsec = VNOVAL;
377	vap->va_flags = VNOVAL;
378	vap->va_gen = VNOVAL;
379	vap->va_vaflags = 0;
380}
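
/*
 * Illustrative sketch (not part of the original file): callers normally
 * clear a struct vattr with vattr_null() and then fill in only the
 * fields they intend to change before handing it to VOP_SETATTR(), e.g.
 * to truncate a file.
 */
#if 0
	struct vattr vat;

	vattr_null(&vat);
	vat.va_size = 0;		/* the only attribute being changed */
	(void) VOP_SETATTR(vp, &vat, cred, p);
#endif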
381
382/*
383 * Routines having to do with the management of the vnode table.
384 */
385extern vop_t **dead_vnodeop_p;
386
387/*
388 * Return the next vnode from the free list.
389 */
390int
391getnewvnode(tag, mp, vops, vpp)
392	enum vtagtype tag;
393	struct mount *mp;
394	vop_t **vops;
395	struct vnode **vpp;
396{
397	int s;
398	struct proc *p = curproc;	/* XXX */
399	struct vnode *vp, *tvp, *nvp;
400	vm_object_t object;
401	TAILQ_HEAD(freelst, vnode) vnode_tmp_list;
402
403	/*
404	 * We take the least recently used vnode from the freelist
405	 * if we can get it, it has no cached pages, and no namecache
406	 * entries refer to it.
407	 * Otherwise we allocate a new vnode.
408	 */
409
410	s = splbio();
411	simple_lock(&vnode_free_list_slock);
412	TAILQ_INIT(&vnode_tmp_list);
413
414	for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) {
415		nvp = TAILQ_NEXT(vp, v_freelist);
416		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
417		if (vp->v_flag & VAGE) {
418			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
419		} else {
420			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
421		}
422		vp->v_flag &= ~(VTBFREE|VAGE);
423		vp->v_flag |= VFREE;
424		if (vp->v_usecount)
425			panic("tobe free vnode isn't");
426		freevnodes++;
427	}
428
429	if (wantfreevnodes && freevnodes < wantfreevnodes) {
430		vp = NULL;
431	} else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
432		/*
433		 * XXX: this is only here to be backwards compatible
434		 */
435		vp = NULL;
436	} else {
437		for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) {
438			nvp = TAILQ_NEXT(vp, v_freelist);
439			if (!simple_lock_try(&vp->v_interlock))
440				continue;
441			if (vp->v_usecount)
442				panic("free vnode isn't");
443
444			object = vp->v_object;
445			if (object && (object->resident_page_count || object->ref_count)) {
446				printf("object inconsistent state: RPC: %d, RC: %d\n",
447					object->resident_page_count, object->ref_count);
448				/* Don't recycle if it's caching some pages */
449				TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
450				TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist);
451				continue;
452			} else if (LIST_FIRST(&vp->v_cache_src)) {
453				/* Don't recycle if active in the namecache */
454				simple_unlock(&vp->v_interlock);
455				continue;
456			} else {
457				break;
458			}
459		}
460	}
461
462	for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) {
463		nvp = TAILQ_NEXT(tvp, v_freelist);
464		TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist);
465		TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist);
466		simple_unlock(&tvp->v_interlock);
467	}
468
469	if (vp) {
470		vp->v_flag |= VDOOMED;
471		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
472		freevnodes--;
473		simple_unlock(&vnode_free_list_slock);
474		cache_purge(vp);
475		vp->v_lease = NULL;
476		if (vp->v_type != VBAD) {
477			vgonel(vp, p);
478		} else {
479			simple_unlock(&vp->v_interlock);
480		}
481
482#ifdef INVARIANTS
483		{
484			int s;
485
486			if (vp->v_data)
487				panic("cleaned vnode isn't");
488			s = splbio();
489			if (vp->v_numoutput)
490				panic("Clean vnode has pending I/O's");
491			splx(s);
492		}
493#endif
494		vp->v_flag = 0;
495		vp->v_lastr = 0;
496		vp->v_lastw = 0;
497		vp->v_lasta = 0;
498		vp->v_cstart = 0;
499		vp->v_clen = 0;
500		vp->v_socket = 0;
501		vp->v_writecount = 0;	/* XXX */
502		vp->v_maxio = 0;
503	} else {
504		simple_unlock(&vnode_free_list_slock);
505		vp = (struct vnode *) zalloc(vnode_zone);
506		bzero((char *) vp, sizeof *vp);
507		simple_lock_init(&vp->v_interlock);
508		vp->v_dd = vp;
509		cache_purge(vp);
510		LIST_INIT(&vp->v_cache_src);
511		TAILQ_INIT(&vp->v_cache_dst);
512		numvnodes++;
513	}
514
515	TAILQ_INIT(&vp->v_cleanblkhd);
516	TAILQ_INIT(&vp->v_dirtyblkhd);
517	vp->v_type = VNON;
518	vp->v_tag = tag;
519	vp->v_op = vops;
520	insmntque(vp, mp);
521	*vpp = vp;
522	vp->v_usecount = 1;
523	vp->v_data = 0;
524	splx(s);
525
526	vfs_object_create(vp, p, p->p_ucred);
527	return (0);
528}
529
530/*
531 * Move a vnode from one mount queue to another.
532 */
533static void
534insmntque(vp, mp)
535	register struct vnode *vp;
536	register struct mount *mp;
537{
538
539	simple_lock(&mntvnode_slock);
540	/*
541	 * Delete from old mount point vnode list, if on one.
542	 */
543	if (vp->v_mount != NULL)
544		LIST_REMOVE(vp, v_mntvnodes);
545	/*
546	 * Insert into list of vnodes for the new mount point, if available.
547	 */
548	if ((vp->v_mount = mp) == NULL) {
549		simple_unlock(&mntvnode_slock);
550		return;
551	}
552	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
553	simple_unlock(&mntvnode_slock);
554}
555
556/*
557 * Update outstanding I/O count and do wakeup if requested.
558 */
559void
560vwakeup(bp)
561	register struct buf *bp;
562{
563	register struct vnode *vp;
564
565	bp->b_flags &= ~B_WRITEINPROG;
566	if ((vp = bp->b_vp)) {
567		vp->v_numoutput--;
568		if (vp->v_numoutput < 0)
569			panic("vwakeup: neg numoutput");
570		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
571			vp->v_flag &= ~VBWAIT;
572			wakeup((caddr_t) &vp->v_numoutput);
573		}
574	}
575}
576
577/*
578 * Flush out and invalidate all buffers associated with a vnode.
579 * Called with the underlying object locked.
580 */
581int
582vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
583	register struct vnode *vp;
584	int flags;
585	struct ucred *cred;
586	struct proc *p;
587	int slpflag, slptimeo;
588{
589	register struct buf *bp;
590	struct buf *nbp, *blist;
591	int s, error;
592	vm_object_t object;
593
594	if (flags & V_SAVE) {
595		s = splbio();
596		while (vp->v_numoutput) {
597			vp->v_flag |= VBWAIT;
598			error = tsleep((caddr_t)&vp->v_numoutput,
599			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
600			if (error) {
601				splx(s);
602				return (error);
603			}
604		}
605		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
606			splx(s);
607			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
608				return (error);
609			s = splbio();
610			if (vp->v_numoutput > 0 ||
611			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
612				panic("vinvalbuf: dirty bufs");
613		}
614		splx(s);
615  	}
616	s = splbio();
617	for (;;) {
618		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
619		if (!blist)
620			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
621		if (!blist)
622			break;
623
624		for (bp = blist; bp; bp = nbp) {
625			nbp = TAILQ_NEXT(bp, b_vnbufs);
626			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
627				error = BUF_TIMELOCK(bp,
628				    LK_EXCLUSIVE | LK_SLEEPFAIL,
629				    "vinvalbuf", slpflag, slptimeo);
630				if (error == ENOLCK)
631					break;
632				splx(s);
633				return (error);
634			}
635			/*
636			 * XXX Since there are no node locks for NFS, I
637			 * believe there is a slight chance that a delayed
638			 * write will occur while sleeping just above, so
639			 * check for it.  Note that vfs_bio_awrite expects
640			 * buffers to reside on a queue, while VOP_BWRITE and
641			 * brelse do not.
642			 */
643			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
644				(flags & V_SAVE)) {
645
646				if (bp->b_vp == vp) {
647					if (bp->b_flags & B_CLUSTEROK) {
648						BUF_UNLOCK(bp);
649						vfs_bio_awrite(bp);
650					} else {
651						bremfree(bp);
652						bp->b_flags |= B_ASYNC;
653						VOP_BWRITE(bp->b_vp, bp);
654					}
655				} else {
656					bremfree(bp);
657					(void) VOP_BWRITE(bp->b_vp, bp);
658				}
659				break;
660			}
661			bremfree(bp);
662			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
663			bp->b_flags &= ~B_ASYNC;
664			brelse(bp);
665		}
666	}
667
668	while (vp->v_numoutput > 0) {
669		vp->v_flag |= VBWAIT;
670		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
671	}
672
673	splx(s);
674
675	/*
676	 * Destroy the copy in the VM cache, too.
677	 */
678	simple_lock(&vp->v_interlock);
679	object = vp->v_object;
680	if (object != NULL) {
681		vm_object_page_remove(object, 0, 0,
682			(flags & V_SAVE) ? TRUE : FALSE);
683	}
684	simple_unlock(&vp->v_interlock);
685
686	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
687		panic("vinvalbuf: flush failed");
688	return (0);
689}
690
691/*
692 * Truncate a file's buffer and pages to a specified length.  This
693 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
694 * sync activity.
695 */
696int
697vtruncbuf(vp, cred, p, length, blksize)
698	register struct vnode *vp;
699	struct ucred *cred;
700	struct proc *p;
701	off_t length;
702	int blksize;
703{
704	register struct buf *bp;
705	struct buf *nbp;
706	int s, anyfreed;
707	int trunclbn;
708
709	/*
710	 * Round up to the *next* lbn.
711	 */
712	trunclbn = (length + blksize - 1) / blksize;
713
714	s = splbio();
715restart:
716	anyfreed = 1;
717	for (;anyfreed;) {
718		anyfreed = 0;
719		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
720			nbp = TAILQ_NEXT(bp, b_vnbufs);
721			if (bp->b_lblkno >= trunclbn) {
722				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
723					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
724					goto restart;
725				} else {
726					bremfree(bp);
727					bp->b_flags |= (B_INVAL | B_RELBUF);
728					bp->b_flags &= ~B_ASYNC;
729					brelse(bp);
730					anyfreed = 1;
731				}
732				if (nbp && (((nbp->b_xflags & B_VNCLEAN) == 0)||
733					 (nbp->b_vp != vp) ||
734					 (nbp->b_flags & B_DELWRI))) {
735					goto restart;
736				}
737			}
738		}
739
740		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
741			nbp = TAILQ_NEXT(bp, b_vnbufs);
742			if (bp->b_lblkno >= trunclbn) {
743				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
744					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
745					goto restart;
746				} else {
747					bremfree(bp);
748					bp->b_flags |= (B_INVAL | B_RELBUF);
749					bp->b_flags &= ~B_ASYNC;
750					brelse(bp);
751					anyfreed = 1;
752				}
753				if (nbp && (((nbp->b_xflags & B_VNDIRTY) == 0)||
754					 (nbp->b_vp != vp) ||
755					 (nbp->b_flags & B_DELWRI) == 0)) {
756					goto restart;
757				}
758			}
759		}
760	}
761
762	if (length > 0) {
763restartsync:
764		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
765			nbp = TAILQ_NEXT(bp, b_vnbufs);
766			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
767				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
768					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
769					goto restart;
770				} else {
771					bremfree(bp);
772					if (bp->b_vp == vp) {
773						bp->b_flags |= B_ASYNC;
774					} else {
775						bp->b_flags &= ~B_ASYNC;
776					}
777					VOP_BWRITE(bp->b_vp, bp);
778				}
779				goto restartsync;
780			}
781
782		}
783	}
784
785	while (vp->v_numoutput > 0) {
786		vp->v_flag |= VBWAIT;
787		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
788	}
789
790	splx(s);
791
792	vnode_pager_setsize(vp, length);
793
794	return (0);
795}
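
/*
 * Illustrative sketch (not part of the original file): a filesystem's
 * truncate routine would typically flush buffers past the new end of
 * file like this; "fs->fs_bsize" stands in for whatever block size the
 * particular filesystem uses and is only an assumption here.
 */
#if 0
	error = vtruncbuf(vp, cred, p, length, fs->fs_bsize);
#endif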
796
797/*
798 * Associate a buffer with a vnode.
799 */
800void
801bgetvp(vp, bp)
802	register struct vnode *vp;
803	register struct buf *bp;
804{
805	int s;
806
807	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
808
809	vhold(vp);
810	bp->b_vp = vp;
811	if (vp->v_type == VBLK || vp->v_type == VCHR)
812		bp->b_dev = vp->v_rdev;
813	else
814		bp->b_dev = NODEV;
815	/*
816	 * Insert onto list for new vnode.
817	 */
818	s = splbio();
819	bp->b_xflags |= B_VNCLEAN;
820	bp->b_xflags &= ~B_VNDIRTY;
821	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
822	splx(s);
823}
824
825/*
826 * Disassociate a buffer from a vnode.
827 */
828void
829brelvp(bp)
830	register struct buf *bp;
831{
832	struct vnode *vp;
833	struct buflists *listheadp;
834	int s;
835
836	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
837
838	/*
839	 * Delete from old vnode list, if on one.
840	 */
841	vp = bp->b_vp;
842	s = splbio();
843	if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) {
844		if (bp->b_xflags & B_VNDIRTY)
845			listheadp = &vp->v_dirtyblkhd;
846		else
847			listheadp = &vp->v_cleanblkhd;
848		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
849		bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN);
850	}
851	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
852		vp->v_flag &= ~VONWORKLST;
853		LIST_REMOVE(vp, v_synclist);
854	}
855	splx(s);
856	bp->b_vp = (struct vnode *) 0;
857	vdrop(vp);
858}
859
860/*
861 * The workitem queue.
862 *
863 * It is useful to delay writes of file data and filesystem metadata
864 * for tens of seconds so that quickly created and deleted files need
865 * not waste disk bandwidth being created and removed. To realize this,
866 * we append vnodes to a "workitem" queue. When running with a soft
867 * updates implementation, most pending metadata dependencies should
868 * not wait for more than a few seconds. Thus, metadata updates on mounted
869 * block devices are delayed only about half the time that file data is
870 * delayed. Similarly, directory updates are more critical, so they are
871 * delayed only about a third of the time that file data is delayed. Thus,
872 * there are SYNCER_MAXDELAY queues that are processed round-robin at a
873 * rate of one each second (driven off the filesystem syncer process). The
874 * syncer_delayno variable indicates the next queue that is to be processed.
875 * Items that need to be processed soon are placed in this queue:
876 *
877 *	syncer_workitem_pending[syncer_delayno]
878 *
879 * A delay of fifteen seconds is done by placing the request fifteen
880 * entries later in the queue:
881 *
882 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
883 *
884 */
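
/*
 * Worked example (illustrative, not in the original): with the default
 * SYNCER_MAXDELAY of 32, hashinit() in vntblinit() leaves syncer_mask
 * at 31.  If syncer_delayno currently stands at 20 and a vnode is
 * queued with a delay of 15 seconds, it lands in slot
 * (20 + 15) & 31 == 3, i.e. the array of queues is used as a ring that
 * wraps around.
 */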
885
886/*
887 * Add an item to the syncer work queue.
888 */
889static void
890vn_syncer_add_to_worklist(struct vnode *vp, int delay)
891{
892	int s, slot;
893
894	s = splbio();
895
896	if (vp->v_flag & VONWORKLST) {
897		LIST_REMOVE(vp, v_synclist);
898	}
899
900	if (delay > syncer_maxdelay - 2)
901		delay = syncer_maxdelay - 2;
902	slot = (syncer_delayno + delay) & syncer_mask;
903
904	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
905	vp->v_flag |= VONWORKLST;
906	splx(s);
907}
908
909struct  proc *updateproc;
910static void sched_sync __P((void));
911static struct kproc_desc up_kp = {
912	"syncer",
913	sched_sync,
914	&updateproc
915};
916SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
917
918/*
919 * System filesystem synchronizer daemon.
920 */
921void
922sched_sync(void)
923{
924	struct synclist *slp;
925	struct vnode *vp;
926	long starttime;
927	int s;
928	struct proc *p = updateproc;
929
930	for (;;) {
931		starttime = time_second;
932
933		/*
934		 * Push files whose dirty time has expired.  Be careful
935		 * of interrupt race on slp queue.
936		 */
937		s = splbio();
938		slp = &syncer_workitem_pending[syncer_delayno];
939		syncer_delayno += 1;
940		if (syncer_delayno == syncer_maxdelay)
941			syncer_delayno = 0;
942		splx(s);
943
944		while ((vp = LIST_FIRST(slp)) != NULL) {
945			if (VOP_ISLOCKED(vp) == 0) {
946				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
947				(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
948				VOP_UNLOCK(vp, 0, p);
949			}
950			s = splbio();
951			if (LIST_FIRST(slp) == vp) {
952				/*
953				 * Note: v_tag VT_VFS vps can remain on the
954				 * worklist too with no dirty blocks, but
955				 * since sync_fsync() moves them to a different
956				 * slot we are safe.
957				 */
958				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
959				    vp->v_type != VBLK)
960					panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
961				/*
962				 * Put us back on the worklist.  The worklist
963				 * routine will remove us from our current
964				 * position and then add us back in at a later
965				 * position.
966				 */
967				vn_syncer_add_to_worklist(vp, syncdelay);
968			}
969			splx(s);
970		}
971
972		/*
973		 * Do soft update processing.
974		 */
975		if (bioops.io_sync)
976			(*bioops.io_sync)(NULL);
977
978		/*
979		 * The variable rushjob allows the kernel to speed up the
980		 * processing of the filesystem syncer process. A rushjob
981		 * value of N tells the filesystem syncer to process the next
982		 * N seconds worth of work on its queue ASAP. Currently rushjob
983		 * is used by the soft update code to speed up the filesystem
984		 * syncer process when the incore state is getting so far
985		 * ahead of the disk that the kernel memory pool is being
986		 * threatened with exhaustion.
987		 */
988		if (rushjob > 0) {
989			rushjob -= 1;
990			continue;
991		}
992		/*
993		 * If it has taken us less than a second to process the
994		 * current work, then wait. Otherwise start right over
995		 * again. We can still lose time if any single round
996		 * takes more than two seconds, but it does not really
997		 * matter as we are just trying to generally pace the
998		 * filesystem activity.
999		 */
1000		if (time_second == starttime)
1001			tsleep(&lbolt, PPAUSE, "syncer", 0);
1002	}
1003}
1004
1005/*
1006 * Request the syncer daemon to speed up its work.
1007 * We never push it to speed up more than half of its
1008 * normal turn time; otherwise it could take over the cpu.
1009 */
1010int
1011speedup_syncer()
1012{
1013	int s;
1014
1015	s = splhigh();
1016	if (updateproc->p_wchan == &lbolt)
1017		setrunnable(updateproc);
1018	splx(s);
1019	if (rushjob < syncdelay / 2) {
1020		rushjob += 1;
1021		stat_rush_requests += 1;
1022		return (1);
1023	}
1024	return(0);
1025}
1026
1027/*
1028 * Associate a p-buffer with a vnode.
1029 *
1030 * Also sets B_PAGING flag to indicate that vnode is not fully associated
1031 * with the buffer.  i.e. the bp has not been linked into the vnode or
1032 * ref-counted.
1033 */
1034void
1035pbgetvp(vp, bp)
1036	register struct vnode *vp;
1037	register struct buf *bp;
1038{
1039
1040	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1041
1042	bp->b_vp = vp;
1043	bp->b_flags |= B_PAGING;
1044	if (vp->v_type == VBLK || vp->v_type == VCHR)
1045		bp->b_dev = vp->v_rdev;
1046	else
1047		bp->b_dev = NODEV;
1048}
1049
1050/*
1051 * Disassociate a p-buffer from a vnode.
1052 */
1053void
1054pbrelvp(bp)
1055	register struct buf *bp;
1056{
1057
1058	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1059
1060#if !defined(MAX_PERF)
1061	/* XXX REMOVE ME */
1062	if (bp->b_vnbufs.tqe_next != NULL) {
1063		panic(
1064		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1065		    bp,
1066		    (int)bp->b_flags
1067		);
1068	}
1069#endif
1070	bp->b_vp = (struct vnode *) 0;
1071	bp->b_flags &= ~B_PAGING;
1072}
1073
1074void
1075pbreassignbuf(bp, newvp)
1076	struct buf *bp;
1077	struct vnode *newvp;
1078{
1079#if !defined(MAX_PERF)
1080	if ((bp->b_flags & B_PAGING) == 0) {
1081		panic(
1082		    "pbreassignbuf() on non phys bp %p",
1083		    bp
1084		);
1085	}
1086#endif
1087	bp->b_vp = newvp;
1088}
1089
1090/*
1091 * Reassign a buffer from one vnode to another.
1092 * Used to assign file specific control information
1093 * (indirect blocks) to the vnode to which they belong.
1094 */
1095void
1096reassignbuf(bp, newvp)
1097	register struct buf *bp;
1098	register struct vnode *newvp;
1099{
1100	struct buflists *listheadp;
1101	int delay;
1102	int s;
1103
1104	if (newvp == NULL) {
1105		printf("reassignbuf: NULL");
1106		return;
1107	}
1108
1109#if !defined(MAX_PERF)
1110	/*
1111	 * B_PAGING flagged buffers cannot be reassigned because their vp
1112	 * is not fully linked in.
1113	 */
1114	if (bp->b_flags & B_PAGING)
1115		panic("cannot reassign paging buffer");
1116#endif
1117
1118	s = splbio();
1119	/*
1120	 * Delete from old vnode list, if on one.
1121	 */
1122	if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) {
1123		if (bp->b_xflags & B_VNDIRTY)
1124			listheadp = &bp->b_vp->v_dirtyblkhd;
1125		else
1126			listheadp = &bp->b_vp->v_cleanblkhd;
1127		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
1128		bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN);
1129		if (bp->b_vp != newvp) {
1130			vdrop(bp->b_vp);
1131			bp->b_vp = NULL;	/* for clarification */
1132		}
1133	}
1134	/*
1135	 * If dirty, put on list of dirty buffers; otherwise insert onto list
1136	 * of clean buffers.
1137	 */
1138	if (bp->b_flags & B_DELWRI) {
1139		struct buf *tbp;
1140
1141		listheadp = &newvp->v_dirtyblkhd;
1142		if ((newvp->v_flag & VONWORKLST) == 0) {
1143			switch (newvp->v_type) {
1144			case VDIR:
1145				delay = dirdelay;
1146				break;
1147			case VBLK:
1148				if (newvp->v_specmountpoint != NULL) {
1149					delay = metadelay;
1150					break;
1151				}
1152				/* fall through */
1153			default:
1154				delay = filedelay;
1155			}
1156			vn_syncer_add_to_worklist(newvp, delay);
1157		}
1158		bp->b_xflags |= B_VNDIRTY;
1159		tbp = TAILQ_FIRST(listheadp);
1160		if (tbp == NULL ||
1161		    (bp->b_lblkno >= 0 && tbp->b_lblkno > bp->b_lblkno)) {
1162			TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
1163		} else {
1164			if (bp->b_lblkno >= 0) {
1165				struct buf *ttbp;
1166				while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
1167				    (ttbp->b_lblkno < bp->b_lblkno)) {
1168					tbp = ttbp;
1169				}
1170				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1171			} else {
1172				TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
1173			}
1174		}
1175	} else {
1176		bp->b_xflags |= B_VNCLEAN;
1177		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
1178		if ((newvp->v_flag & VONWORKLST) &&
1179		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1180			newvp->v_flag &= ~VONWORKLST;
1181			LIST_REMOVE(newvp, v_synclist);
1182		}
1183	}
1184	if (bp->b_vp != newvp) {
1185		bp->b_vp = newvp;
1186		vhold(bp->b_vp);
1187	}
1188	splx(s);
1189}
1190
1191/*
1192 * Create a vnode for a block device.
1193 * Used for mounting the root file system.
1194 */
1195int
1196bdevvp(dev, vpp)
1197	dev_t dev;
1198	struct vnode **vpp;
1199{
1200	register struct vnode *vp;
1201	struct vnode *nvp;
1202	int error;
1203
1204	if (dev == NODEV) {
1205		*vpp = NULLVP;
1206		return (ENXIO);
1207	}
1208	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
1209	if (error) {
1210		*vpp = NULLVP;
1211		return (error);
1212	}
1213	vp = nvp;
1214	vp->v_type = VBLK;
1215	if ((nvp = checkalias(vp, dev2udev(dev), (struct mount *)0)) != NULL) {
1216		vput(vp);
1217		vp = nvp;
1218	}
1219	*vpp = vp;
1220	return (0);
1221}
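
/*
 * Illustrative sketch (not part of the original file): a filesystem's
 * mountroot routine typically obtains the vnode for the root device this
 * way; rootdev and rootvp are the conventional globals assumed here.
 */
#if 0
	if (bdevvp(rootdev, &rootvp))
		panic("mountroot: can't setup bdevvp for root device");
#endif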
1222
1223/*
1224 * Check to see if the new vnode represents a special device
1225 * for which we already have a vnode (either because of
1226 * bdevvp() or because of a different vnode representing
1227 * the same block device). If such an alias exists, deallocate
1228 * the existing contents and return the aliased vnode. The
1229 * caller is responsible for filling it with its new contents.
1230 */
1231struct vnode *
1232checkalias(nvp, nvp_rdev, mp)
1233	register struct vnode *nvp;
1234	udev_t nvp_rdev;
1235	struct mount *mp;
1236{
1237	struct proc *p = curproc;	/* XXX */
1238	struct vnode *vp;
1239	struct vnode **vpp;
1240	dev_t	dev;
1241
1242	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1243		return (NULLVP);
1244
1245	dev = udev2dev(nvp_rdev, 2);
1246
1247	vpp = &speclisth[SPECHASH(dev)];
1248loop:
1249	simple_lock(&spechash_slock);
1250	for (vp = *vpp; vp; vp = vp->v_specnext) {
1251		if (dev != vp->v_rdev || nvp->v_type != vp->v_type)
1252			continue;
1253		/*
1254		 * Alias, but not in use, so flush it out.
1255		 * Only alias active device nodes.
1256		 * Not sure why we don't re-use this like we do below.
1257		 */
1258		simple_lock(&vp->v_interlock);
1259		if (vp->v_usecount == 0) {
1260			simple_unlock(&spechash_slock);
1261			vgonel(vp, p);
1262			goto loop;
1263		}
1264		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
1265			/*
1266			 * It disappeared, and we may have slept.
1267			 * Restart from the beginning
1268			 */
1269			simple_unlock(&spechash_slock);
1270			goto loop;
1271		}
1272		break;
1273	}
1274	/*
1275	 * It would be a lot clearer what is going on here if
1276	 * this had been expressed as:
1277	 * if ( vp && (vp->v_tag == VT_NON))
1278	 * and the clauses had been swapped.
1279	 */
1280	if (vp == NULL || vp->v_tag != VT_NON) {
1281		struct specinfo *sinfo;
1282
1283		/*
1284		 * Put the new vnode into the hash chain.
1285		 * and if there was an alias, connect them.
1286		 */
1287		MALLOC(sinfo, struct specinfo *,
1288		    sizeof(struct specinfo), M_VNODE, M_WAITOK);
1289		bzero(sinfo, sizeof(struct specinfo));
1290		nvp->v_specinfo = sinfo;
1291		sinfo->si_rdev = dev;
1292		sinfo->si_hashchain = vpp;
1293		sinfo->si_specnext = *vpp;
1294		sinfo->si_bsize_phys = DEV_BSIZE;
1295		sinfo->si_bsize_best = BLKDEV_IOSIZE;
1296		sinfo->si_bsize_max = MAXBSIZE;
1297
1298		/*
1299		 * Ask the device to fix up specinfo.  Typically the
1300		 * si_bsize_* parameters may need fixing up.
1301		 */
1302
1303		if (nvp->v_type == VBLK) {
1304			if (bdevsw(dev) && bdevsw(dev)->d_parms)
1305				(*bdevsw(dev)->d_parms)(dev, sinfo, DPARM_GET);
1306		} else if (nvp->v_type == VCHR) {
1307			if (devsw(dev) && devsw(dev)->d_parms)
1308				(*devsw(dev)->d_parms)(dev, sinfo, DPARM_GET);
1309		}
1310
1311		simple_unlock(&spechash_slock);
1312		*vpp = nvp;
1313		if (vp != NULLVP) {
1314			nvp->v_flag |= VALIASED;
1315			vp->v_flag |= VALIASED;
1316			vput(vp);
1317		}
1318		return (NULLVP);
1319	}
1320	/*
1321	 * if ( vp && (vp->v_tag == VT_NON))
1322	 * We have a vnode alias, but it is trashed.
1323	 * Make it look like it was newly allocated (by getnewvnode()).
1324	 * The caller should use this instead.
1325	 */
1326	simple_unlock(&spechash_slock);
1327	VOP_UNLOCK(vp, 0, p);
1328	simple_lock(&vp->v_interlock);
1329	vclean(vp, 0, p);
1330	vp->v_op = nvp->v_op;
1331	vp->v_tag = nvp->v_tag;
1332	nvp->v_type = VNON;
1333	insmntque(vp, mp);
1334	return (vp);
1335}
1336
1337/*
1338 * Grab a particular vnode from the free list, increment its
1339 * reference count and lock it. The vnode lock bit is set the
1340 * vnode is being eliminated in vgone. The process is awakened
1341 * when the transition is completed, and an error returned to
1342 * indicate that the vnode is no longer usable (possibly having
1343 * been changed to a new file system type).
1344 */
1345int
1346vget(vp, flags, p)
1347	register struct vnode *vp;
1348	int flags;
1349	struct proc *p;
1350{
1351	int error;
1352
1353	/*
1354	 * If the vnode is in the process of being cleaned out for
1355	 * another use, we wait for the cleaning to finish and then
1356	 * return failure. Cleaning is determined by checking that
1357	 * the VXLOCK flag is set.
1358	 */
1359	if ((flags & LK_INTERLOCK) == 0) {
1360		simple_lock(&vp->v_interlock);
1361	}
1362	if (vp->v_flag & VXLOCK) {
1363		vp->v_flag |= VXWANT;
1364		simple_unlock(&vp->v_interlock);
1365		tsleep((caddr_t)vp, PINOD, "vget", 0);
1366		return (ENOENT);
1367	}
1368
1369	vp->v_usecount++;
1370
1371	if (VSHOULDBUSY(vp))
1372		vbusy(vp);
1373	if (flags & LK_TYPE_MASK) {
1374		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
1375			/*
1376			 * must expand vrele here because we do not want
1377			 * to call VOP_INACTIVE if the reference count
1378			 * drops back to zero since it was never really
1379			 * active. We must remove it from the free list
1380			 * before sleeping so that multiple processes do
1381			 * not try to recycle it.
1382			 */
1383			simple_lock(&vp->v_interlock);
1384			vp->v_usecount--;
1385			if (VSHOULDFREE(vp))
1386				vfree(vp);
1387			simple_unlock(&vp->v_interlock);
1388		}
1389		return (error);
1390	}
1391	simple_unlock(&vp->v_interlock);
1392	return (0);
1393}
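
/*
 * Illustrative sketch (not part of the original file): the common way to
 * take a reference and a lock on a vnode found on some list, and to drop
 * both again.  A non-zero return from vget() means the vnode went away
 * (or was being cleaned) while we slept and must not be used.
 */
#if 0
	if (vget(vp, LK_EXCLUSIVE, p))
		goto loop;		/* vnode went away; rescan the list */
	/* ... use the locked, referenced vnode ... */
	vput(vp);			/* unlock and release the reference */
#endif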
1394
1395void
1396vref(struct vnode *vp)
1397{
1398	simple_lock(&vp->v_interlock);
1399	vp->v_usecount++;
1400	simple_unlock(&vp->v_interlock);
1401}
1402
1403/*
1404 * Vnode put/release.
1405 * If count drops to zero, call inactive routine and return to freelist.
1406 */
1407void
1408vrele(vp)
1409	struct vnode *vp;
1410{
1411	struct proc *p = curproc;	/* XXX */
1412
1413	KASSERT(vp != NULL, ("vrele: null vp"));
1414
1415	simple_lock(&vp->v_interlock);
1416
1417	if (vp->v_usecount > 1) {
1418
1419		vp->v_usecount--;
1420		simple_unlock(&vp->v_interlock);
1421
1422		return;
1423	}
1424
1425	if (vp->v_usecount == 1) {
1426
1427		vp->v_usecount--;
1428		if (VSHOULDFREE(vp))
1429			vfree(vp);
1430	/*
1431	 * If we are doing a vput, the node is already locked, and we must
1432	 * call VOP_INACTIVE with the node locked.  So, in the case of
1433	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1434	 */
1435		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
1436			VOP_INACTIVE(vp, p);
1437		}
1438
1439	} else {
1440#ifdef DIAGNOSTIC
1441		vprint("vrele: negative ref count", vp);
1442		simple_unlock(&vp->v_interlock);
1443#endif
1444		panic("vrele: negative ref cnt");
1445	}
1446}
1447
1448void
1449vput(vp)
1450	struct vnode *vp;
1451{
1452	struct proc *p = curproc;	/* XXX */
1453
1454	KASSERT(vp != NULL, ("vput: null vp"));
1455
1456	simple_lock(&vp->v_interlock);
1457
1458	if (vp->v_usecount > 1) {
1459
1460		vp->v_usecount--;
1461		VOP_UNLOCK(vp, LK_INTERLOCK, p);
1462		return;
1463
1464	}
1465
1466	if (vp->v_usecount == 1) {
1467
1468		vp->v_usecount--;
1469		if (VSHOULDFREE(vp))
1470			vfree(vp);
1471	/*
1472	 * If we are doing a vput, the node is already locked, and we must
1473	 * call VOP_INACTIVE with the node locked.  So, in the case of
1474	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1475	 */
1476		simple_unlock(&vp->v_interlock);
1477		VOP_INACTIVE(vp, p);
1478
1479	} else {
1480#ifdef DIAGNOSTIC
1481		vprint("vput: negative ref count", vp);
1482#endif
1483		panic("vput: negative ref cnt");
1484	}
1485}
1486
1487/*
1488 * Somebody doesn't want the vnode recycled.
1489 */
1490void
1491vhold(vp)
1492	register struct vnode *vp;
1493{
1494	int s;
1495
1496  	s = splbio();
1497	vp->v_holdcnt++;
1498	if (VSHOULDBUSY(vp))
1499		vbusy(vp);
1500	splx(s);
1501}
1502
1503/*
1504 * One less who cares about this vnode.
1505 */
1506void
1507vdrop(vp)
1508	register struct vnode *vp;
1509{
1510	int s;
1511
1512	s = splbio();
1513	if (vp->v_holdcnt <= 0)
1514		panic("vdrop: holdcnt");
1515	vp->v_holdcnt--;
1516	if (VSHOULDFREE(vp))
1517		vfree(vp);
1518	splx(s);
1519}
1520
1521/*
1522 * Remove any vnodes in the vnode table belonging to mount point mp.
1523 *
1524 * If MNT_NOFORCE is specified, there should not be any active ones,
1525 * return error if any are found (nb: this is a user error, not a
1526 * system error). If MNT_FORCE is specified, detach any active vnodes
1527 * that are found.
1528 */
1529#ifdef DIAGNOSTIC
1530static int busyprt = 0;		/* print out busy vnodes */
1531SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
1532#endif
1533
1534int
1535vflush(mp, skipvp, flags)
1536	struct mount *mp;
1537	struct vnode *skipvp;
1538	int flags;
1539{
1540	struct proc *p = curproc;	/* XXX */
1541	struct vnode *vp, *nvp;
1542	int busy = 0;
1543
1544	simple_lock(&mntvnode_slock);
1545loop:
1546	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
1547		/*
1548		 * Make sure this vnode wasn't reclaimed in getnewvnode().
1549		 * Start over if it was (it won't be on the list anymore).
1550		 */
1551		if (vp->v_mount != mp)
1552			goto loop;
1553		nvp = vp->v_mntvnodes.le_next;
1554		/*
1555		 * Skip over a selected vnode.
1556		 */
1557		if (vp == skipvp)
1558			continue;
1559
1560		simple_lock(&vp->v_interlock);
1561		/*
1562		 * Skip over vnodes marked VSYSTEM.
1563		 */
1564		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
1565			simple_unlock(&vp->v_interlock);
1566			continue;
1567		}
1568		/*
1569		 * If WRITECLOSE is set, only flush out regular file vnodes
1570		 * open for writing.
1571		 */
1572		if ((flags & WRITECLOSE) &&
1573		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
1574			simple_unlock(&vp->v_interlock);
1575			continue;
1576		}
1577
1578		/*
1579		 * With v_usecount == 0, all we need to do is clear out the
1580		 * vnode data structures and we are done.
1581		 */
1582		if (vp->v_usecount == 0) {
1583			simple_unlock(&mntvnode_slock);
1584			vgonel(vp, p);
1585			simple_lock(&mntvnode_slock);
1586			continue;
1587		}
1588
1589		/*
1590		 * If FORCECLOSE is set, forcibly close the vnode. For block
1591		 * or character devices, revert to an anonymous device. For
1592		 * all other files, just kill them.
1593		 */
1594		if (flags & FORCECLOSE) {
1595			simple_unlock(&mntvnode_slock);
1596			if (vp->v_type != VBLK && vp->v_type != VCHR) {
1597				vgonel(vp, p);
1598			} else {
1599				vclean(vp, 0, p);
1600				vp->v_op = spec_vnodeop_p;
1601				insmntque(vp, (struct mount *) 0);
1602			}
1603			simple_lock(&mntvnode_slock);
1604			continue;
1605		}
1606#ifdef DIAGNOSTIC
1607		if (busyprt)
1608			vprint("vflush: busy vnode", vp);
1609#endif
1610		simple_unlock(&vp->v_interlock);
1611		busy++;
1612	}
1613	simple_unlock(&mntvnode_slock);
1614	if (busy)
1615		return (EBUSY);
1616	return (0);
1617}
1618
1619/*
1620 * Disassociate the underlying file system from a vnode.
1621 */
1622static void
1623vclean(vp, flags, p)
1624	struct vnode *vp;
1625	int flags;
1626	struct proc *p;
1627{
1628	int active;
1629	vm_object_t obj;
1630
1631	/*
1632	 * Check to see if the vnode is in use. If so we have to reference it
1633	 * before we clean it out so that its count cannot fall to zero and
1634	 * generate a race against ourselves to recycle it.
1635	 */
1636	if ((active = vp->v_usecount))
1637		vp->v_usecount++;
1638
1639	/*
1640	 * Prevent the vnode from being recycled or brought into use while we
1641	 * clean it out.
1642	 */
1643	if (vp->v_flag & VXLOCK)
1644		panic("vclean: deadlock");
1645	vp->v_flag |= VXLOCK;
1646	/*
1647	 * Even if the count is zero, the VOP_INACTIVE routine may still
1648	 * have the object locked while it cleans it out. The VOP_LOCK
1649	 * ensures that the VOP_INACTIVE routine is done with its work.
1650	 * For active vnodes, it ensures that no other activity can
1651	 * occur while the underlying object is being cleaned out.
1652	 */
1653	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
1654
1655	/*
1656	 * Clean out any buffers associated with the vnode.
1657	 */
1658	vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
1659	if ((obj = vp->v_object) != NULL) {
1660		if (obj->ref_count == 0) {
1661			/*
1662			 * This is a normal way of shutting down the object/vnode
1663			 * association.
1664			 */
1665			vm_object_terminate(obj);
1666		} else {
1667			/*
1668			 * Woe to the process that tries to page now :-).
1669			 */
1670			vm_pager_deallocate(obj);
1671		}
1672	}
1673
1674	/*
1675	 * If purging an active vnode, it must be closed and
1676	 * deactivated before being reclaimed. Note that the
1677	 * VOP_INACTIVE will unlock the vnode.
1678	 */
1679	if (active) {
1680		if (flags & DOCLOSE)
1681			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
1682		VOP_INACTIVE(vp, p);
1683	} else {
1684		/*
1685		 * Any other processes trying to obtain this lock must first
1686		 * wait for VXLOCK to clear, then call the new lock operation.
1687		 */
1688		VOP_UNLOCK(vp, 0, p);
1689	}
1690	/*
1691	 * Reclaim the vnode.
1692	 */
1693	if (VOP_RECLAIM(vp, p))
1694		panic("vclean: cannot reclaim");
1695
1696	if (active)
1697		vrele(vp);
1698
1699	cache_purge(vp);
1700	if (vp->v_vnlock) {
1701#if 0 /* This is the only place we have LK_DRAINED in the entire kernel ??? */
1702#ifdef DIAGNOSTIC
1703		if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
1704			vprint("vclean: lock not drained", vp);
1705#endif
1706#endif
1707		FREE(vp->v_vnlock, M_VNODE);
1708		vp->v_vnlock = NULL;
1709	}
1710
1711	if (VSHOULDFREE(vp))
1712		vfree(vp);
1713
1714	/*
1715	 * Done with purge, notify sleepers of the grim news.
1716	 */
1717	vp->v_op = dead_vnodeop_p;
1718	vn_pollgone(vp);
1719	vp->v_tag = VT_NON;
1720	vp->v_flag &= ~VXLOCK;
1721	if (vp->v_flag & VXWANT) {
1722		vp->v_flag &= ~VXWANT;
1723		wakeup((caddr_t) vp);
1724	}
1725}
1726
1727/*
1728 * Eliminate all activity associated with the requested vnode
1729 * and with all vnodes aliased to the requested vnode.
1730 */
1731int
1732vop_revoke(ap)
1733	struct vop_revoke_args /* {
1734		struct vnode *a_vp;
1735		int a_flags;
1736	} */ *ap;
1737{
1738	struct vnode *vp, *vq;
1739	struct proc *p = curproc;	/* XXX */
1740
1741	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
1742
1743	vp = ap->a_vp;
1744	simple_lock(&vp->v_interlock);
1745
1746	if (vp->v_flag & VALIASED) {
1747		/*
1748		 * If a vgone (or vclean) is already in progress,
1749		 * wait until it is done and return.
1750		 */
1751		if (vp->v_flag & VXLOCK) {
1752			vp->v_flag |= VXWANT;
1753			simple_unlock(&vp->v_interlock);
1754			tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
1755			return (0);
1756		}
1757		/*
1758		 * Ensure that vp will not be vgone'd while we
1759		 * are eliminating its aliases.
1760		 */
1761		vp->v_flag |= VXLOCK;
1762		simple_unlock(&vp->v_interlock);
1763		while (vp->v_flag & VALIASED) {
1764			simple_lock(&spechash_slock);
1765			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1766				if (vq->v_rdev != vp->v_rdev ||
1767				    vq->v_type != vp->v_type || vp == vq)
1768					continue;
1769				simple_unlock(&spechash_slock);
1770				vgone(vq);
1771				break;
1772			}
1773			if (vq == NULLVP) {
1774				simple_unlock(&spechash_slock);
1775			}
1776		}
1777		/*
1778		 * Remove the lock so that vgone below will
1779		 * really eliminate the vnode after which time
1780		 * vgone will awaken any sleepers.
1781		 */
1782		simple_lock(&vp->v_interlock);
1783		vp->v_flag &= ~VXLOCK;
1784		if (vp->v_flag & VXWANT) {
1785			vp->v_flag &= ~VXWANT;
1786			wakeup(vp);
1787		}
1788	}
1789	vgonel(vp, p);
1790	return (0);
1791}
1792
1793/*
1794 * Recycle an unused vnode to the front of the free list.
1795 * Release the passed interlock if the vnode will be recycled.
1796 */
1797int
1798vrecycle(vp, inter_lkp, p)
1799	struct vnode *vp;
1800	struct simplelock *inter_lkp;
1801	struct proc *p;
1802{
1803
1804	simple_lock(&vp->v_interlock);
1805	if (vp->v_usecount == 0) {
1806		if (inter_lkp) {
1807			simple_unlock(inter_lkp);
1808		}
1809		vgonel(vp, p);
1810		return (1);
1811	}
1812	simple_unlock(&vp->v_interlock);
1813	return (0);
1814}
1815
1816/*
1817 * Eliminate all activity associated with a vnode
1818 * in preparation for reuse.
1819 */
1820void
1821vgone(vp)
1822	register struct vnode *vp;
1823{
1824	struct proc *p = curproc;	/* XXX */
1825
1826	simple_lock(&vp->v_interlock);
1827	vgonel(vp, p);
1828}
1829
1830/*
1831 * vgone, with the vp interlock held.
1832 */
1833static void
1834vgonel(vp, p)
1835	struct vnode *vp;
1836	struct proc *p;
1837{
1838	int s;
1839	struct vnode *vq;
1840	struct vnode *vx;
1841
1842	/*
1843	 * If a vgone (or vclean) is already in progress,
1844	 * wait until it is done and return.
1845	 */
1846	if (vp->v_flag & VXLOCK) {
1847		vp->v_flag |= VXWANT;
1848		simple_unlock(&vp->v_interlock);
1849		tsleep((caddr_t)vp, PINOD, "vgone", 0);
1850		return;
1851	}
1852
1853	/*
1854	 * Clean out the filesystem specific data.
1855	 */
1856	vclean(vp, DOCLOSE, p);
1857	simple_lock(&vp->v_interlock);
1858
1859	/*
1860	 * Delete from old mount point vnode list, if on one.
1861	 */
1862	if (vp->v_mount != NULL)
1863		insmntque(vp, (struct mount *)0);
1864	/*
1865	 * If special device, remove it from special device alias list
1866	 * if it is on one.
1867	 */
1868	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
1869		simple_lock(&spechash_slock);
1870		if (*vp->v_hashchain == vp) {
1871			*vp->v_hashchain = vp->v_specnext;
1872		} else {
1873			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1874				if (vq->v_specnext != vp)
1875					continue;
1876				vq->v_specnext = vp->v_specnext;
1877				break;
1878			}
1879			if (vq == NULL)
1880				panic("missing bdev");
1881		}
1882		if (vp->v_flag & VALIASED) {
1883			vx = NULL;
1884			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1885				if (vq->v_rdev != vp->v_rdev ||
1886				    vq->v_type != vp->v_type)
1887					continue;
1888				if (vx)
1889					break;
1890				vx = vq;
1891			}
1892			if (vx == NULL)
1893				panic("missing alias");
1894			if (vq == NULL)
1895				vx->v_flag &= ~VALIASED;
1896			vp->v_flag &= ~VALIASED;
1897		}
1898		simple_unlock(&spechash_slock);
1899		FREE(vp->v_specinfo, M_VNODE);
1900		vp->v_specinfo = NULL;
1901	}
1902
1903	/*
1904	 * If it is on the freelist and not already at the head,
1905	 * move it to the head of the list. The test of the back
1906	 * pointer and the reference count of zero is because
1907	 * it will be removed from the free list by getnewvnode,
1908	 * but will not have its reference count incremented until
1909	 * after calling vgone. If the reference count were
1910	 * incremented first, vgone would (incorrectly) try to
1911	 * close the previous instance of the underlying object.
1912	 */
1913	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
1914		s = splbio();
1915		simple_lock(&vnode_free_list_slock);
1916		if (vp->v_flag & VFREE) {
1917			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1918		} else if (vp->v_flag & VTBFREE) {
1919			TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
1920			vp->v_flag &= ~VTBFREE;
1921			freevnodes++;
1922		} else
1923			freevnodes++;
1924		vp->v_flag |= VFREE;
1925		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1926		simple_unlock(&vnode_free_list_slock);
1927		splx(s);
1928	}
1929
1930	vp->v_type = VBAD;
1931	simple_unlock(&vp->v_interlock);
1932}
1933
1934/*
1935 * Lookup a vnode by device number.
1936 */
1937int
1938vfinddev(dev, type, vpp)
1939	dev_t dev;
1940	enum vtype type;
1941	struct vnode **vpp;
1942{
1943	register struct vnode *vp;
1944	int rc = 0;
1945
1946	simple_lock(&spechash_slock);
1947	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
1948		if (dev != vp->v_rdev || type != vp->v_type)
1949			continue;
1950		*vpp = vp;
1951		rc = 1;
1952		break;
1953	}
1954	simple_unlock(&spechash_slock);
1955	return (rc);
1956}
1957
1958/*
1959 * Calculate the total number of references to a special device.
1960 */
1961int
1962vcount(vp)
1963	register struct vnode *vp;
1964{
1965	struct vnode *vq, *vnext;
1966	int count;
1967
1968loop:
1969	if ((vp->v_flag & VALIASED) == 0)
1970		return (vp->v_usecount);
1971	simple_lock(&spechash_slock);
1972	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
1973		vnext = vq->v_specnext;
1974		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
1975			continue;
1976		/*
1977		 * Alias, but not in use, so flush it out.
1978		 */
1979		if (vq->v_usecount == 0 && vq != vp) {
1980			simple_unlock(&spechash_slock);
1981			vgone(vq);
1982			goto loop;
1983		}
1984		count += vq->v_usecount;
1985	}
1986	simple_unlock(&spechash_slock);
1987	return (count);
1988}
1989/*
1990 * Print out a description of a vnode.
1991 */
1992static char *typename[] =
1993{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
1994
1995void
1996vprint(label, vp)
1997	char *label;
1998	register struct vnode *vp;
1999{
2000	char buf[96];
2001
2002	if (label != NULL)
2003		printf("%s: %p: ", label, (void *)vp);
2004	else
2005		printf("%p: ", (void *)vp);
2006	printf("type %s, usecount %d, writecount %d, refcount %d,",
2007	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
2008	    vp->v_holdcnt);
2009	buf[0] = '\0';
2010	if (vp->v_flag & VROOT)
2011		strcat(buf, "|VROOT");
2012	if (vp->v_flag & VTEXT)
2013		strcat(buf, "|VTEXT");
2014	if (vp->v_flag & VSYSTEM)
2015		strcat(buf, "|VSYSTEM");
2016	if (vp->v_flag & VXLOCK)
2017		strcat(buf, "|VXLOCK");
2018	if (vp->v_flag & VXWANT)
2019		strcat(buf, "|VXWANT");
2020	if (vp->v_flag & VBWAIT)
2021		strcat(buf, "|VBWAIT");
2022	if (vp->v_flag & VALIASED)
2023		strcat(buf, "|VALIASED");
2024	if (vp->v_flag & VDOOMED)
2025		strcat(buf, "|VDOOMED");
2026	if (vp->v_flag & VFREE)
2027		strcat(buf, "|VFREE");
2028	if (vp->v_flag & VOBJBUF)
2029		strcat(buf, "|VOBJBUF");
2030	if (buf[0] != '\0')
2031		printf(" flags (%s)", &buf[1]);
2032	if (vp->v_data == NULL) {
2033		printf("\n");
2034	} else {
2035		printf("\n\t");
2036		VOP_PRINT(vp);
2037	}
2038}
2039
2040#ifdef DDB
2041#include <ddb/ddb.h>
2042/*
2043 * List all of the locked vnodes in the system.
2044 * Called when debugging the kernel.
2045 */
2046DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
2047{
2048	struct proc *p = curproc;	/* XXX */
2049	struct mount *mp, *nmp;
2050	struct vnode *vp;
2051
2052	printf("Locked vnodes\n");
2053	simple_lock(&mountlist_slock);
2054	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
2055		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
2056			nmp = mp->mnt_list.cqe_next;
2057			continue;
2058		}
2059		for (vp = mp->mnt_vnodelist.lh_first;
2060		     vp != NULL;
2061		     vp = vp->v_mntvnodes.le_next) {
2062			if (VOP_ISLOCKED(vp))
2063				vprint((char *)0, vp);
2064		}
2065		simple_lock(&mountlist_slock);
2066		nmp = mp->mnt_list.cqe_next;
2067		vfs_unbusy(mp, p);
2068	}
2069	simple_unlock(&mountlist_slock);
2070}
2071#endif
2072
2073/*
2074 * Top level filesystem related information gathering.
2075 */
2076static int	sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS);
2077
2078static int
2079vfs_sysctl SYSCTL_HANDLER_ARGS
2080{
2081	int *name = (int *)arg1 - 1;	/* XXX */
2082	u_int namelen = arg2 + 1;	/* XXX */
2083	struct vfsconf *vfsp;
2084
2085#if 1 || defined(COMPAT_PRELITE2)
2086	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2087	if (namelen == 1)
2088		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2089#endif
2090
2091#ifdef notyet
2092	/* all sysctl names at this level are at least name and field */
2093	if (namelen < 2)
2094		return (ENOTDIR);		/* overloaded */
2095	if (name[0] != VFS_GENERIC) {
2096		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2097			if (vfsp->vfc_typenum == name[0])
2098				break;
2099		if (vfsp == NULL)
2100			return (EOPNOTSUPP);
2101		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
2102		    oldp, oldlenp, newp, newlen, p));
2103	}
2104#endif
2105	switch (name[1]) {
2106	case VFS_MAXTYPENUM:
2107		if (namelen != 2)
2108			return (ENOTDIR);
2109		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2110	case VFS_CONF:
2111		if (namelen != 3)
2112			return (ENOTDIR);	/* overloaded */
2113		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2114			if (vfsp->vfc_typenum == name[2])
2115				break;
2116		if (vfsp == NULL)
2117			return (EOPNOTSUPP);
2118		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
2119	}
2120	return (EOPNOTSUPP);
2121}
2122
2123SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
2124	"Generic filesystem");
2125
2126#if 1 || defined(COMPAT_PRELITE2)
2127
2128static int
2129sysctl_ovfs_conf SYSCTL_HANDLER_ARGS
2130{
2131	int error;
2132	struct vfsconf *vfsp;
2133	struct ovfsconf ovfs;
2134
2135	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
2136		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
2137		strcpy(ovfs.vfc_name, vfsp->vfc_name);
2138		ovfs.vfc_index = vfsp->vfc_typenum;
2139		ovfs.vfc_refcount = vfsp->vfc_refcount;
2140		ovfs.vfc_flags = vfsp->vfc_flags;
2141		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2142		if (error)
2143			return error;
2144	}
2145	return 0;
2146}
2147
2148#endif /* 1 || COMPAT_PRELITE2 */
2149
2150#if 0
2151#define KINFO_VNODESLOP	10
2152/*
2153 * Dump vnode list (via sysctl).
2154 * Copyout address of vnode followed by vnode.
2155 */
2156/* ARGSUSED */
2157static int
2158sysctl_vnode SYSCTL_HANDLER_ARGS
2159{
2160	struct proc *p = curproc;	/* XXX */
2161	struct mount *mp, *nmp;
2162	struct vnode *nvp, *vp;
2163	int error;
2164
2165#define VPTRSZ	sizeof (struct vnode *)
2166#define VNODESZ	sizeof (struct vnode)
2167
2168	req->lock = 0;
2169	if (!req->oldptr) /* Make an estimate */
2170		return (SYSCTL_OUT(req, 0,
2171			(numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
2172
2173	simple_lock(&mountlist_slock);
2174	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
2175		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
2176			nmp = mp->mnt_list.cqe_next;
2177			continue;
2178		}
2179again:
2180		simple_lock(&mntvnode_slock);
2181		for (vp = mp->mnt_vnodelist.lh_first;
2182		     vp != NULL;
2183		     vp = nvp) {
2184			/*
2185			 * Check that the vp is still associated with
2186			 * this filesystem.  RACE: could have been
2187			 * recycled onto the same filesystem.
2188			 */
2189			if (vp->v_mount != mp) {
2190				simple_unlock(&mntvnode_slock);
2191				goto again;
2192			}
2193			nvp = vp->v_mntvnodes.le_next;
2194			simple_unlock(&mntvnode_slock);
2195			if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
2196			    (error = SYSCTL_OUT(req, vp, VNODESZ)))
2197				return (error);
2198			simple_lock(&mntvnode_slock);
2199		}
2200		simple_unlock(&mntvnode_slock);
2201		simple_lock(&mountlist_slock);
2202		nmp = mp->mnt_list.cqe_next;
2203		vfs_unbusy(mp, p);
2204	}
2205	simple_unlock(&mountlist_slock);
2206
2207	return (0);
2208}
2209#endif
2210
2211/*
2212 * XXX
2213 * Exporting the vnode list on large systems causes them to crash.
2214 * Exporting the vnode list on medium systems causes sysctl to coredump.
2215 */
2216#if 0
2217SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
2218	0, 0, sysctl_vnode, "S,vnode", "");
2219#endif
2220
2221/*
2222 * Check to see if a filesystem is mounted on a block device.
2223 */
2224int
2225vfs_mountedon(vp)
2226	struct vnode *vp;
2227{
2228	struct vnode *vq;
2229	int error = 0;
2230
2231	if (vp->v_specmountpoint != NULL)
2232		return (EBUSY);
2233	if (vp->v_flag & VALIASED) {
2234		simple_lock(&spechash_slock);
2235		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2236			if (vq->v_rdev != vp->v_rdev ||
2237			    vq->v_type != vp->v_type)
2238				continue;
2239			if (vq->v_specmountpoint != NULL) {
2240				error = EBUSY;
2241				break;
2242			}
2243		}
2244		simple_unlock(&spechash_slock);
2245	}
2246	return (error);
2247}
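
/*
 * Illustrative sketch (hypothetical context; "devvp" is assumed to be the
 * block device vnode being mounted): a filesystem's mount routine
 * typically rejects a device that already has a filesystem mounted on it.
 */
#if 0
	if ((error = vfs_mountedon(devvp)) != 0)
		return (error);		/* EBUSY */
#endif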
2248
2249/*
2250 * Unmount all filesystems. The list is traversed in reverse order
2251 * of mounting to avoid dependencies.
2252 */
2253void
2254vfs_unmountall()
2255{
2256	struct mount *mp, *nmp;
2257	struct proc *p;
2258	int error;
2259
2260	if (curproc != NULL)
2261		p = curproc;
2262	else
2263		p = initproc;	/* XXX XXX should this be proc0? */
2264	/*
2265	 * Since this only runs when rebooting, it is not interlocked.
2266	 */
2267	for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
2268		nmp = mp->mnt_list.cqe_prev;
2269		error = dounmount(mp, MNT_FORCE, p);
2270		if (error) {
2271			printf("unmount of %s failed (",
2272			    mp->mnt_stat.f_mntonname);
2273			if (error == EBUSY)
2274				printf("BUSY)\n");
2275			else
2276				printf("%d)\n", error);
2277		}
2278	}
2279}
2280
2281/*
2282 * Build hash lists of net addresses and hang them off the mount point.
2283 * Called by ufs_mount() to set up the lists of export addresses.
2284 */
2285static int
2286vfs_hang_addrlist(mp, nep, argp)
2287	struct mount *mp;
2288	struct netexport *nep;
2289	struct export_args *argp;
2290{
2291	register struct netcred *np;
2292	register struct radix_node_head *rnh;
2293	register int i;
2294	struct radix_node *rn;
2295	struct sockaddr *saddr, *smask = 0;
2296	struct domain *dom;
2297	int error;
2298
2299	if (argp->ex_addrlen == 0) {
2300		if (mp->mnt_flag & MNT_DEFEXPORTED)
2301			return (EPERM);
2302		np = &nep->ne_defexported;
2303		np->netc_exflags = argp->ex_flags;
2304		np->netc_anon = argp->ex_anon;
2305		np->netc_anon.cr_ref = 1;
2306		mp->mnt_flag |= MNT_DEFEXPORTED;
2307		return (0);
2308	}
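	/*
	 * The netcred, the export address, and the optional mask are carved
	 * out of a single allocation laid out as
	 * [struct netcred][ex_addrlen bytes][ex_masklen bytes]; saddr and
	 * smask are derived from np by pointer arithmetic below.
	 */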
2309	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
2310	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
2311	bzero((caddr_t) np, i);
2312	saddr = (struct sockaddr *) (np + 1);
2313	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
2314		goto out;
2315	if (saddr->sa_len > argp->ex_addrlen)
2316		saddr->sa_len = argp->ex_addrlen;
2317	if (argp->ex_masklen) {
2318		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
2319		error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
2320		if (error)
2321			goto out;
2322		if (smask->sa_len > argp->ex_masklen)
2323			smask->sa_len = argp->ex_masklen;
2324	}
2325	i = saddr->sa_family;
2326	if ((rnh = nep->ne_rtable[i]) == 0) {
2327		/*
2328		 * It seems silly to initialize every AF when most are not
2329		 * used, so do it on demand here.
2330		 */
2331		for (dom = domains; dom; dom = dom->dom_next)
2332			if (dom->dom_family == i && dom->dom_rtattach) {
2333				dom->dom_rtattach((void **) &nep->ne_rtable[i],
2334				    dom->dom_rtoffset);
2335				break;
2336			}
2337		if ((rnh = nep->ne_rtable[i]) == 0) {
2338			error = ENOBUFS;
2339			goto out;
2340		}
2341	}
2342	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
2343	    np->netc_rnodes);
2344	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
2345		error = EPERM;
2346		goto out;
2347	}
2348	np->netc_exflags = argp->ex_flags;
2349	np->netc_anon = argp->ex_anon;
2350	np->netc_anon.cr_ref = 1;
2351	return (0);
2352out:
2353	free(np, M_NETADDR);
2354	return (error);
2355}
2356
2357/* ARGSUSED */
2358static int
2359vfs_free_netcred(rn, w)
2360	struct radix_node *rn;
2361	void *w;
2362{
2363	register struct radix_node_head *rnh = (struct radix_node_head *) w;
2364
2365	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
2366	free((caddr_t) rn, M_NETADDR);
2367	return (0);
2368}
2369
2370/*
2371 * Free the net address hash lists that are hanging off the mount points.
2372 */
2373static void
2374vfs_free_addrlist(nep)
2375	struct netexport *nep;
2376{
2377	register int i;
2378	register struct radix_node_head *rnh;
2379
2380	for (i = 0; i <= AF_MAX; i++)
2381		if ((rnh = nep->ne_rtable[i])) {
2382			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
2383			    (caddr_t) rnh);
2384			free((caddr_t) rnh, M_RTABLE);
2385			nep->ne_rtable[i] = 0;
2386		}
2387}
2388
2389int
2390vfs_export(mp, nep, argp)
2391	struct mount *mp;
2392	struct netexport *nep;
2393	struct export_args *argp;
2394{
2395	int error;
2396
2397	if (argp->ex_flags & MNT_DELEXPORT) {
2398		if (mp->mnt_flag & MNT_EXPUBLIC) {
2399			vfs_setpublicfs(NULL, NULL, NULL);
2400			mp->mnt_flag &= ~MNT_EXPUBLIC;
2401		}
2402		vfs_free_addrlist(nep);
2403		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
2404	}
2405	if (argp->ex_flags & MNT_EXPORTED) {
2406		if (argp->ex_flags & MNT_EXPUBLIC) {
2407			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
2408				return (error);
2409			mp->mnt_flag |= MNT_EXPUBLIC;
2410		}
2411		if ((error = vfs_hang_addrlist(mp, nep, argp)))
2412			return (error);
2413		mp->mnt_flag |= MNT_EXPORTED;
2414	}
2415	return (0);
2416}
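
/*
 * Illustrative sketch (hypothetical names such as "ump" and "args"): a
 * filesystem's mount routine typically calls vfs_export() when an export
 * update is requested, handing in its private netexport structure and the
 * export_args copied in from the mount(2) caller.
 */
#if 0
	/* inside a filesystem's xxx_mount() while handling MNT_UPDATE: */
	if (args.fspec == NULL)
		return (vfs_export(mp, &ump->um_export, &args.export));
#endif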
2417
2418
2419/*
2420 * Set the publicly exported filesystem (WebNFS). Currently, only
2421 * one public filesystem is possible in the spec (RFC 2054 and RFC 2055).
2422 */
2423int
2424vfs_setpublicfs(mp, nep, argp)
2425	struct mount *mp;
2426	struct netexport *nep;
2427	struct export_args *argp;
2428{
2429	int error;
2430	struct vnode *rvp;
2431	char *cp;
2432
2433	/*
2434	 * mp == NULL -> invalidate the current info, the FS is
2435	 * no longer exported. May be called from either vfs_export
2436	 * or unmount, so check if it hasn't already been done.
2437	 */
2438	if (mp == NULL) {
2439		if (nfs_pub.np_valid) {
2440			nfs_pub.np_valid = 0;
2441			if (nfs_pub.np_index != NULL) {
2442				FREE(nfs_pub.np_index, M_TEMP);
2443				nfs_pub.np_index = NULL;
2444			}
2445		}
2446		return (0);
2447	}
2448
2449	/*
2450	 * Only one allowed at a time.
2451	 */
2452	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
2453		return (EBUSY);
2454
2455	/*
2456	 * Get real filehandle for root of exported FS.
2457	 */
2458	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
2459	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
2460
2461	if ((error = VFS_ROOT(mp, &rvp)))
2462		return (error);
2463
2464	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
2465		return (error);
2466
2467	vput(rvp);
2468
2469	/*
2470	 * If an indexfile was specified, pull it in.
2471	 */
2472	if (argp->ex_indexfile != NULL) {
2473		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
2474		    M_WAITOK);
2475		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
2476		    MAXNAMLEN, (size_t *)0);
2477		if (!error) {
2478			/*
2479			 * Check for illegal filenames.
2480			 */
2481			for (cp = nfs_pub.np_index; *cp; cp++) {
2482				if (*cp == '/') {
2483					error = EINVAL;
2484					break;
2485				}
2486			}
2487		}
2488		if (error) {
2489			FREE(nfs_pub.np_index, M_TEMP);
2490			return (error);
2491		}
2492	}
2493
2494	nfs_pub.np_mount = mp;
2495	nfs_pub.np_valid = 1;
2496	return (0);
2497}
2498
2499struct netcred *
2500vfs_export_lookup(mp, nep, nam)
2501	register struct mount *mp;
2502	struct netexport *nep;
2503	struct sockaddr *nam;
2504{
2505	register struct netcred *np;
2506	register struct radix_node_head *rnh;
2507	struct sockaddr *saddr;
2508
2509	np = NULL;
2510	if (mp->mnt_flag & MNT_EXPORTED) {
2511		/*
2512		 * Lookup in the export list first.
2513		 */
2514		if (nam != NULL) {
2515			saddr = nam;
2516			rnh = nep->ne_rtable[saddr->sa_family];
2517			if (rnh != NULL) {
2518				np = (struct netcred *)
2519					(*rnh->rnh_matchaddr)((caddr_t)saddr,
2520							      rnh);
2521				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
2522					np = NULL;
2523			}
2524		}
2525		/*
2526		 * If no address match, use the default if it exists.
2527		 */
2528		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
2529			np = &nep->ne_defexported;
2530	}
2531	return (np);
2532}
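
/*
 * Illustrative sketch (hypothetical names): the NFS server path uses
 * vfs_export_lookup() to map a client's socket address to the credentials
 * and flags the filesystem was exported with, rejecting the request when
 * no matching export exists.
 */
#if 0
	np = vfs_export_lookup(mp, &ump->um_export, nam);
	if (np == NULL)
		return (ESTALE);	/* address not covered by any export */
	*exflagsp = np->netc_exflags;
	*credanonp = &np->netc_anon;
#endif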
2533
2534/*
2535 * Perform msync on all vnodes under a mount point.
2536 * The mount point must be locked.
2537 */
2538void
2539vfs_msync(struct mount *mp, int flags) {
2540	struct vnode *vp, *nvp;
2541	struct vm_object *obj;
2542	int anyio, tries;
2543
2544	tries = 5;
2545loop:
2546	anyio = 0;
2547	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {
2548
2549		nvp = vp->v_mntvnodes.le_next;
2550
2551		if (vp->v_mount != mp) {
2552			goto loop;
2553		}
2554
2555		if (vp->v_flag & VXLOCK)	/* XXX: what if MNT_WAIT? */
2556			continue;
2557
2558		if (flags != MNT_WAIT) {
2559			obj = vp->v_object;
2560			if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
2561				continue;
2562			if (VOP_ISLOCKED(vp))
2563				continue;
2564		}
2565
2566		simple_lock(&vp->v_interlock);
2567		if (vp->v_object &&
2568		   (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
2569			if (!vget(vp,
2570				LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
2571				if (vp->v_object) {
2572					vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : 0);
2573					anyio = 1;
2574				}
2575				vput(vp);
2576			}
2577		} else {
2578			simple_unlock(&vp->v_interlock);
2579		}
2580	}
2581	if (anyio && (--tries > 0))
2582		goto loop;
2583}
2584
2585/*
2586 * Create the VM object needed for VMIO and mmap support.  This
2587 * is done for all VREG files in the system.  Some filesystems can
2588 * take advantage of the additional metadata buffering capability of
2589 * the VMIO code by making the device node VMIO-backed as well.
2590 *
2591 * vp must be locked when vfs_object_create is called.
2592 */
2593int
2594vfs_object_create(vp, p, cred)
2595	struct vnode *vp;
2596	struct proc *p;
2597	struct ucred *cred;
2598{
2599	struct vattr vat;
2600	vm_object_t object;
2601	int error = 0;
2602
2603	if ((vp->v_type != VREG) && (vp->v_type != VBLK))
2604		return 0;
2605
2606retry:
2607	if ((object = vp->v_object) == NULL) {
2608		if (vp->v_type == VREG) {
2609			if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
2610				goto retn;
2611			object = vnode_pager_alloc(vp, vat.va_size, 0, 0);
2612		} else if (bdevsw(vp->v_rdev) != NULL) {
2613			/*
2614			 * This simply allocates the biggest object possible
2615			 * for a VBLK vnode.  This should be fixed, but doesn't
2616			 * cause any problems (yet).
2617			 */
2618			object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0);
2619		} else {
2620			goto retn;
2621		}
2622		/*
2623		 * Dereference the reference we just created.  This assumes
2624		 * that the object is associated with the vp.
2625		 */
2626		object->ref_count--;
2627		vp->v_usecount--;
2628	} else {
2629		if (object->flags & OBJ_DEAD) {
2630			VOP_UNLOCK(vp, 0, p);
2631			tsleep(object, PVM, "vodead", 0);
2632			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
2633			goto retry;
2634		}
2635	}
2636
2637	KASSERT(vp->v_object != NULL, ("vfs_object_create: NULL object"));
2638	vp->v_flag |= VOBJBUF;
2639
2640retn:
2641	return error;
2642}
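
/*
 * Illustrative sketch (hypothetical context): callers such as the open
 * path invoke vfs_object_create() with the vnode locked so that a regular
 * file has its VM object before any I/O is done through it.
 */
#if 0
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
	error = vfs_object_create(vp, p, cred);
	VOP_UNLOCK(vp, 0, p);
#endif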
2643
2644static void
2645vfree(vp)
2646	struct vnode *vp;
2647{
2648	int s;
2649
2650	s = splbio();
2651	simple_lock(&vnode_free_list_slock);
2652	if (vp->v_flag & VTBFREE) {
2653		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
2654		vp->v_flag &= ~VTBFREE;
2655	}
2656	if (vp->v_flag & VAGE) {
2657		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2658	} else {
2659		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
2660	}
2661	freevnodes++;
2662	simple_unlock(&vnode_free_list_slock);
2663	vp->v_flag &= ~VAGE;
2664	vp->v_flag |= VFREE;
2665	splx(s);
2666}
2667
2668void
2669vbusy(vp)
2670	struct vnode *vp;
2671{
2672	int s;
2673
2674	s = splbio();
2675	simple_lock(&vnode_free_list_slock);
2676	if (vp->v_flag & VTBFREE) {
2677		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
2678		vp->v_flag &= ~VTBFREE;
2679	} else {
2680		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2681		freevnodes--;
2682	}
2683	simple_unlock(&vnode_free_list_slock);
2684	vp->v_flag &= ~(VFREE|VAGE);
2685	splx(s);
2686}
2687
2688/*
2689 * Record a process's interest in events which might happen to
2690 * a vnode.  Because poll uses the historic select-style interface
2691 * internally, this routine serves as both the ``check for any
2692 * pending events'' and the ``record my interest in future events''
2693 * functions.  (These are done together, while the lock is held,
2694 * to avoid race conditions.)
2695 */
2696int
2697vn_pollrecord(vp, p, events)
2698	struct vnode *vp;
2699	struct proc *p;
2700	short events;
2701{
2702	simple_lock(&vp->v_pollinfo.vpi_lock);
2703	if (vp->v_pollinfo.vpi_revents & events) {
2704		/*
2705		 * This leaves events we are not interested
2706		 * in available for the other process which
2707		 * presumably had requested them
2708		 * (otherwise they would never have been
2709		 * recorded).
2710		 */
2711		events &= vp->v_pollinfo.vpi_revents;
2712		vp->v_pollinfo.vpi_revents &= ~events;
2713
2714		simple_unlock(&vp->v_pollinfo.vpi_lock);
2715		return events;
2716	}
2717	vp->v_pollinfo.vpi_events |= events;
2718	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
2719	simple_unlock(&vp->v_pollinfo.vpi_lock);
2720	return 0;
2721}
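
/*
 * Illustrative sketch (hypothetical predicate and context): a filesystem's
 * VOP_POLL implementation returns immediately when it can already satisfy
 * the request and otherwise records the caller's interest with
 * vn_pollrecord().
 */
#if 0
	if (data_is_ready(vp))		/* hypothetical readiness check */
		return (events & (POLLIN | POLLRDNORM));
	return (vn_pollrecord(vp, ap->a_p, events));
#endif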
2722
2723/*
2724 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
2725 * it is possible for us to miss an event due to race conditions, but
2726 * that condition is expected to be rare, so for the moment it is the
2727 * preferred interface.
2728 */
2729void
2730vn_pollevent(vp, events)
2731	struct vnode *vp;
2732	short events;
2733{
2734	simple_lock(&vp->v_pollinfo.vpi_lock);
2735	if (vp->v_pollinfo.vpi_events & events) {
2736		/*
2737		 * We clear vpi_events so that we don't
2738		 * call selwakeup() twice if two events are
2739		 * posted before the polling process(es) is
2740		 * awakened.  This also ensures that we take at
2741		 * most one selwakeup() if the polling process
2742		 * is no longer interested.  However, it does
2743		 * mean that only one event can be noticed at
2744		 * a time.  (Perhaps we should only clear those
2745		 * event bits which we note?) XXX
2746		 */
2747		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
2748		vp->v_pollinfo.vpi_revents |= events;
2749		selwakeup(&vp->v_pollinfo.vpi_selinfo);
2750	}
2751	simple_unlock(&vp->v_pollinfo.vpi_lock);
2752}
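
/*
 * Illustrative sketch: the producer side reports state changes through the
 * VN_POLLEVENT() macro, which only drops into vn_pollevent() when someone
 * has actually registered interest, e.g. after new data becomes readable:
 */
#if 0
	VN_POLLEVENT(vp, POLLIN | POLLRDNORM);
#endif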
2753
2754/*
2755 * Wake up anyone polling on vp because it is being revoked.
2756 * This depends on dead_poll() returning POLLHUP for correct
2757 * behavior.
2758 */
2759void
2760vn_pollgone(vp)
2761	struct vnode *vp;
2762{
2763	simple_lock(&vp->v_pollinfo.vpi_lock);
2764	if (vp->v_pollinfo.vpi_events) {
2765		vp->v_pollinfo.vpi_events = 0;
2766		selwakeup(&vp->v_pollinfo.vpi_selinfo);
2767	}
2768	simple_unlock(&vp->v_pollinfo.vpi_lock);
2769}
2770
2771
2772
2773/*
2774 * Routine to create and manage a filesystem syncer vnode.
2775 */
2776#define sync_close ((int (*) __P((struct  vop_close_args *)))nullop)
2777static int	sync_fsync __P((struct  vop_fsync_args *));
2778static int	sync_inactive __P((struct  vop_inactive_args *));
2779static int	sync_reclaim  __P((struct  vop_reclaim_args *));
2780#define sync_lock ((int (*) __P((struct  vop_lock_args *)))vop_nolock)
2781#define sync_unlock ((int (*) __P((struct  vop_unlock_args *)))vop_nounlock)
2782static int	sync_print __P((struct vop_print_args *));
2783#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)
2784
2785static vop_t **sync_vnodeop_p;
2786static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
2787	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
2788	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
2789	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
2790	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
2791	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
2792	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
2793	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
2794	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
2795	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
2796	{ NULL, NULL }
2797};
2798static struct vnodeopv_desc sync_vnodeop_opv_desc =
2799	{ &sync_vnodeop_p, sync_vnodeop_entries };
2800
2801VNODEOP_SET(sync_vnodeop_opv_desc);
2802
2803/*
2804 * Create a new filesystem syncer vnode for the specified mount point.
2805 */
2806int
2807vfs_allocate_syncvnode(mp)
2808	struct mount *mp;
2809{
2810	struct vnode *vp;
2811	static long start, incr, next;
2812	int error;
2813
2814	/* Allocate a new vnode */
2815	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
2816		mp->mnt_syncer = NULL;
2817		return (error);
2818	}
2819	vp->v_type = VNON;
2820	/*
2821	 * Place the vnode onto the syncer worklist. We attempt to
2822	 * scatter them about on the list so that they will go off
2823	 * at evenly distributed times even if all the filesystems
2824	 * are mounted at once.
2825	 */
2826	next += incr;
2827	if (next == 0 || next > syncer_maxdelay) {
2828		start /= 2;
2829		incr /= 2;
2830		if (start == 0) {
2831			start = syncer_maxdelay / 2;
2832			incr = syncer_maxdelay;
2833		}
2834		next = start;
2835	}
2836	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
2837	mp->mnt_syncer = vp;
2838	return (0);
2839}
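
/*
 * Worked example (assuming the default syncer_maxdelay of 32 and
 * syncdelay of 30): successive calls above produce next = 16, 8, 24, 4,
 * 12, 20, 28, 2, ... so syncer vnodes land in well-scattered worklist
 * slots even when many filesystems are mounted back to back.
 */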
2840
2841/*
2842 * Do a lazy sync of the filesystem.
2843 */
2844static int
2845sync_fsync(ap)
2846	struct vop_fsync_args /* {
2847		struct vnode *a_vp;
2848		struct ucred *a_cred;
2849		int a_waitfor;
2850		struct proc *a_p;
2851	} */ *ap;
2852{
2853	struct vnode *syncvp = ap->a_vp;
2854	struct mount *mp = syncvp->v_mount;
2855	struct proc *p = ap->a_p;
2856	int asyncflag;
2857
2858	/*
2859	 * We only need to do something if this is a lazy evaluation.
2860	 */
2861	if (ap->a_waitfor != MNT_LAZY)
2862		return (0);
2863
2864	/*
2865	 * Move ourselves to the back of the sync list.
2866	 */
2867	vn_syncer_add_to_worklist(syncvp, syncdelay);
2868
2869	/*
2870	 * Walk the list of vnodes pushing all that are dirty and
2871	 * not already on the sync list.
2872	 */
2873	simple_lock(&mountlist_slock);
2874	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
2875		simple_unlock(&mountlist_slock);
2876		return (0);
2877	}
2878	asyncflag = mp->mnt_flag & MNT_ASYNC;
2879	mp->mnt_flag &= ~MNT_ASYNC;
2880	vfs_msync(mp, MNT_NOWAIT);
2881	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
2882	if (asyncflag)
2883		mp->mnt_flag |= MNT_ASYNC;
2884	vfs_unbusy(mp, p);
2885	return (0);
2886}
2887
2888/*
2889 * The syncer vnode is no longer referenced.
2890 */
2891static int
2892sync_inactive(ap)
2893	struct vop_inactive_args /* {
2894		struct vnode *a_vp;
2895		struct proc *a_p;
2896	} */ *ap;
2897{
2898
2899	vgone(ap->a_vp);
2900	return (0);
2901}
2902
2903/*
2904 * The syncer vnode is no longer needed and is being decommissioned.
2905 *
2906 * Modifications to the worklist must be protected at splbio().
2907 */
2908static int
2909sync_reclaim(ap)
2910	struct vop_reclaim_args /* {
2911		struct vnode *a_vp;
2912	} */ *ap;
2913{
2914	struct vnode *vp = ap->a_vp;
2915	int s;
2916
2917	s = splbio();
2918	vp->v_mount->mnt_syncer = NULL;
2919	if (vp->v_flag & VONWORKLST) {
2920		LIST_REMOVE(vp, v_synclist);
2921		vp->v_flag &= ~VONWORKLST;
2922	}
2923	splx(s);
2924
2925	return (0);
2926}
2927
2928/*
2929 * Print out a syncer vnode.
2930 */
2931static int
2932sync_print(ap)
2933	struct vop_print_args /* {
2934		struct vnode *a_vp;
2935	} */ *ap;
2936{
2937	struct vnode *vp = ap->a_vp;
2938
2939	printf("syncer vnode");
2940	if (vp->v_vnlock != NULL)
2941		lockmgr_printinfo(vp->v_vnlock);
2942	printf("\n");
2943	return (0);
2944}
2945