vfs_subr.c revision 49101
1/*
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
39 * $Id: vfs_subr.c,v 1.213 1999/07/20 09:47:44 phk Exp $
40 */
41
42/*
43 * External virtual filesystem routines
44 */
45#include "opt_ddb.h"
46
47#include <sys/param.h>
48#include <sys/systm.h>
49#include <sys/conf.h>
50#include <sys/fcntl.h>
51#include <sys/kernel.h>
52#include <sys/proc.h>
53#include <sys/kthread.h>
54#include <sys/malloc.h>
55#include <sys/mount.h>
56#include <sys/socket.h>
57#include <sys/vnode.h>
58#include <sys/stat.h>
59#include <sys/buf.h>
60#include <sys/domain.h>
61#include <sys/dirent.h>
62#include <sys/vmmeter.h>
63
64#include <machine/limits.h>
65
66#include <vm/vm.h>
67#include <vm/vm_param.h>
68#include <vm/vm_prot.h>
69#include <vm/vm_object.h>
70#include <vm/vm_extern.h>
71#include <vm/pmap.h>
72#include <vm/vm_map.h>
73#include <vm/vm_page.h>
74#include <vm/vm_pager.h>
75#include <vm/vnode_pager.h>
76#include <vm/vm_zone.h>
77#include <sys/sysctl.h>
78
79#include <miscfs/specfs/specdev.h>
80
81static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
82
83static struct vnode *checkalias2 __P((struct vnode *nvp, dev_t dev, struct mount *mp));
84static void	insmntque __P((struct vnode *vp, struct mount *mp));
85static void	vclean __P((struct vnode *vp, int flags, struct proc *p));
86static void	vfree __P((struct vnode *));
87static void	vgonel __P((struct vnode *vp, struct proc *p));
88static unsigned long	numvnodes;
89SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
90
91enum vtype iftovt_tab[16] = {
92	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
93	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
94};
95int vttoif_tab[9] = {
96	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
97	S_IFSOCK, S_IFIFO, S_IFMT,
98};
99
100static TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */
101struct tobefreelist vnode_tobefree_list;	/* vnodes queued to be moved to the free list */
102
103static u_long wantfreevnodes = 25;
104SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
105static u_long freevnodes = 0;
106SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
107
108static int reassignbufcalls;
109SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
110static int reassignbufloops;
111SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
112static int reassignbufsortgood;
113SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
114static int reassignbufsortbad;
115SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
116static int reassignbufmethod = 1;
117SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
118
119#ifdef ENABLE_VFS_IOOPT
120int vfs_ioopt = 0;
121SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
122#endif
123
124struct mntlist mountlist;	/* mounted filesystem list */
125struct simplelock mountlist_slock;
126struct simplelock mntvnode_slock;
127int	nfs_mount_type = -1;
128#ifndef NULL_SIMPLELOCKS
129static struct simplelock mntid_slock;
130static struct simplelock vnode_free_list_slock;
131static struct simplelock spechash_slock;
132#endif
133struct nfs_public nfs_pub;	/* publicly exported FS */
134static vm_zone_t vnode_zone;
135
136/*
137 * The workitem queue.
138 */
139#define SYNCER_MAXDELAY		32
140static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
141time_t syncdelay = 30;		/* max time to delay syncing data */
142time_t filedelay = 30;		/* time to delay syncing files */
143SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
144time_t dirdelay = 29;		/* time to delay syncing directories */
145SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
146time_t metadelay = 28;		/* time to delay syncing metadata */
147SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
148static int rushjob;			/* number of slots to run ASAP */
149static int stat_rush_requests;	/* number of times I/O was sped up */
150SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
151
152static int syncer_delayno = 0;
153static long syncer_mask;
154LIST_HEAD(synclist, vnode);
155static struct synclist *syncer_workitem_pending;
156
157int desiredvnodes;
158SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
159    &desiredvnodes, 0, "Maximum number of vnodes");
160
161static void	vfs_free_addrlist __P((struct netexport *nep));
162static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
163static int	vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
164				       struct export_args *argp));
165
166/*
167 * Initialize the vnode management data structures.
168 */
169void
170vntblinit()
171{
172
173	desiredvnodes = maxproc + cnt.v_page_count / 4;
174	simple_lock_init(&mntvnode_slock);
175	simple_lock_init(&mntid_slock);
176	simple_lock_init(&spechash_slock);
177	TAILQ_INIT(&vnode_free_list);
178	TAILQ_INIT(&vnode_tobefree_list);
179	simple_lock_init(&vnode_free_list_slock);
180	CIRCLEQ_INIT(&mountlist);
181	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
182	/*
183	 * Initialize the filesystem syncer.
184	 */
185	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
186		&syncer_mask);
187	syncer_maxdelay = syncer_mask + 1;
188}
189
190/*
191 * Mark a mount point as busy. Used to synchronize access and to delay
192 * unmounting. Interlock is not released on failure.
193 */
194int
195vfs_busy(mp, flags, interlkp, p)
196	struct mount *mp;
197	int flags;
198	struct simplelock *interlkp;
199	struct proc *p;
200{
201	int lkflags;
202
203	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
204		if (flags & LK_NOWAIT)
205			return (ENOENT);
206		mp->mnt_kern_flag |= MNTK_MWAIT;
207		if (interlkp) {
208			simple_unlock(interlkp);
209		}
210		/*
211		 * Since all busy locks are shared except the exclusive
212		 * lock granted when unmounting, the only place that a
213		 * wakeup needs to be done is at the release of the
214		 * exclusive lock at the end of dounmount.
215		 */
216		tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
217		if (interlkp) {
218			simple_lock(interlkp);
219		}
220		return (ENOENT);
221	}
222	lkflags = LK_SHARED | LK_NOPAUSE;
223	if (interlkp)
224		lkflags |= LK_INTERLOCK;
225	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
226		panic("vfs_busy: unexpected lock failure");
227	return (0);
228}
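
/*
 * Illustrative sketch (not compiled in): the usual way to walk the mount
 * list is to hold mountlist_slock, hand it to vfs_busy() as the interlock,
 * and pick up the next entry before unbusying the current one, as the
 * DDB "lockedvnodes" command later in this file does:
 *
 *	simple_lock(&mountlist_slock);
 *	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
 *		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
 *			nmp = mp->mnt_list.cqe_next;
 *			continue;
 *		}
 *		... operate on mp ...
 *		simple_lock(&mountlist_slock);
 *		nmp = mp->mnt_list.cqe_next;
 *		vfs_unbusy(mp, p);
 *	}
 *	simple_unlock(&mountlist_slock);
 */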
229
230/*
231 * Free a busy filesystem.
232 */
233void
234vfs_unbusy(mp, p)
235	struct mount *mp;
236	struct proc *p;
237{
238
239	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
240}
241
242/*
243 * Lookup a filesystem type, and if found allocate and initialize
244 * a mount structure for it.
245 *
246 * Devname is usually updated by mount(8) after booting.
247 */
248int
249vfs_rootmountalloc(fstypename, devname, mpp)
250	char *fstypename;
251	char *devname;
252	struct mount **mpp;
253{
254	struct proc *p = curproc;	/* XXX */
255	struct vfsconf *vfsp;
256	struct mount *mp;
257
258	if (fstypename == NULL)
259		return (ENODEV);
260	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
261		if (!strcmp(vfsp->vfc_name, fstypename))
262			break;
263	if (vfsp == NULL)
264		return (ENODEV);
265	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
266	bzero((char *)mp, (u_long)sizeof(struct mount));
267	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
268	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
269	LIST_INIT(&mp->mnt_vnodelist);
270	mp->mnt_vfc = vfsp;
271	mp->mnt_op = vfsp->vfc_vfsops;
272	mp->mnt_flag = MNT_RDONLY;
273	mp->mnt_vnodecovered = NULLVP;
274	vfsp->vfc_refcount++;
275	mp->mnt_stat.f_type = vfsp->vfc_typenum;
276	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
277	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
278	mp->mnt_stat.f_mntonname[0] = '/';
279	mp->mnt_stat.f_mntonname[1] = 0;
280	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
281	*mpp = mp;
282	return (0);
283}
284
285/*
286 * Find an appropriate filesystem to use for the root. If a filesystem
287 * has not been preselected, walk through the list of known filesystems
288 * trying those that have mountroot routines, and try them until one
289 * works or we have tried them all.
290 */
291#ifdef notdef	/* XXX JH */
292int
293lite2_vfs_mountroot()
294{
295	struct vfsconf *vfsp;
296	extern int (*lite2_mountroot) __P((void));
297	int error;
298
299	if (lite2_mountroot != NULL)
300		return ((*lite2_mountroot)());
301	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
302		if (vfsp->vfc_mountroot == NULL)
303			continue;
304		if ((error = (*vfsp->vfc_mountroot)()) == 0)
305			return (0);
306		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
307	}
308	return (ENODEV);
309}
310#endif
311
312/*
313 * Lookup a mount point by filesystem identifier.
314 */
315struct mount *
316vfs_getvfs(fsid)
317	fsid_t *fsid;
318{
319	register struct mount *mp;
320
321	simple_lock(&mountlist_slock);
322	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
323	    mp = mp->mnt_list.cqe_next) {
324		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
325		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
326			simple_unlock(&mountlist_slock);
327			return (mp);
328	    }
329	}
330	simple_unlock(&mountlist_slock);
331	return ((struct mount *) 0);
332}
333
334/*
335 * Get a new unique fsid
336 */
337void
338vfs_getnewfsid(mp)
339	struct mount *mp;
340{
341	static u_short xxxfs_mntid;
342
343	fsid_t tfsid;
344	int mtype;
345
346	simple_lock(&mntid_slock);
347	mtype = mp->mnt_vfc->vfc_typenum;
348	mp->mnt_stat.f_fsid.val[0] = makeudev(255, mtype);
349	mp->mnt_stat.f_fsid.val[1] = mtype;
350	if (xxxfs_mntid == 0)
351		++xxxfs_mntid;
352	tfsid.val[0] = makeudev(255, mtype + (xxxfs_mntid << 16));
353	tfsid.val[1] = mtype;
354	if (mountlist.cqh_first != (void *)&mountlist) {
355		while (vfs_getvfs(&tfsid)) {
356			xxxfs_mntid++;
357			tfsid.val[0] = makeudev(255,
358			    mtype + (xxxfs_mntid << 16));
359		}
360	}
361	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
362	simple_unlock(&mntid_slock);
363}
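
/*
 * Sketch of the encoding above (not compiled in): val[0] is a fake udev
 * built by makeudev() from major 255 and a minor combining the vfs type
 * with a rolling counter, and val[1] is the vfs type itself, so a later
 * statfs() fsid can be mapped back to its mount:
 *
 *	fsid_t fsid = mp->mnt_stat.f_fsid;
 *	struct mount *found = vfs_getvfs(&fsid);	(should yield mp)
 */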
364
365/*
366 * Set vnode attributes to VNOVAL
367 */
368void
369vattr_null(vap)
370	register struct vattr *vap;
371{
372
373	vap->va_type = VNON;
374	vap->va_size = VNOVAL;
375	vap->va_bytes = VNOVAL;
376	vap->va_mode = VNOVAL;
377	vap->va_nlink = VNOVAL;
378	vap->va_uid = VNOVAL;
379	vap->va_gid = VNOVAL;
380	vap->va_fsid = VNOVAL;
381	vap->va_fileid = VNOVAL;
382	vap->va_blocksize = VNOVAL;
383	vap->va_rdev = VNOVAL;
384	vap->va_atime.tv_sec = VNOVAL;
385	vap->va_atime.tv_nsec = VNOVAL;
386	vap->va_mtime.tv_sec = VNOVAL;
387	vap->va_mtime.tv_nsec = VNOVAL;
388	vap->va_ctime.tv_sec = VNOVAL;
389	vap->va_ctime.tv_nsec = VNOVAL;
390	vap->va_flags = VNOVAL;
391	vap->va_gen = VNOVAL;
392	vap->va_vaflags = 0;
393}
394
395/*
396 * Routines having to do with the management of the vnode table.
397 */
398extern vop_t **dead_vnodeop_p;
399
400/*
401 * Return the next vnode from the free list.
402 */
403int
404getnewvnode(tag, mp, vops, vpp)
405	enum vtagtype tag;
406	struct mount *mp;
407	vop_t **vops;
408	struct vnode **vpp;
409{
410	int s;
411	struct proc *p = curproc;	/* XXX */
412	struct vnode *vp, *tvp, *nvp;
413	vm_object_t object;
414	TAILQ_HEAD(freelst, vnode) vnode_tmp_list;
415
416	/*
417	 * We take the least recently used vnode from the freelist if
418	 * we can get it, it has no cached pages, and no namecache
419	 * entries refer to it.
420	 * Otherwise we allocate a new vnode.
421	 */
422
423	s = splbio();
424	simple_lock(&vnode_free_list_slock);
425	TAILQ_INIT(&vnode_tmp_list);
426
427	for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) {
428		nvp = TAILQ_NEXT(vp, v_freelist);
429		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
430		if (vp->v_flag & VAGE) {
431			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
432		} else {
433			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
434		}
435		vp->v_flag &= ~(VTBFREE|VAGE);
436		vp->v_flag |= VFREE;
437		if (vp->v_usecount)
438			panic("tobe free vnode isn't");
439		freevnodes++;
440	}
441
442	if (wantfreevnodes && freevnodes < wantfreevnodes) {
443		vp = NULL;
444	} else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
445		/*
446		 * XXX: this is only here to be backwards compatible
447		 */
448		vp = NULL;
449	} else {
450		for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) {
451			nvp = TAILQ_NEXT(vp, v_freelist);
452			if (!simple_lock_try(&vp->v_interlock))
453				continue;
454			if (vp->v_usecount)
455				panic("free vnode isn't");
456
457			object = vp->v_object;
458			if (object && (object->resident_page_count || object->ref_count)) {
459				printf("object inconsistent state: RPC: %d, RC: %d\n",
460					object->resident_page_count, object->ref_count);
461				/* Don't recycle if it's caching some pages */
462				TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
463				TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist);
464				continue;
465			} else if (LIST_FIRST(&vp->v_cache_src)) {
466				/* Don't recycle if active in the namecache */
467				simple_unlock(&vp->v_interlock);
468				continue;
469			} else {
470				break;
471			}
472		}
473	}
474
475	for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) {
476		nvp = TAILQ_NEXT(tvp, v_freelist);
477		TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist);
478		TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist);
479		simple_unlock(&tvp->v_interlock);
480	}
481
482	if (vp) {
483		vp->v_flag |= VDOOMED;
484		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
485		freevnodes--;
486		simple_unlock(&vnode_free_list_slock);
487		cache_purge(vp);
488		vp->v_lease = NULL;
489		if (vp->v_type != VBAD) {
490			vgonel(vp, p);
491		} else {
492			simple_unlock(&vp->v_interlock);
493		}
494
495#ifdef INVARIANTS
496		{
497			int s;
498
499			if (vp->v_data)
500				panic("cleaned vnode isn't");
501			s = splbio();
502			if (vp->v_numoutput)
503				panic("Clean vnode has pending I/O's");
504			splx(s);
505		}
506#endif
507		vp->v_flag = 0;
508		vp->v_lastr = 0;
509		vp->v_lastw = 0;
510		vp->v_lasta = 0;
511		vp->v_cstart = 0;
512		vp->v_clen = 0;
513		vp->v_socket = 0;
514		vp->v_writecount = 0;	/* XXX */
515		vp->v_maxio = 0;
516	} else {
517		simple_unlock(&vnode_free_list_slock);
518		vp = (struct vnode *) zalloc(vnode_zone);
519		bzero((char *) vp, sizeof *vp);
520		simple_lock_init(&vp->v_interlock);
521		vp->v_dd = vp;
522		cache_purge(vp);
523		LIST_INIT(&vp->v_cache_src);
524		TAILQ_INIT(&vp->v_cache_dst);
525		numvnodes++;
526	}
527
528	TAILQ_INIT(&vp->v_cleanblkhd);
529	TAILQ_INIT(&vp->v_dirtyblkhd);
530	vp->v_type = VNON;
531	vp->v_tag = tag;
532	vp->v_op = vops;
533	insmntque(vp, mp);
534	*vpp = vp;
535	vp->v_usecount = 1;
536	vp->v_data = 0;
537	splx(s);
538
539	vfs_object_create(vp, p, p->p_ucred);
540	return (0);
541}
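
/*
 * Illustrative use (sketch): bdevvp() below allocates an anonymous vnode
 * this way and then retypes it; a filesystem's own vnode-get path would
 * pass its tag, mount point and vop vector instead:
 *
 *	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
 *	if (error)
 *		return (error);
 */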
542
543/*
544 * Move a vnode from one mount queue to another.
545 */
546static void
547insmntque(vp, mp)
548	register struct vnode *vp;
549	register struct mount *mp;
550{
551
552	simple_lock(&mntvnode_slock);
553	/*
554	 * Delete from old mount point vnode list, if on one.
555	 */
556	if (vp->v_mount != NULL)
557		LIST_REMOVE(vp, v_mntvnodes);
558	/*
559	 * Insert into list of vnodes for the new mount point, if available.
560	 */
561	if ((vp->v_mount = mp) == NULL) {
562		simple_unlock(&mntvnode_slock);
563		return;
564	}
565	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
566	simple_unlock(&mntvnode_slock);
567}
568
569/*
570 * Update outstanding I/O count and do wakeup if requested.
571 */
572void
573vwakeup(bp)
574	register struct buf *bp;
575{
576	register struct vnode *vp;
577
578	bp->b_flags &= ~B_WRITEINPROG;
579	if ((vp = bp->b_vp)) {
580		vp->v_numoutput--;
581		if (vp->v_numoutput < 0)
582			panic("vwakeup: neg numoutput");
583		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
584			vp->v_flag &= ~VBWAIT;
585			wakeup((caddr_t) &vp->v_numoutput);
586		}
587	}
588}
589
590/*
591 * Flush out and invalidate all buffers associated with a vnode.
592 * Called with the underlying object locked.
593 */
594int
595vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
596	register struct vnode *vp;
597	int flags;
598	struct ucred *cred;
599	struct proc *p;
600	int slpflag, slptimeo;
601{
602	register struct buf *bp;
603	struct buf *nbp, *blist;
604	int s, error;
605	vm_object_t object;
606
607	if (flags & V_SAVE) {
608		s = splbio();
609		while (vp->v_numoutput) {
610			vp->v_flag |= VBWAIT;
611			error = tsleep((caddr_t)&vp->v_numoutput,
612			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
613			if (error) {
614				splx(s);
615				return (error);
616			}
617		}
618		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
619			splx(s);
620			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
621				return (error);
622			s = splbio();
623			if (vp->v_numoutput > 0 ||
624			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
625				panic("vinvalbuf: dirty bufs");
626		}
627		splx(s);
628  	}
629	s = splbio();
630	for (;;) {
631		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
632		if (!blist)
633			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
634		if (!blist)
635			break;
636
637		for (bp = blist; bp; bp = nbp) {
638			nbp = TAILQ_NEXT(bp, b_vnbufs);
639			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
640				error = BUF_TIMELOCK(bp,
641				    LK_EXCLUSIVE | LK_SLEEPFAIL,
642				    "vinvalbuf", slpflag, slptimeo);
643				if (error == ENOLCK)
644					break;
645				splx(s);
646				return (error);
647			}
648			/*
649			 * XXX Since there are no node locks for NFS, I
650			 * believe there is a slight chance that a delayed
651			 * write will occur while sleeping just above, so
652			 * check for it.  Note that vfs_bio_awrite expects
653			 * buffers to reside on a queue, while VOP_BWRITE and
654			 * brelse do not.
655			 */
656			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
657				(flags & V_SAVE)) {
658
659				if (bp->b_vp == vp) {
660					if (bp->b_flags & B_CLUSTEROK) {
661						BUF_UNLOCK(bp);
662						vfs_bio_awrite(bp);
663					} else {
664						bremfree(bp);
665						bp->b_flags |= B_ASYNC;
666						VOP_BWRITE(bp->b_vp, bp);
667					}
668				} else {
669					bremfree(bp);
670					(void) VOP_BWRITE(bp->b_vp, bp);
671				}
672				break;
673			}
674			bremfree(bp);
675			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
676			bp->b_flags &= ~B_ASYNC;
677			brelse(bp);
678		}
679	}
680
681	while (vp->v_numoutput > 0) {
682		vp->v_flag |= VBWAIT;
683		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
684	}
685
686	splx(s);
687
688	/*
689	 * Destroy the copy in the VM cache, too.
690	 */
691	simple_lock(&vp->v_interlock);
692	object = vp->v_object;
693	if (object != NULL) {
694		vm_object_page_remove(object, 0, 0,
695			(flags & V_SAVE) ? TRUE : FALSE);
696	}
697	simple_unlock(&vp->v_interlock);
698
699	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
700		panic("vinvalbuf: flush failed");
701	return (0);
702}
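
/*
 * Typical call (sketch): vclean() below writes back and then discards
 * everything attached to a vnode before reclaiming it with
 *
 *	vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
 *
 * while a caller that just wants the buffers thrown away would pass 0
 * instead of V_SAVE.
 */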
703
704/*
705 * Truncate a file's buffer and pages to a specified length.  This
706 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
707 * sync activity.
708 */
709int
710vtruncbuf(vp, cred, p, length, blksize)
711	register struct vnode *vp;
712	struct ucred *cred;
713	struct proc *p;
714	off_t length;
715	int blksize;
716{
717	register struct buf *bp;
718	struct buf *nbp;
719	int s, anyfreed;
720	int trunclbn;
721
722	/*
723	 * Round up to the *next* lbn.
724	 */
725	trunclbn = (length + blksize - 1) / blksize;
726
727	s = splbio();
728restart:
729	anyfreed = 1;
730	for (;anyfreed;) {
731		anyfreed = 0;
732		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
733			nbp = TAILQ_NEXT(bp, b_vnbufs);
734			if (bp->b_lblkno >= trunclbn) {
735				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
736					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
737					goto restart;
738				} else {
739					bremfree(bp);
740					bp->b_flags |= (B_INVAL | B_RELBUF);
741					bp->b_flags &= ~B_ASYNC;
742					brelse(bp);
743					anyfreed = 1;
744				}
745				if (nbp && (((nbp->b_xflags & B_VNCLEAN) == 0)||
746					 (nbp->b_vp != vp) ||
747					 (nbp->b_flags & B_DELWRI))) {
748					goto restart;
749				}
750			}
751		}
752
753		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
754			nbp = TAILQ_NEXT(bp, b_vnbufs);
755			if (bp->b_lblkno >= trunclbn) {
756				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
757					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
758					goto restart;
759				} else {
760					bremfree(bp);
761					bp->b_flags |= (B_INVAL | B_RELBUF);
762					bp->b_flags &= ~B_ASYNC;
763					brelse(bp);
764					anyfreed = 1;
765				}
766				if (nbp && (((nbp->b_xflags & B_VNDIRTY) == 0)||
767					 (nbp->b_vp != vp) ||
768					 (nbp->b_flags & B_DELWRI) == 0)) {
769					goto restart;
770				}
771			}
772		}
773	}
774
775	if (length > 0) {
776restartsync:
777		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
778			nbp = TAILQ_NEXT(bp, b_vnbufs);
779			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
780				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
781					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
782					goto restart;
783				} else {
784					bremfree(bp);
785					if (bp->b_vp == vp) {
786						bp->b_flags |= B_ASYNC;
787					} else {
788						bp->b_flags &= ~B_ASYNC;
789					}
790					VOP_BWRITE(bp->b_vp, bp);
791				}
792				goto restartsync;
793			}
794
795		}
796	}
797
798	while (vp->v_numoutput > 0) {
799		vp->v_flag |= VBWAIT;
800		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
801	}
802
803	splx(s);
804
805	vnode_pager_setsize(vp, length);
806
807	return (0);
808}
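
/*
 * Worked example of the rounding above (assumed numbers): with an 8192
 * byte block size, truncating to length 10000 gives
 * trunclbn = (10000 + 8191) / 8192 = 2, so logical blocks 0 and 1 are
 * kept and buffers for blocks 2 and beyond are invalidated.
 */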
809
810/*
811 * Associate a buffer with a vnode.
812 */
813void
814bgetvp(vp, bp)
815	register struct vnode *vp;
816	register struct buf *bp;
817{
818	int s;
819
820	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
821
822	vhold(vp);
823	bp->b_vp = vp;
824	if (vp->v_type == VBLK || vp->v_type == VCHR)
825		bp->b_dev = vp->v_rdev;
826	else
827		bp->b_dev = NODEV;
828	/*
829	 * Insert onto list for new vnode.
830	 */
831	s = splbio();
832	bp->b_xflags |= B_VNCLEAN;
833	bp->b_xflags &= ~B_VNDIRTY;
834	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
835	splx(s);
836}
837
838/*
839 * Disassociate a buffer from a vnode.
840 */
841void
842brelvp(bp)
843	register struct buf *bp;
844{
845	struct vnode *vp;
846	struct buflists *listheadp;
847	int s;
848
849	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
850
851	/*
852	 * Delete from old vnode list, if on one.
853	 */
854	vp = bp->b_vp;
855	s = splbio();
856	if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) {
857		if (bp->b_xflags & B_VNDIRTY)
858			listheadp = &vp->v_dirtyblkhd;
859		else
860			listheadp = &vp->v_cleanblkhd;
861		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
862		bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN);
863	}
864	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
865		vp->v_flag &= ~VONWORKLST;
866		LIST_REMOVE(vp, v_synclist);
867	}
868	splx(s);
869	bp->b_vp = (struct vnode *) 0;
870	vdrop(vp);
871}
872
873/*
874 * The workitem queue.
875 *
876 * It is useful to delay writes of file data and filesystem metadata
877 * for tens of seconds so that quickly created and deleted files need
878 * not waste disk bandwidth being created and removed. To realize this,
879 * we append vnodes to a "workitem" queue. When running with a soft
880 * updates implementation, most pending metadata dependencies should
881 * not wait for more than a few seconds. Thus, mounted on block devices
882 * are delayed only about a half the time that file data is delayed.
883 * Similarly, directory updates are more critical, so are only delayed
884 * about a third the time that file data is delayed. Thus, there are
885 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
886 * one each second (driven off the filesystem syncer process). The
887 * syncer_delayno variable indicates the next queue that is to be processed.
888 * Items that need to be processed soon are placed in this queue:
889 *
890 *	syncer_workitem_pending[syncer_delayno]
891 *
892 * A delay of fifteen seconds is done by placing the request fifteen
893 * entries later in the queue:
894 *
895 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
896 *
897 */
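
/*
 * Worked example (assumed values): with syncer_delayno == 10 and the
 * default 32-slot ring (syncer_mask == 31), a request delayed by 15
 * seconds lands in slot (10 + 15) & 31 == 25 and is processed 15 passes
 * of the syncer loop later.
 */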
898
899/*
900 * Add an item to the syncer work queue.
901 */
902static void
903vn_syncer_add_to_worklist(struct vnode *vp, int delay)
904{
905	int s, slot;
906
907	s = splbio();
908
909	if (vp->v_flag & VONWORKLST) {
910		LIST_REMOVE(vp, v_synclist);
911	}
912
913	if (delay > syncer_maxdelay - 2)
914		delay = syncer_maxdelay - 2;
915	slot = (syncer_delayno + delay) & syncer_mask;
916
917	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
918	vp->v_flag |= VONWORKLST;
919	splx(s);
920}
921
922struct  proc *updateproc;
923static void sched_sync __P((void));
924static struct kproc_desc up_kp = {
925	"syncer",
926	sched_sync,
927	&updateproc
928};
929SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
930
931/*
932 * System filesystem synchronizer daemon.
933 */
934void
935sched_sync(void)
936{
937	struct synclist *slp;
938	struct vnode *vp;
939	long starttime;
940	int s;
941	struct proc *p = updateproc;
942
943	p->p_flag |= P_BUFEXHAUST;
944
945	for (;;) {
946		starttime = time_second;
947
948		/*
949		 * Push files whose dirty time has expired.  Be careful
950		 * of interrupt race on slp queue.
951		 */
952		s = splbio();
953		slp = &syncer_workitem_pending[syncer_delayno];
954		syncer_delayno += 1;
955		if (syncer_delayno == syncer_maxdelay)
956			syncer_delayno = 0;
957		splx(s);
958
959		while ((vp = LIST_FIRST(slp)) != NULL) {
960			if (VOP_ISLOCKED(vp) == 0) {
961				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
962				(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
963				VOP_UNLOCK(vp, 0, p);
964			}
965			s = splbio();
966			if (LIST_FIRST(slp) == vp) {
967				/*
968				 * Note: v_tag VT_VFS vps can remain on the
969				 * worklist too with no dirty blocks, but
970				 * since sync_fsync() moves it to a different
971				 * slot we are safe.
972				 */
973				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
974				    vp->v_type != VBLK)
975					panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
976				/*
977				 * Put us back on the worklist.  The worklist
978				 * routine will remove us from our current
979				 * position and then add us back in at a later
980				 * position.
981				 */
982				vn_syncer_add_to_worklist(vp, syncdelay);
983			}
984			splx(s);
985		}
986
987		/*
988		 * Do soft update processing.
989		 */
990		if (bioops.io_sync)
991			(*bioops.io_sync)(NULL);
992
993		/*
994		 * The variable rushjob allows the kernel to speed up the
995		 * processing of the filesystem syncer process. A rushjob
996		 * value of N tells the filesystem syncer to process the next
997		 * N seconds worth of work on its queue ASAP. Currently rushjob
998		 * is used by the soft update code to speed up the filesystem
999		 * syncer process when the incore state is getting so far
1000		 * ahead of the disk that the kernel memory pool is being
1001		 * threatened with exhaustion.
1002		 */
1003		if (rushjob > 0) {
1004			rushjob -= 1;
1005			continue;
1006		}
1007		/*
1008		 * If it has taken us less than a second to process the
1009		 * current work, then wait. Otherwise start right over
1010		 * again. We can still lose time if any single round
1011		 * takes more than two seconds, but it does not really
1012		 * matter as we are just trying to generally pace the
1013		 * filesystem activity.
1014		 */
1015		if (time_second == starttime)
1016			tsleep(&lbolt, PPAUSE, "syncer", 0);
1017	}
1018}
1019
1020/*
1021 * Request the syncer daemon to speed up its work.
1022 * We never push it to speed up more than half of its
1023 * normal turn time, otherwise it could take over the cpu.
1024 */
1025int
1026speedup_syncer()
1027{
1028	int s;
1029
1030	s = splhigh();
1031	if (updateproc->p_wchan == &lbolt)
1032		setrunnable(updateproc);
1033	splx(s);
1034	if (rushjob < syncdelay / 2) {
1035		rushjob += 1;
1036		stat_rush_requests += 1;
1037		return (1);
1038	}
1039	return(0);
1040}
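
/*
 * With the default syncdelay of 30 seconds, the check above lets rushjob
 * climb to at most syncdelay / 2 == 15, i.e. the syncer can be pushed at
 * most about half a queue rotation ahead before further requests are
 * refused and speedup_syncer() returns 0.
 */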
1041
1042/*
1043 * Associate a p-buffer with a vnode.
1044 *
1045 * Also sets B_PAGING flag to indicate that vnode is not fully associated
1046 * with the buffer.  i.e. the bp has not been linked into the vnode or
1047 * ref-counted.
1048 */
1049void
1050pbgetvp(vp, bp)
1051	register struct vnode *vp;
1052	register struct buf *bp;
1053{
1054
1055	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1056
1057	bp->b_vp = vp;
1058	bp->b_flags |= B_PAGING;
1059	if (vp->v_type == VBLK || vp->v_type == VCHR)
1060		bp->b_dev = vp->v_rdev;
1061	else
1062		bp->b_dev = NODEV;
1063}
1064
1065/*
1066 * Disassociate a p-buffer from a vnode.
1067 */
1068void
1069pbrelvp(bp)
1070	register struct buf *bp;
1071{
1072
1073	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1074
1075#if !defined(MAX_PERF)
1076	/* XXX REMOVE ME */
1077	if (bp->b_vnbufs.tqe_next != NULL) {
1078		panic(
1079		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1080		    bp,
1081		    (int)bp->b_flags
1082		);
1083	}
1084#endif
1085	bp->b_vp = (struct vnode *) 0;
1086	bp->b_flags &= ~B_PAGING;
1087}
1088
1089void
1090pbreassignbuf(bp, newvp)
1091	struct buf *bp;
1092	struct vnode *newvp;
1093{
1094#if !defined(MAX_PERF)
1095	if ((bp->b_flags & B_PAGING) == 0) {
1096		panic(
1097		    "pbreassignbuf() on non phys bp %p",
1098		    bp
1099		);
1100	}
1101#endif
1102	bp->b_vp = newvp;
1103}
1104
1105/*
1106 * Reassign a buffer from one vnode to another.
1107 * Used to assign file specific control information
1108 * (indirect blocks) to the vnode to which they belong.
1109 */
1110void
1111reassignbuf(bp, newvp)
1112	register struct buf *bp;
1113	register struct vnode *newvp;
1114{
1115	struct buflists *listheadp;
1116	int delay;
1117	int s;
1118
1119	if (newvp == NULL) {
1120		printf("reassignbuf: NULL");
1121		return;
1122	}
1123	++reassignbufcalls;
1124
1125#if !defined(MAX_PERF)
1126	/*
1127	 * B_PAGING flagged buffers cannot be reassigned because their vp
1128	 * is not fully linked in.
1129	 */
1130	if (bp->b_flags & B_PAGING)
1131		panic("cannot reassign paging buffer");
1132#endif
1133
1134	s = splbio();
1135	/*
1136	 * Delete from old vnode list, if on one.
1137	 */
1138	if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) {
1139		if (bp->b_xflags & B_VNDIRTY)
1140			listheadp = &bp->b_vp->v_dirtyblkhd;
1141		else
1142			listheadp = &bp->b_vp->v_cleanblkhd;
1143		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
1144		bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN);
1145		if (bp->b_vp != newvp) {
1146			vdrop(bp->b_vp);
1147			bp->b_vp = NULL;	/* for clarification */
1148		}
1149	}
1150	/*
1151	 * If dirty, put on list of dirty buffers; otherwise insert onto list
1152	 * of clean buffers.
1153	 */
1154	if (bp->b_flags & B_DELWRI) {
1155		struct buf *tbp;
1156
1157		listheadp = &newvp->v_dirtyblkhd;
1158		if ((newvp->v_flag & VONWORKLST) == 0) {
1159			switch (newvp->v_type) {
1160			case VDIR:
1161				delay = dirdelay;
1162				break;
1163			case VBLK:
1164				if (newvp->v_specmountpoint != NULL) {
1165					delay = metadelay;
1166					break;
1167				}
1168				/* fall through */
1169			default:
1170				delay = filedelay;
1171			}
1172			vn_syncer_add_to_worklist(newvp, delay);
1173		}
1174		bp->b_xflags |= B_VNDIRTY;
1175		tbp = TAILQ_FIRST(listheadp);
1176		if (tbp == NULL ||
1177		    bp->b_lblkno == 0 ||
1178		    (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
1179			TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
1180			++reassignbufsortgood;
1181		} else if (bp->b_lblkno < 0) {
1182			TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
1183			++reassignbufsortgood;
1184		} else if (reassignbufmethod == 1) {
1185			/*
1186			 * New sorting algorithm, only handle sequential case,
1187			 * otherwise guess.
1188			 */
1189			if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
1190			    (tbp->b_xflags & B_VNDIRTY)) {
1191				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1192				++reassignbufsortgood;
1193			} else {
1194				TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
1195				++reassignbufsortbad;
1196			}
1197		} else {
1198			/*
1199			 * Old sorting algorithm, scan queue and insert
1200			 */
1201			struct buf *ttbp;
1202			while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
1203			    (ttbp->b_lblkno < bp->b_lblkno)) {
1204				++reassignbufloops;
1205				tbp = ttbp;
1206			}
1207			TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1208		}
1209	} else {
1210		bp->b_xflags |= B_VNCLEAN;
1211		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
1212		if ((newvp->v_flag & VONWORKLST) &&
1213		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1214			newvp->v_flag &= ~VONWORKLST;
1215			LIST_REMOVE(newvp, v_synclist);
1216		}
1217	}
1218	if (bp->b_vp != newvp) {
1219		bp->b_vp = newvp;
1220		vhold(bp->b_vp);
1221	}
1222	splx(s);
1223}
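
/*
 * Note on the two insertion strategies above: the default
 * reassignbufmethod of 1 only checks, via gbincore(), whether the
 * previous logical block is already dirty and otherwise falls back to
 * inserting at the head, trading perfect ordering for cheap insertion;
 * method 0 walks the dirty list to find the sorted position, and
 * reassignbufloops counts how far those walks go.
 */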
1224
1225/*
1226 * Create a vnode for a block device.
1227 * Used for mounting the root file system.
1228 */
1229int
1230bdevvp(dev, vpp)
1231	dev_t dev;
1232	struct vnode **vpp;
1233{
1234	register struct vnode *vp;
1235	struct vnode *nvp;
1236	int error;
1237
1238	if (dev == NODEV) {
1239		*vpp = NULLVP;
1240		return (ENXIO);
1241	}
1242	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
1243	if (error) {
1244		*vpp = NULLVP;
1245		return (error);
1246	}
1247	vp = nvp;
1248	/* dev2udev() results in a CDEV, so we need to cheat here. */
1249	vp->v_type = VBLK;
1250	if ((nvp = checkalias2(vp, dev, (struct mount *)0)) != NULL) {
1251		vput(vp);
1252		vp = nvp;
1253	}
1254	*vpp = vp;
1255	return (0);
1256}
1257
1258/*
1259 * Check to see if the new vnode represents a special device
1260 * for which we already have a vnode (either because of
1261 * bdevvp() or because of a different vnode representing
1262 * the same block device). If such an alias exists, deallocate
1263 * the existing contents and return the aliased vnode. The
1264 * caller is responsible for filling it with its new contents.
1265 */
1266struct vnode *
1267checkalias(nvp, nvp_rdev, mp)
1268	register struct vnode *nvp;
1269	udev_t nvp_rdev;
1270	struct mount *mp;
1271{
1272	dev_t	dev;
1273
1274	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1275		return (NULLVP);
1276
1277	dev = udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0);
1278	return (checkalias2(nvp, dev, mp));
1279}
1280
1281static struct vnode *
1282checkalias2(nvp, dev, mp)
1283	register struct vnode *nvp;
1284	dev_t dev;
1285	struct mount *mp;
1286{
1287	struct proc *p = curproc;	/* XXX */
1288	struct vnode *vp;
1289	struct vnode **vpp;
1290
1291	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1292		return (NULLVP);
1293
1294	vpp = &dev->si_hlist;
1295loop:
1296	simple_lock(&spechash_slock);
1297	for (vp = *vpp; vp; vp = vp->v_specnext) {
1298		if (nvp->v_type != vp->v_type)
1299			continue;
1300		/*
1301		 * Alias, but not in use, so flush it out.
1302		 * Only alias active device nodes.
1303		 * Not sure why we don't re-use this like we do below.
1304		 */
1305		simple_lock(&vp->v_interlock);
1306		if (vp->v_usecount == 0) {
1307			simple_unlock(&spechash_slock);
1308			vgonel(vp, p);
1309			goto loop;
1310		}
1311		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
1312			/*
1313			 * It disappeared, and we may have slept.
1314			 * Restart from the beginning
1315			 */
1316			simple_unlock(&spechash_slock);
1317			goto loop;
1318		}
1319		break;
1320	}
1321	/*
1322	 * It would be a lot clearer what is going on here if
1323	 * this had been expressed as:
1324	 * if (vp != NULL && vp->v_tag == VT_NON)
1325	 * and the clauses had been swapped.
1326	 */
1327	if (vp == NULL || vp->v_tag != VT_NON) {
1328		struct specinfo *sinfo;
1329
1330		/*
1331		 * Put the new vnode into the hash chain.
1332		 * and if there was an alias, connect them.
1333		 */
1334		nvp->v_specnext = *vpp;
1335		*vpp = nvp;
1336		nvp->v_specinfo = sinfo = dev;
1337
1338		simple_unlock(&spechash_slock);
1339		if (vp != NULLVP) {
1340			nvp->v_flag |= VALIASED;
1341			vp->v_flag |= VALIASED;
1342			vput(vp);
1343		}
1344		return (NULLVP);
1345	}
1346	/*
1347	 * if (vp != NULL && vp->v_tag == VT_NON)
1348	 * We have a vnode alias, but it is trashed.
1349	 * Make it look like it was newly allocated (by getnewvnode()).
1350	 * The caller should use this instead.
1351	 */
1352	simple_unlock(&spechash_slock);
1353	VOP_UNLOCK(vp, 0, p);
1354	simple_lock(&vp->v_interlock);
1355	vclean(vp, 0, p);
1356	vp->v_op = nvp->v_op;
1357	vp->v_tag = nvp->v_tag;
1358	nvp->v_type = VNON;
1359	insmntque(vp, mp);
1360	return (vp);
1361}
1362
1363/*
1364 * Grab a particular vnode from the free list, increment its
1365 * reference count and lock it. The vnode lock bit is set if the
1366 * vnode is being eliminated in vgone. The process is awakened
1367 * when the transition is completed, and an error returned to
1368 * indicate that the vnode is no longer usable (possibly having
1369 * been changed to a new file system type).
1370 */
1371int
1372vget(vp, flags, p)
1373	register struct vnode *vp;
1374	int flags;
1375	struct proc *p;
1376{
1377	int error;
1378
1379	/*
1380	 * If the vnode is in the process of being cleaned out for
1381	 * another use, we wait for the cleaning to finish and then
1382	 * return failure. Cleaning is determined by checking that
1383	 * the VXLOCK flag is set.
1384	 */
1385	if ((flags & LK_INTERLOCK) == 0) {
1386		simple_lock(&vp->v_interlock);
1387	}
1388	if (vp->v_flag & VXLOCK) {
1389		vp->v_flag |= VXWANT;
1390		simple_unlock(&vp->v_interlock);
1391		tsleep((caddr_t)vp, PINOD, "vget", 0);
1392		return (ENOENT);
1393	}
1394
1395	vp->v_usecount++;
1396
1397	if (VSHOULDBUSY(vp))
1398		vbusy(vp);
1399	if (flags & LK_TYPE_MASK) {
1400		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
1401			/*
1402			 * must expand vrele here because we do not want
1403			 * to call VOP_INACTIVE if the reference count
1404			 * drops back to zero since it was never really
1405			 * active. We must remove it from the free list
1406			 * before sleeping so that multiple processes do
1407			 * not try to recycle it.
1408			 */
1409			simple_lock(&vp->v_interlock);
1410			vp->v_usecount--;
1411			if (VSHOULDFREE(vp))
1412				vfree(vp);
1413			simple_unlock(&vp->v_interlock);
1414		}
1415		return (error);
1416	}
1417	simple_unlock(&vp->v_interlock);
1418	return (0);
1419}
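
/*
 * Illustrative use (sketch): a caller that wants a referenced, locked
 * vnode and later releases both typically does
 *
 *	if ((error = vget(vp, LK_EXCLUSIVE, p)) != 0)
 *		return (error);
 *	... use vp ...
 *	vput(vp);
 *
 * Plain references without the lock are taken with vref() and dropped
 * with vrele() below.
 */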
1420
1421void
1422vref(struct vnode *vp)
1423{
1424	simple_lock(&vp->v_interlock);
1425	vp->v_usecount++;
1426	simple_unlock(&vp->v_interlock);
1427}
1428
1429/*
1430 * Vnode put/release.
1431 * If count drops to zero, call inactive routine and return to freelist.
1432 */
1433void
1434vrele(vp)
1435	struct vnode *vp;
1436{
1437	struct proc *p = curproc;	/* XXX */
1438
1439	KASSERT(vp != NULL, ("vrele: null vp"));
1440
1441	simple_lock(&vp->v_interlock);
1442
1443	if (vp->v_usecount > 1) {
1444
1445		vp->v_usecount--;
1446		simple_unlock(&vp->v_interlock);
1447
1448		return;
1449	}
1450
1451	if (vp->v_usecount == 1) {
1452
1453		vp->v_usecount--;
1454		if (VSHOULDFREE(vp))
1455			vfree(vp);
1456	/*
1457	 * If we are doing a vput, the node is already locked, and we must
1458	 * call VOP_INACTIVE with the node locked.  So, in the case of
1459	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1460	 */
1461		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
1462			VOP_INACTIVE(vp, p);
1463		}
1464
1465	} else {
1466#ifdef DIAGNOSTIC
1467		vprint("vrele: negative ref count", vp);
1468		simple_unlock(&vp->v_interlock);
1469#endif
1470		panic("vrele: negative ref cnt");
1471	}
1472}
1473
1474void
1475vput(vp)
1476	struct vnode *vp;
1477{
1478	struct proc *p = curproc;	/* XXX */
1479
1480	KASSERT(vp != NULL, ("vput: null vp"));
1481
1482	simple_lock(&vp->v_interlock);
1483
1484	if (vp->v_usecount > 1) {
1485
1486		vp->v_usecount--;
1487		VOP_UNLOCK(vp, LK_INTERLOCK, p);
1488		return;
1489
1490	}
1491
1492	if (vp->v_usecount == 1) {
1493
1494		vp->v_usecount--;
1495		if (VSHOULDFREE(vp))
1496			vfree(vp);
1497	/*
1498	 * If we are doing a vput, the node is already locked, and we must
1499	 * call VOP_INACTIVE with the node locked.  So, in the case of
1500	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1501	 */
1502		simple_unlock(&vp->v_interlock);
1503		VOP_INACTIVE(vp, p);
1504
1505	} else {
1506#ifdef DIAGNOSTIC
1507		vprint("vput: negative ref count", vp);
1508#endif
1509		panic("vput: negative ref cnt");
1510	}
1511}
1512
1513/*
1514 * Somebody doesn't want the vnode recycled.
1515 */
1516void
1517vhold(vp)
1518	register struct vnode *vp;
1519{
1520	int s;
1521
1522  	s = splbio();
1523	vp->v_holdcnt++;
1524	if (VSHOULDBUSY(vp))
1525		vbusy(vp);
1526	splx(s);
1527}
1528
1529/*
1530 * One less who cares about this vnode.
1531 */
1532void
1533vdrop(vp)
1534	register struct vnode *vp;
1535{
1536	int s;
1537
1538	s = splbio();
1539	if (vp->v_holdcnt <= 0)
1540		panic("vdrop: holdcnt");
1541	vp->v_holdcnt--;
1542	if (VSHOULDFREE(vp))
1543		vfree(vp);
1544	splx(s);
1545}
1546
1547/*
1548 * Remove any vnodes in the vnode table belonging to mount point mp.
1549 *
1550 * If MNT_NOFORCE is specified, there should not be any active ones,
1551 * return error if any are found (nb: this is a user error, not a
1552 * system error). If MNT_FORCE is specified, detach any active vnodes
1553 * that are found.
1554 */
1555#ifdef DIAGNOSTIC
1556static int busyprt = 0;		/* print out busy vnodes */
1557SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
1558#endif
1559
1560int
1561vflush(mp, skipvp, flags)
1562	struct mount *mp;
1563	struct vnode *skipvp;
1564	int flags;
1565{
1566	struct proc *p = curproc;	/* XXX */
1567	struct vnode *vp, *nvp;
1568	int busy = 0;
1569
1570	simple_lock(&mntvnode_slock);
1571loop:
1572	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
1573		/*
1574		 * Make sure this vnode wasn't reclaimed in getnewvnode().
1575		 * Start over if it has (it won't be on the list anymore).
1576		 */
1577		if (vp->v_mount != mp)
1578			goto loop;
1579		nvp = vp->v_mntvnodes.le_next;
1580		/*
1581		 * Skip over a selected vnode.
1582		 */
1583		if (vp == skipvp)
1584			continue;
1585
1586		simple_lock(&vp->v_interlock);
1587		/*
1588		 * Skip over vnodes marked VSYSTEM.
1589		 */
1590		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
1591			simple_unlock(&vp->v_interlock);
1592			continue;
1593		}
1594		/*
1595		 * If WRITECLOSE is set, only flush out regular file vnodes
1596		 * open for writing.
1597		 */
1598		if ((flags & WRITECLOSE) &&
1599		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
1600			simple_unlock(&vp->v_interlock);
1601			continue;
1602		}
1603
1604		/*
1605		 * With v_usecount == 0, all we need to do is clear out the
1606		 * vnode data structures and we are done.
1607		 */
1608		if (vp->v_usecount == 0) {
1609			simple_unlock(&mntvnode_slock);
1610			vgonel(vp, p);
1611			simple_lock(&mntvnode_slock);
1612			continue;
1613		}
1614
1615		/*
1616		 * If FORCECLOSE is set, forcibly close the vnode. For block
1617		 * or character devices, revert to an anonymous device. For
1618		 * all other files, just kill them.
1619		 */
1620		if (flags & FORCECLOSE) {
1621			simple_unlock(&mntvnode_slock);
1622			if (vp->v_type != VBLK && vp->v_type != VCHR) {
1623				vgonel(vp, p);
1624			} else {
1625				vclean(vp, 0, p);
1626				vp->v_op = spec_vnodeop_p;
1627				insmntque(vp, (struct mount *) 0);
1628			}
1629			simple_lock(&mntvnode_slock);
1630			continue;
1631		}
1632#ifdef DIAGNOSTIC
1633		if (busyprt)
1634			vprint("vflush: busy vnode", vp);
1635#endif
1636		simple_unlock(&vp->v_interlock);
1637		busy++;
1638	}
1639	simple_unlock(&mntvnode_slock);
1640	if (busy)
1641		return (EBUSY);
1642	return (0);
1643}
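
/*
 * Typical use (sketch; the callers live in the per-filesystem unmount
 * code, not in this file, and skipvp/force here are illustrative): an
 * unmount path flushes every vnode on the mount, optionally skipping one
 * it still needs, roughly as
 *
 *	error = vflush(mp, skipvp, SKIPSYSTEM | (force ? FORCECLOSE : 0));
 *	if (error)
 *		return (error);
 */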
1644
1645/*
1646 * Disassociate the underlying file system from a vnode.
1647 */
1648static void
1649vclean(vp, flags, p)
1650	struct vnode *vp;
1651	int flags;
1652	struct proc *p;
1653{
1654	int active;
1655	vm_object_t obj;
1656
1657	/*
1658	 * Check to see if the vnode is in use. If so we have to reference it
1659	 * before we clean it out so that its count cannot fall to zero and
1660	 * generate a race against ourselves to recycle it.
1661	 */
1662	if ((active = vp->v_usecount))
1663		vp->v_usecount++;
1664
1665	/*
1666	 * Prevent the vnode from being recycled or brought into use while we
1667	 * clean it out.
1668	 */
1669	if (vp->v_flag & VXLOCK)
1670		panic("vclean: deadlock");
1671	vp->v_flag |= VXLOCK;
1672	/*
1673	 * Even if the count is zero, the VOP_INACTIVE routine may still
1674	 * have the object locked while it cleans it out. The VOP_LOCK
1675	 * ensures that the VOP_INACTIVE routine is done with its work.
1676	 * For active vnodes, it ensures that no other activity can
1677	 * occur while the underlying object is being cleaned out.
1678	 */
1679	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
1680
1681	/*
1682	 * Clean out any buffers associated with the vnode.
1683	 */
1684	vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
1685	if ((obj = vp->v_object) != NULL) {
1686		if (obj->ref_count == 0) {
1687			/*
1688			 * This is a normal way of shutting down the object/vnode
1689			 * association.
1690			 */
1691			vm_object_terminate(obj);
1692		} else {
1693			/*
1694			 * Woe to the process that tries to page now :-).
1695			 */
1696			vm_pager_deallocate(obj);
1697		}
1698	}
1699
1700	/*
1701	 * If purging an active vnode, it must be closed and
1702	 * deactivated before being reclaimed. Note that the
1703	 * VOP_INACTIVE will unlock the vnode.
1704	 */
1705	if (active) {
1706		if (flags & DOCLOSE)
1707			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
1708		VOP_INACTIVE(vp, p);
1709	} else {
1710		/*
1711		 * Any other processes trying to obtain this lock must first
1712		 * wait for VXLOCK to clear, then call the new lock operation.
1713		 */
1714		VOP_UNLOCK(vp, 0, p);
1715	}
1716	/*
1717	 * Reclaim the vnode.
1718	 */
1719	if (VOP_RECLAIM(vp, p))
1720		panic("vclean: cannot reclaim");
1721
1722	if (active)
1723		vrele(vp);
1724
1725	cache_purge(vp);
1726	if (vp->v_vnlock) {
1727#if 0 /* This is the only place we have LK_DRAINED in the entire kernel ??? */
1728#ifdef DIAGNOSTIC
1729		if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
1730			vprint("vclean: lock not drained", vp);
1731#endif
1732#endif
1733		FREE(vp->v_vnlock, M_VNODE);
1734		vp->v_vnlock = NULL;
1735	}
1736
1737	if (VSHOULDFREE(vp))
1738		vfree(vp);
1739
1740	/*
1741	 * Done with purge, notify sleepers of the grim news.
1742	 */
1743	vp->v_op = dead_vnodeop_p;
1744	vn_pollgone(vp);
1745	vp->v_tag = VT_NON;
1746	vp->v_flag &= ~VXLOCK;
1747	if (vp->v_flag & VXWANT) {
1748		vp->v_flag &= ~VXWANT;
1749		wakeup((caddr_t) vp);
1750	}
1751}
1752
1753/*
1754 * Eliminate all activity associated with the requested vnode
1755 * and with all vnodes aliased to the requested vnode.
1756 */
1757int
1758vop_revoke(ap)
1759	struct vop_revoke_args /* {
1760		struct vnode *a_vp;
1761		int a_flags;
1762	} */ *ap;
1763{
1764	struct vnode *vp, *vq;
1765	struct proc *p = curproc;	/* XXX */
1766
1767	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
1768
1769	vp = ap->a_vp;
1770	simple_lock(&vp->v_interlock);
1771
1772	if (vp->v_flag & VALIASED) {
1773		/*
1774		 * If a vgone (or vclean) is already in progress,
1775		 * wait until it is done and return.
1776		 */
1777		if (vp->v_flag & VXLOCK) {
1778			vp->v_flag |= VXWANT;
1779			simple_unlock(&vp->v_interlock);
1780			tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
1781			return (0);
1782		}
1783		/*
1784		 * Ensure that vp will not be vgone'd while we
1785		 * are eliminating its aliases.
1786		 */
1787		vp->v_flag |= VXLOCK;
1788		simple_unlock(&vp->v_interlock);
1789		while (vp->v_flag & VALIASED) {
1790			simple_lock(&spechash_slock);
1791			for (vq = vp->v_hashchain; vq; vq = vq->v_specnext) {
1792				if (vq->v_type != vp->v_type || vp == vq)
1793					continue;
1794				simple_unlock(&spechash_slock);
1795				vgone(vq);
1796				break;
1797			}
1798			if (vq == NULLVP) {
1799				simple_unlock(&spechash_slock);
1800			}
1801		}
1802		/*
1803		 * Remove the lock so that vgone below will
1804		 * really eliminate the vnode after which time
1805		 * vgone will awaken any sleepers.
1806		 */
1807		simple_lock(&vp->v_interlock);
1808		vp->v_flag &= ~VXLOCK;
1809		if (vp->v_flag & VXWANT) {
1810			vp->v_flag &= ~VXWANT;
1811			wakeup(vp);
1812		}
1813	}
1814	vgonel(vp, p);
1815	return (0);
1816}
1817
1818/*
1819 * Recycle an unused vnode to the front of the free list.
1820 * Release the passed interlock if the vnode will be recycled.
1821 */
1822int
1823vrecycle(vp, inter_lkp, p)
1824	struct vnode *vp;
1825	struct simplelock *inter_lkp;
1826	struct proc *p;
1827{
1828
1829	simple_lock(&vp->v_interlock);
1830	if (vp->v_usecount == 0) {
1831		if (inter_lkp) {
1832			simple_unlock(inter_lkp);
1833		}
1834		vgonel(vp, p);
1835		return (1);
1836	}
1837	simple_unlock(&vp->v_interlock);
1838	return (0);
1839}
1840
1841/*
1842 * Eliminate all activity associated with a vnode
1843 * in preparation for reuse.
1844 */
1845void
1846vgone(vp)
1847	register struct vnode *vp;
1848{
1849	struct proc *p = curproc;	/* XXX */
1850
1851	simple_lock(&vp->v_interlock);
1852	vgonel(vp, p);
1853}
1854
1855/*
1856 * vgone, with the vp interlock held.
1857 */
1858static void
1859vgonel(vp, p)
1860	struct vnode *vp;
1861	struct proc *p;
1862{
1863	int s;
1864	struct vnode *vq;
1865	struct vnode *vx;
1866
1867	/*
1868	 * If a vgone (or vclean) is already in progress,
1869	 * wait until it is done and return.
1870	 */
1871	if (vp->v_flag & VXLOCK) {
1872		vp->v_flag |= VXWANT;
1873		simple_unlock(&vp->v_interlock);
1874		tsleep((caddr_t)vp, PINOD, "vgone", 0);
1875		return;
1876	}
1877
1878	/*
1879	 * Clean out the filesystem specific data.
1880	 */
1881	vclean(vp, DOCLOSE, p);
1882	simple_lock(&vp->v_interlock);
1883
1884	/*
1885	 * Delete from old mount point vnode list, if on one.
1886	 */
1887	if (vp->v_mount != NULL)
1888		insmntque(vp, (struct mount *)0);
1889	/*
1890	 * If special device, remove it from special device alias list
1891	 * if it is on one.
1892	 */
1893	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
1894		simple_lock(&spechash_slock);
1895		if (vp->v_hashchain == vp) {
1896			vp->v_hashchain = vp->v_specnext;
1897		} else {
1898			for (vq = vp->v_hashchain; vq; vq = vq->v_specnext) {
1899				if (vq->v_specnext != vp)
1900					continue;
1901				vq->v_specnext = vp->v_specnext;
1902				break;
1903			}
1904			if (vq == NULL)
1905				panic("missing bdev");
1906		}
1907		if (vp->v_flag & VALIASED) {
1908			vx = NULL;
1909			for (vq = vp->v_hashchain; vq; vq = vq->v_specnext) {
1910				if (vq->v_type != vp->v_type)
1911					continue;
1912				if (vx)
1913					break;
1914				vx = vq;
1915			}
1916			if (vx == NULL)
1917				panic("missing alias");
1918			if (vq == NULL)
1919				vx->v_flag &= ~VALIASED;
1920			vp->v_flag &= ~VALIASED;
1921		}
1922		simple_unlock(&spechash_slock);
1923		vp->v_specinfo = NULL;
1924	}
1925
1926	/*
1927	 * If it is on the freelist and not already at the head,
1928	 * move it to the head of the list. The test of VDOOMED and of
1929	 * the zero reference count is because getnewvnode marks a vnode
1930	 * VDOOMED and removes it from the free list, but does not
1931	 * increment its reference count until
1932	 * after calling vgone. If the reference count were
1933	 * incremented first, vgone would (incorrectly) try to
1934	 * close the previous instance of the underlying object.
1935	 */
1936	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
1937		s = splbio();
1938		simple_lock(&vnode_free_list_slock);
1939		if (vp->v_flag & VFREE) {
1940			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1941		} else if (vp->v_flag & VTBFREE) {
1942			TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
1943			vp->v_flag &= ~VTBFREE;
1944			freevnodes++;
1945		} else
1946			freevnodes++;
1947		vp->v_flag |= VFREE;
1948		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1949		simple_unlock(&vnode_free_list_slock);
1950		splx(s);
1951	}
1952
1953	vp->v_type = VBAD;
1954	simple_unlock(&vp->v_interlock);
1955}
1956
1957/*
1958 * Lookup a vnode by device number.
1959 */
1960int
1961vfinddev(dev, type, vpp)
1962	dev_t dev;
1963	enum vtype type;
1964	struct vnode **vpp;
1965{
1966	register struct vnode *vp;
1967	int rc = 0;
1968
1969	simple_lock(&spechash_slock);
1970	for (vp = dev->si_hlist; vp; vp = vp->v_specnext) {
1971		if (type != vp->v_type)
1972			continue;
1973		*vpp = vp;
1974		rc = 1;
1975		break;
1976	}
1977	simple_unlock(&spechash_slock);
1978	return (rc);
1979}
1980
1981/*
1982 * Calculate the total number of references to a special device.
1983 */
1984int
1985vcount(vp)
1986	register struct vnode *vp;
1987{
1988	struct vnode *vq, *vnext;
1989	int count;
1990
1991loop:
1992	if ((vp->v_flag & VALIASED) == 0)
1993		return (vp->v_usecount);
1994	simple_lock(&spechash_slock);
1995	for (count = 0, vq = vp->v_hashchain; vq; vq = vnext) {
1996		vnext = vq->v_specnext;
1997		if (vq->v_type != vp->v_type)
1998			continue;
1999		/*
2000		 * Alias, but not in use, so flush it out.
2001		 */
2002		if (vq->v_usecount == 0 && vq != vp) {
2003			simple_unlock(&spechash_slock);
2004			vgone(vq);
2005			goto loop;
2006		}
2007		count += vq->v_usecount;
2008	}
2009	simple_unlock(&spechash_slock);
2010	return (count);
2011}
2012/*
2013 * Print out a description of a vnode.
2014 */
2015static char *typename[] =
2016{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
2017
2018void
2019vprint(label, vp)
2020	char *label;
2021	register struct vnode *vp;
2022{
2023	char buf[96];
2024
2025	if (label != NULL)
2026		printf("%s: %p: ", label, (void *)vp);
2027	else
2028		printf("%p: ", (void *)vp);
2029	printf("type %s, usecount %d, writecount %d, refcount %d,",
2030	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
2031	    vp->v_holdcnt);
2032	buf[0] = '\0';
2033	if (vp->v_flag & VROOT)
2034		strcat(buf, "|VROOT");
2035	if (vp->v_flag & VTEXT)
2036		strcat(buf, "|VTEXT");
2037	if (vp->v_flag & VSYSTEM)
2038		strcat(buf, "|VSYSTEM");
2039	if (vp->v_flag & VXLOCK)
2040		strcat(buf, "|VXLOCK");
2041	if (vp->v_flag & VXWANT)
2042		strcat(buf, "|VXWANT");
2043	if (vp->v_flag & VBWAIT)
2044		strcat(buf, "|VBWAIT");
2045	if (vp->v_flag & VALIASED)
2046		strcat(buf, "|VALIASED");
2047	if (vp->v_flag & VDOOMED)
2048		strcat(buf, "|VDOOMED");
2049	if (vp->v_flag & VFREE)
2050		strcat(buf, "|VFREE");
2051	if (vp->v_flag & VOBJBUF)
2052		strcat(buf, "|VOBJBUF");
2053	if (buf[0] != '\0')
2054		printf(" flags (%s)", &buf[1]);
2055	if (vp->v_data == NULL) {
2056		printf("\n");
2057	} else {
2058		printf("\n\t");
2059		VOP_PRINT(vp);
2060	}
2061}
2062
2063#ifdef DDB
2064#include <ddb/ddb.h>
2065/*
2066 * List all of the locked vnodes in the system.
2067 * Called when debugging the kernel.
2068 */
2069DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
2070{
2071	struct proc *p = curproc;	/* XXX */
2072	struct mount *mp, *nmp;
2073	struct vnode *vp;
2074
2075	printf("Locked vnodes\n");
2076	simple_lock(&mountlist_slock);
2077	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
2078		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
2079			nmp = mp->mnt_list.cqe_next;
2080			continue;
2081		}
2082		for (vp = mp->mnt_vnodelist.lh_first;
2083		     vp != NULL;
2084		     vp = vp->v_mntvnodes.le_next) {
2085			if (VOP_ISLOCKED(vp))
2086				vprint((char *)0, vp);
2087		}
2088		simple_lock(&mountlist_slock);
2089		nmp = mp->mnt_list.cqe_next;
2090		vfs_unbusy(mp, p);
2091	}
2092	simple_unlock(&mountlist_slock);
2093}
2094#endif
2095
2096/*
2097 * Top level filesystem related information gathering.
2098 */
2099static int	sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS);
2100
2101static int
2102vfs_sysctl SYSCTL_HANDLER_ARGS
2103{
2104	int *name = (int *)arg1 - 1;	/* XXX */
2105	u_int namelen = arg2 + 1;	/* XXX */
2106	struct vfsconf *vfsp;
2107
2108#if 1 || defined(COMPAT_PRELITE2)
2109	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2110	if (namelen == 1)
2111		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2112#endif
2113
2114#ifdef notyet
2115	/* all sysctl names at this level are at least name and field */
2116	if (namelen < 2)
2117		return (ENOTDIR);		/* overloaded */
2118	if (name[0] != VFS_GENERIC) {
2119		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2120			if (vfsp->vfc_typenum == name[0])
2121				break;
2122		if (vfsp == NULL)
2123			return (EOPNOTSUPP);
2124		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
2125		    oldp, oldlenp, newp, newlen, p));
2126	}
2127#endif
2128	switch (name[1]) {
2129	case VFS_MAXTYPENUM:
2130		if (namelen != 2)
2131			return (ENOTDIR);
2132		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2133	case VFS_CONF:
2134		if (namelen != 3)
2135			return (ENOTDIR);	/* overloaded */
2136		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2137			if (vfsp->vfc_typenum == name[2])
2138				break;
2139		if (vfsp == NULL)
2140			return (EOPNOTSUPP);
2141		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
2142	}
2143	return (EOPNOTSUPP);
2144}
2145
2146SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
2147	"Generic filesystem");
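
/*
 * Example (userland sketch, names per <sys/mount.h> and <sys/sysctl.h>):
 * the vfs.generic node above can be queried with sysctl(3) to fetch the
 * vfsconf entry for a given filesystem type number, much as
 * getvfsbyname(3) does when it walks the configured filesystems.
 */
#if 0
#include <sys/param.h>
#include <sys/mount.h>
#include <sys/sysctl.h>

static int
example_get_vfsconf(int typenum, struct vfsconf *vfc)
{
	int mib[4];
	size_t len = sizeof(*vfc);

	mib[0] = CTL_VFS;
	mib[1] = VFS_GENERIC;
	mib[2] = VFS_CONF;
	mib[3] = typenum;
	return (sysctl(mib, 4, vfc, &len, NULL, 0));
}
#endif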
2148
2149#if 1 || defined(COMPAT_PRELITE2)
2150
2151static int
2152sysctl_ovfs_conf SYSCTL_HANDLER_ARGS
2153{
2154	int error;
2155	struct vfsconf *vfsp;
2156	struct ovfsconf ovfs;
2157
2158	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
2159		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
2160		strcpy(ovfs.vfc_name, vfsp->vfc_name);
2161		ovfs.vfc_index = vfsp->vfc_typenum;
2162		ovfs.vfc_refcount = vfsp->vfc_refcount;
2163		ovfs.vfc_flags = vfsp->vfc_flags;
2164		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2165		if (error)
2166			return error;
2167	}
2168	return 0;
2169}
2170
2171#endif /* 1 || COMPAT_PRELITE2 */
2172
2173#if 0
2174#define KINFO_VNODESLOP	10
2175/*
2176 * Dump vnode list (via sysctl).
2177 * Copyout address of vnode followed by vnode.
2178 */
2179/* ARGSUSED */
2180static int
2181sysctl_vnode SYSCTL_HANDLER_ARGS
2182{
2183	struct proc *p = curproc;	/* XXX */
2184	struct mount *mp, *nmp;
2185	struct vnode *nvp, *vp;
2186	int error;
2187
2188#define VPTRSZ	sizeof (struct vnode *)
2189#define VNODESZ	sizeof (struct vnode)
2190
2191	req->lock = 0;
2192	if (!req->oldptr) /* Make an estimate */
2193		return (SYSCTL_OUT(req, 0,
2194			(numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
2195
2196	simple_lock(&mountlist_slock);
2197	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
2198		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
2199			nmp = mp->mnt_list.cqe_next;
2200			continue;
2201		}
2202again:
2203		simple_lock(&mntvnode_slock);
2204		for (vp = mp->mnt_vnodelist.lh_first;
2205		     vp != NULL;
2206		     vp = nvp) {
2207			/*
2208			 * Check that the vp is still associated with
2209			 * this filesystem.  RACE: could have been
2210			 * recycled onto the same filesystem.
2211			 */
2212			if (vp->v_mount != mp) {
2213				simple_unlock(&mntvnode_slock);
2214				goto again;
2215			}
2216			nvp = vp->v_mntvnodes.le_next;
2217			simple_unlock(&mntvnode_slock);
2218			if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
2219			    (error = SYSCTL_OUT(req, vp, VNODESZ)))
2220				return (error);
2221			simple_lock(&mntvnode_slock);
2222		}
2223		simple_unlock(&mntvnode_slock);
2224		simple_lock(&mountlist_slock);
2225		nmp = mp->mnt_list.cqe_next;
2226		vfs_unbusy(mp, p);
2227	}
2228	simple_unlock(&mountlist_slock);
2229
2230	return (0);
2231}
2232#endif
2233
2234/*
2235 * XXX
2236 * Exporting the vnode list on large systems causes them to crash.
2237 * Exporting the vnode list on medium systems causes sysctl to coredump.
2238 */
2239#if 0
2240SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
2241	0, 0, sysctl_vnode, "S,vnode", "");
2242#endif
2243
2244/*
2245 * Check to see if a filesystem is mounted on a block device.
2246 */
2247int
2248vfs_mountedon(vp)
2249	struct vnode *vp;
2250{
2251	struct vnode *vq;
2252	int error = 0;
2253
2254	if (vp->v_specmountpoint != NULL)
2255		return (EBUSY);
2256	if (vp->v_flag & VALIASED) {
2257		simple_lock(&spechash_slock);
2258		for (vq = vp->v_hashchain; vq; vq = vq->v_specnext) {
2259			if (vq->v_type != vp->v_type)
2260				continue;
2261			if (vq->v_specmountpoint != NULL) {
2262				error = EBUSY;
2263				break;
2264			}
2265		}
2266		simple_unlock(&spechash_slock);
2267	}
2268	return (error);
2269}
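
/*
 * Example (sketch): filesystem mount code is expected to reject a block
 * device that already has a filesystem mounted on it, along the lines
 * of the hypothetical helper below (ffs_mountfs() makes essentially
 * this check).
 */
#if 0
static int
example_check_devvp(devvp)
	struct vnode *devvp;
{
	int error;

	/* EBUSY if a filesystem is already mounted on the device. */
	if ((error = vfs_mountedon(devvp)) != 0)
		return (error);
	return (0);
}
#endif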
2270
2271/*
2272 * Unmount all filesystems. The list is traversed in reverse order
2273 * of mounting to avoid dependencies.
2274 */
2275void
2276vfs_unmountall()
2277{
2278	struct mount *mp, *nmp;
2279	struct proc *p;
2280	int error;
2281
2282	if (curproc != NULL)
2283		p = curproc;
2284	else
2285		p = initproc;	/* XXX XXX should this be proc0? */
2286	/*
2287	 * Since this only runs when rebooting, it is not interlocked.
2288	 */
2289	for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
2290		nmp = mp->mnt_list.cqe_prev;
2291		error = dounmount(mp, MNT_FORCE, p);
2292		if (error) {
2293			printf("unmount of %s failed (",
2294			    mp->mnt_stat.f_mntonname);
2295			if (error == EBUSY)
2296				printf("BUSY)\n");
2297			else
2298				printf("%d)\n", error);
2299		}
2300	}
2301}
2302
2303/*
2304 * Build hash lists of net addresses and hang them off the mount point.
2305 * Called by ufs_mount() to set up the lists of export addresses.
2306 */
2307static int
2308vfs_hang_addrlist(mp, nep, argp)
2309	struct mount *mp;
2310	struct netexport *nep;
2311	struct export_args *argp;
2312{
2313	register struct netcred *np;
2314	register struct radix_node_head *rnh;
2315	register int i;
2316	struct radix_node *rn;
2317	struct sockaddr *saddr, *smask = 0;
2318	struct domain *dom;
2319	int error;
2320
2321	if (argp->ex_addrlen == 0) {
2322		if (mp->mnt_flag & MNT_DEFEXPORTED)
2323			return (EPERM);
2324		np = &nep->ne_defexported;
2325		np->netc_exflags = argp->ex_flags;
2326		np->netc_anon = argp->ex_anon;
2327		np->netc_anon.cr_ref = 1;
2328		mp->mnt_flag |= MNT_DEFEXPORTED;
2329		return (0);
2330	}
2331	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
2332	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
2333	bzero((caddr_t) np, i);
2334	saddr = (struct sockaddr *) (np + 1);
2335	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
2336		goto out;
2337	if (saddr->sa_len > argp->ex_addrlen)
2338		saddr->sa_len = argp->ex_addrlen;
2339	if (argp->ex_masklen) {
2340		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
2341		error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
2342		if (error)
2343			goto out;
2344		if (smask->sa_len > argp->ex_masklen)
2345			smask->sa_len = argp->ex_masklen;
2346	}
2347	i = saddr->sa_family;
2348	if ((rnh = nep->ne_rtable[i]) == 0) {
2349		/*
2350		 * It seems silly to initialize every AF when most are not
2351		 * used; do so on demand here instead.
2352		 */
2353		for (dom = domains; dom; dom = dom->dom_next)
2354			if (dom->dom_family == i && dom->dom_rtattach) {
2355				dom->dom_rtattach((void **) &nep->ne_rtable[i],
2356				    dom->dom_rtoffset);
2357				break;
2358			}
2359		if ((rnh = nep->ne_rtable[i]) == 0) {
2360			error = ENOBUFS;
2361			goto out;
2362		}
2363	}
2364	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
2365	    np->netc_rnodes);
2366	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
2367		error = EPERM;
2368		goto out;
2369	}
2370	np->netc_exflags = argp->ex_flags;
2371	np->netc_anon = argp->ex_anon;
2372	np->netc_anon.cr_ref = 1;
2373	return (0);
2374out:
2375	free(np, M_NETADDR);
2376	return (error);
2377}
2378
2379/* ARGSUSED */
2380static int
2381vfs_free_netcred(rn, w)
2382	struct radix_node *rn;
2383	void *w;
2384{
2385	register struct radix_node_head *rnh = (struct radix_node_head *) w;
2386
2387	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
2388	free((caddr_t) rn, M_NETADDR);
2389	return (0);
2390}
2391
2392/*
2393 * Free the net address hash lists that are hanging off the mount points.
2394 */
2395static void
2396vfs_free_addrlist(nep)
2397	struct netexport *nep;
2398{
2399	register int i;
2400	register struct radix_node_head *rnh;
2401
2402	for (i = 0; i <= AF_MAX; i++)
2403		if ((rnh = nep->ne_rtable[i])) {
2404			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
2405			    (caddr_t) rnh);
2406			free((caddr_t) rnh, M_RTABLE);
2407			nep->ne_rtable[i] = 0;
2408		}
2409}
2410
2411int
2412vfs_export(mp, nep, argp)
2413	struct mount *mp;
2414	struct netexport *nep;
2415	struct export_args *argp;
2416{
2417	int error;
2418
2419	if (argp->ex_flags & MNT_DELEXPORT) {
2420		if (mp->mnt_flag & MNT_EXPUBLIC) {
2421			vfs_setpublicfs(NULL, NULL, NULL);
2422			mp->mnt_flag &= ~MNT_EXPUBLIC;
2423		}
2424		vfs_free_addrlist(nep);
2425		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
2426	}
2427	if (argp->ex_flags & MNT_EXPORTED) {
2428		if (argp->ex_flags & MNT_EXPUBLIC) {
2429			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
2430				return (error);
2431			mp->mnt_flag |= MNT_EXPUBLIC;
2432		}
2433		if ((error = vfs_hang_addrlist(mp, nep, argp)))
2434			return (error);
2435		mp->mnt_flag |= MNT_EXPORTED;
2436	}
2437	return (0);
2438}
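
/*
 * Example (sketch): an individual filesystem's mount entry point
 * normally just forwards export requests here during MNT_UPDATE; the
 * helper below is hypothetical, but mirrors the usual call shape where
 * the per-mount struct netexport lives in the filesystem's own mount
 * structure.
 */
#if 0
static int
example_update_exports(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	/* Handles both MNT_DELEXPORT and the addition of new exports. */
	return (vfs_export(mp, nep, argp));
}
#endif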
2439
2440
2441/*
2442 * Set the publicly exported filesystem (WebNFS). Currently, only
2443 * one public filesystem is possible in the spec (RFC 2054 and 2055)
2444 */
2445int
2446vfs_setpublicfs(mp, nep, argp)
2447	struct mount *mp;
2448	struct netexport *nep;
2449	struct export_args *argp;
2450{
2451	int error;
2452	struct vnode *rvp;
2453	char *cp;
2454
2455	/*
2456	 * mp == NULL -> invalidate the current info, the FS is
2457	 * no longer exported. May be called from either vfs_export
2458	 * or unmount, so check if it hasn't already been done.
2459	 */
2460	if (mp == NULL) {
2461		if (nfs_pub.np_valid) {
2462			nfs_pub.np_valid = 0;
2463			if (nfs_pub.np_index != NULL) {
2464				FREE(nfs_pub.np_index, M_TEMP);
2465				nfs_pub.np_index = NULL;
2466			}
2467		}
2468		return (0);
2469	}
2470
2471	/*
2472	 * Only one allowed at a time.
2473	 */
2474	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
2475		return (EBUSY);
2476
2477	/*
2478	 * Get real filehandle for root of exported FS.
2479	 */
2480	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
2481	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
2482
2483	if ((error = VFS_ROOT(mp, &rvp)))
2484		return (error);
2485
2486	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
2487		return (error);
2488
2489	vput(rvp);
2490
2491	/*
2492	 * If an indexfile was specified, pull it in.
2493	 */
2494	if (argp->ex_indexfile != NULL) {
2495		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
2496		    M_WAITOK);
2497		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
2498		    MAXNAMLEN, (size_t *)0);
2499		if (!error) {
2500			/*
2501			 * Check for illegal filenames.
2502			 */
2503			for (cp = nfs_pub.np_index; *cp; cp++) {
2504				if (*cp == '/') {
2505					error = EINVAL;
2506					break;
2507				}
2508			}
2509		}
2510		if (error) {
2511			FREE(nfs_pub.np_index, M_TEMP);
2512			return (error);
2513		}
2514	}
2515
2516	nfs_pub.np_mount = mp;
2517	nfs_pub.np_valid = 1;
2518	return (0);
2519}
2520
2521struct netcred *
2522vfs_export_lookup(mp, nep, nam)
2523	register struct mount *mp;
2524	struct netexport *nep;
2525	struct sockaddr *nam;
2526{
2527	register struct netcred *np;
2528	register struct radix_node_head *rnh;
2529	struct sockaddr *saddr;
2530
2531	np = NULL;
2532	if (mp->mnt_flag & MNT_EXPORTED) {
2533		/*
2534		 * Lookup in the export list first.
2535		 */
2536		if (nam != NULL) {
2537			saddr = nam;
2538			rnh = nep->ne_rtable[saddr->sa_family];
2539			if (rnh != NULL) {
2540				np = (struct netcred *)
2541					(*rnh->rnh_matchaddr)((caddr_t)saddr,
2542							      rnh);
2543				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
2544					np = NULL;
2545			}
2546		}
2547		/*
2548		 * If no address match, use the default if it exists.
2549		 */
2550		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
2551			np = &nep->ne_defexported;
2552	}
2553	return (np);
2554}
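
/*
 * Example (sketch): the NFS server uses this lookup to map a client's
 * address to its export credentials before honouring a request.  The
 * helper below is hypothetical and simplified.
 */
#if 0
static int
example_check_client(mp, nep, nam, credp)
	struct mount *mp;
	struct netexport *nep;
	struct sockaddr *nam;
	struct ucred **credp;
{
	struct netcred *np;

	np = vfs_export_lookup(mp, nep, nam);
	if (np == NULL)
		return (EACCES);	/* host not covered by any export */
	*credp = &np->netc_anon;	/* credentials to use for this host */
	return (0);
}
#endif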
2555
2556/*
2557 * Perform msync on all vnodes under a mount point.
2558 * The mount point must be locked.
2559 */
2560void
2561vfs_msync(struct mount *mp, int flags) {
2562	struct vnode *vp, *nvp;
2563	struct vm_object *obj;
2564	int anyio, tries;
2565
2566	tries = 5;
2567loop:
2568	anyio = 0;
2569	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {
2570
2571		nvp = vp->v_mntvnodes.le_next;
2572
2573		if (vp->v_mount != mp) {
2574			goto loop;
2575		}
2576
2577		if (vp->v_flag & VXLOCK)	/* XXX: what if MNT_WAIT? */
2578			continue;
2579
2580		if (flags != MNT_WAIT) {
2581			obj = vp->v_object;
2582			if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
2583				continue;
2584			if (VOP_ISLOCKED(vp))
2585				continue;
2586		}
2587
2588		simple_lock(&vp->v_interlock);
2589		if (vp->v_object &&
2590		   (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
2591			if (!vget(vp,
2592				LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
2593				if (vp->v_object) {
2594					vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : 0);
2595					anyio = 1;
2596				}
2597				vput(vp);
2598			}
2599		} else {
2600			simple_unlock(&vp->v_interlock);
2601		}
2602	}
2603	if (anyio && (--tries > 0))
2604		goto loop;
2605}
2606
2607/*
2608 * Create the VM object needed for VMIO and mmap support.  This
2609 * is done for all VREG files in the system.  Some filesystems might
2610 * want to take advantage of the additional metadata buffering capability
2611 * of the VMIO code by making the device node be VMIO mode also.
2612 *
2613 * vp must be locked when vfs_object_create is called.
2614 */
2615int
2616vfs_object_create(vp, p, cred)
2617	struct vnode *vp;
2618	struct proc *p;
2619	struct ucred *cred;
2620{
2621	struct vattr vat;
2622	vm_object_t object;
2623	int error = 0;
2624
2625	if (vp->v_type != VBLK && vn_canvmio(vp) == FALSE)
2626		return 0;
2627
2628retry:
2629	if ((object = vp->v_object) == NULL) {
2630		if (vp->v_type == VREG || vp->v_type == VDIR) {
2631			if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
2632				goto retn;
2633			object = vnode_pager_alloc(vp, vat.va_size, 0, 0);
2634		} else if (bdevsw(vp->v_rdev) != NULL) {
2635			/*
2636			 * This simply allocates the biggest object possible
2637			 * for a VBLK vnode.  This should be fixed, but doesn't
2638			 * cause any problems (yet).
2639			 */
2640			object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0);
2641		} else {
2642			goto retn;
2643		}
2644		/*
2645		 * Dereference the reference we just created.  This assumes
2646		 * that the object is associated with the vp.
2647		 */
2648		object->ref_count--;
2649		vp->v_usecount--;
2650	} else {
2651		if (object->flags & OBJ_DEAD) {
2652			VOP_UNLOCK(vp, 0, p);
2653			tsleep(object, PVM, "vodead", 0);
2654			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
2655			goto retry;
2656		}
2657	}
2658
2659	KASSERT(vp->v_object != NULL, ("vfs_object_create: NULL object"));
2660	vp->v_flag |= VOBJBUF;
2661
2662retn:
2663	return error;
2664}
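
/*
 * Example (sketch): a typical caller, such as an open path, enables
 * VMIO once the vnode is locked and known to be VMIO-capable.  The
 * helper below is illustrative only.
 */
#if 0
static int
example_enable_vmio(vp, p, cred)
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
{
	/* vp must already be locked, per the comment above. */
	if (vn_canvmio(vp) == TRUE)
		return (vfs_object_create(vp, p, cred));
	return (0);
}
#endif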
2665
2666static void
2667vfree(vp)
2668	struct vnode *vp;
2669{
2670	int s;
2671
2672	s = splbio();
2673	simple_lock(&vnode_free_list_slock);
2674	if (vp->v_flag & VTBFREE) {
2675		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
2676		vp->v_flag &= ~VTBFREE;
2677	}
2678	if (vp->v_flag & VAGE) {
2679		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2680	} else {
2681		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
2682	}
2683	freevnodes++;
2684	simple_unlock(&vnode_free_list_slock);
2685	vp->v_flag &= ~VAGE;
2686	vp->v_flag |= VFREE;
2687	splx(s);
2688}
2689
2690void
2691vbusy(vp)
2692	struct vnode *vp;
2693{
2694	int s;
2695
2696	s = splbio();
2697	simple_lock(&vnode_free_list_slock);
2698	if (vp->v_flag & VTBFREE) {
2699		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
2700		vp->v_flag &= ~VTBFREE;
2701	} else {
2702		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2703		freevnodes--;
2704	}
2705	simple_unlock(&vnode_free_list_slock);
2706	vp->v_flag &= ~(VFREE|VAGE);
2707	splx(s);
2708}
2709
2710/*
2711 * Record a process's interest in events which might happen to
2712 * a vnode.  Because poll uses the historic select-style interface
2713 * internally, this routine serves as both the ``check for any
2714 * pending events'' and the ``record my interest in future events''
2715 * functions.  (These are done together, while the lock is held,
2716 * to avoid race conditions.)
2717 */
2718int
2719vn_pollrecord(vp, p, events)
2720	struct vnode *vp;
2721	struct proc *p;
2722	short events;
2723{
2724	simple_lock(&vp->v_pollinfo.vpi_lock);
2725	if (vp->v_pollinfo.vpi_revents & events) {
2726		/*
2727		 * This leaves events we are not interested
2728		 * in available for the other process which
2729		 * in available for the other process which
2730		 * presumably had requested them
2731		 * recorded).
2732		 */
2733		events &= vp->v_pollinfo.vpi_revents;
2734		vp->v_pollinfo.vpi_revents &= ~events;
2735
2736		simple_unlock(&vp->v_pollinfo.vpi_lock);
2737		return events;
2738	}
2739	vp->v_pollinfo.vpi_events |= events;
2740	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
2741	simple_unlock(&vp->v_pollinfo.vpi_lock);
2742	return 0;
2743}
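
/*
 * Example (sketch): a filesystem's VOP_POLL implementation that has no
 * ready events of its own can simply return vn_pollrecord(), which both
 * reports any previously noted events and records the caller's interest.
 * The vnode operation below is hypothetical.
 */
#if 0
static int
example_poll(ap)
	struct vop_poll_args /* {
		struct vnode *a_vp;
		int a_events;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{
	return (vn_pollrecord(ap->a_vp, ap->a_p, ap->a_events));
}
#endif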
2744
2745/*
2746 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
2747 * it is possible for us to miss an event due to race conditions, but
2748 * that condition is expected to be rare, so for the moment it is the
2749 * preferred interface.
2750 */
2751void
2752vn_pollevent(vp, events)
2753	struct vnode *vp;
2754	short events;
2755{
2756	simple_lock(&vp->v_pollinfo.vpi_lock);
2757	if (vp->v_pollinfo.vpi_events & events) {
2758		/*
2759		 * We clear vpi_events so that we don't
2760		 * call selwakeup() twice if two events are
2761		 * posted before the polling process(es) is
2762		 * awakened.  This also ensures that we take at
2763		 * most one selwakeup() if the polling process
2764		 * is no longer interested.  However, it does
2765		 * mean that only one event can be noticed at
2766		 * a time.  (Perhaps we should only clear those
2767		 * event bits which we note?) XXX
2768		 */
2769		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
2770		vp->v_pollinfo.vpi_revents |= events;
2771		selwakeup(&vp->v_pollinfo.vpi_selinfo);
2772	}
2773	simple_unlock(&vp->v_pollinfo.vpi_lock);
2774}
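
/*
 * Example (sketch): filesystems post events through the VN_POLLEVENT()
 * macro, which only calls vn_pollevent() when some process has actually
 * registered interest in the vnode.  A directory-modifying operation
 * might note the change roughly as below; POLLWRITE is assumed to be
 * one of the FreeBSD-specific poll bits from <sys/poll.h>.
 */
#if 0
static void
example_note_dir_write(dvp)
	struct vnode *dvp;
{
	/* Cheap when nobody is polling; see the comment above vn_pollevent(). */
	VN_POLLEVENT(dvp, POLLWRITE);
}
#endif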
2775
2776/*
2777 * Wake up anyone polling on vp because it is being revoked.
2778 * This depends on dead_poll() returning POLLHUP for correct
2779 * behavior.
2780 */
2781void
2782vn_pollgone(vp)
2783	struct vnode *vp;
2784{
2785	simple_lock(&vp->v_pollinfo.vpi_lock);
2786	if (vp->v_pollinfo.vpi_events) {
2787		vp->v_pollinfo.vpi_events = 0;
2788		selwakeup(&vp->v_pollinfo.vpi_selinfo);
2789	}
2790	simple_unlock(&vp->v_pollinfo.vpi_lock);
2791}
2792
2793
2794
2795/*
2796 * Routine to create and manage a filesystem syncer vnode.
2797 */
2798#define sync_close ((int (*) __P((struct  vop_close_args *)))nullop)
2799static int	sync_fsync __P((struct  vop_fsync_args *));
2800static int	sync_inactive __P((struct  vop_inactive_args *));
2801static int	sync_reclaim  __P((struct  vop_reclaim_args *));
2802#define sync_lock ((int (*) __P((struct  vop_lock_args *)))vop_nolock)
2803#define sync_unlock ((int (*) __P((struct  vop_unlock_args *)))vop_nounlock)
2804static int	sync_print __P((struct vop_print_args *));
2805#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)
2806
2807static vop_t **sync_vnodeop_p;
2808static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
2809	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
2810	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
2811	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
2812	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
2813	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
2814	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
2815	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
2816	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
2817	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
2818	{ NULL, NULL }
2819};
2820static struct vnodeopv_desc sync_vnodeop_opv_desc =
2821	{ &sync_vnodeop_p, sync_vnodeop_entries };
2822
2823VNODEOP_SET(sync_vnodeop_opv_desc);
2824
2825/*
2826 * Create a new filesystem syncer vnode for the specified mount point.
2827 */
2828int
2829vfs_allocate_syncvnode(mp)
2830	struct mount *mp;
2831{
2832	struct vnode *vp;
2833	static long start, incr, next;
2834	int error;
2835
2836	/* Allocate a new vnode */
2837	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
2838		mp->mnt_syncer = NULL;
2839		return (error);
2840	}
2841	vp->v_type = VNON;
2842	/*
2843	 * Place the vnode onto the syncer worklist. We attempt to
2844	 * scatter them about on the list so that they will go off
2845	 * at evenly distributed times even if all the filesystems
2846	 * are mounted at once.
2847	 */
2848	next += incr;
2849	if (next == 0 || next > syncer_maxdelay) {
2850		start /= 2;
2851		incr /= 2;
2852		if (start == 0) {
2853			start = syncer_maxdelay / 2;
2854			incr = syncer_maxdelay;
2855		}
2856		next = start;
2857	}
2858	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
2859	mp->mnt_syncer = vp;
2860	return (0);
2861}
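
/*
 * Worked example (assuming the usual defaults of syncer_maxdelay == 32
 * and syncdelay == 30): the static (start, incr, next) triple begins as
 * (0, 0, 0), so the first syncer vnode gets start = 16, incr = 32 and
 * lands in worklist slot 16.  The second pushes next to 48 (> 32), which
 * halves the stride to start = 8, incr = 16 and yields slot 8; later
 * mounts then land in slots 24, 4, 12, 20, 28, 2, ..., spreading syncer
 * activity across the wheel instead of clustering it in one second.
 */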
2862
2863/*
2864 * Do a lazy sync of the filesystem.
2865 */
2866static int
2867sync_fsync(ap)
2868	struct vop_fsync_args /* {
2869		struct vnode *a_vp;
2870		struct ucred *a_cred;
2871		int a_waitfor;
2872		struct proc *a_p;
2873	} */ *ap;
2874{
2875	struct vnode *syncvp = ap->a_vp;
2876	struct mount *mp = syncvp->v_mount;
2877	struct proc *p = ap->a_p;
2878	int asyncflag;
2879
2880	/*
2881	 * We only need to do something if this is a lazy evaluation.
2882	 */
2883	if (ap->a_waitfor != MNT_LAZY)
2884		return (0);
2885
2886	/*
2887	 * Move ourselves to the back of the sync list.
2888	 */
2889	vn_syncer_add_to_worklist(syncvp, syncdelay);
2890
2891	/*
2892	 * Walk the list of vnodes pushing all that are dirty and
2893	 * not already on the sync list.
2894	 */
2895	simple_lock(&mountlist_slock);
2896	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
2897		simple_unlock(&mountlist_slock);
2898		return (0);
2899	}
2900	asyncflag = mp->mnt_flag & MNT_ASYNC;
2901	mp->mnt_flag &= ~MNT_ASYNC;
2902	vfs_msync(mp, MNT_NOWAIT);
2903	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
2904	if (asyncflag)
2905		mp->mnt_flag |= MNT_ASYNC;
2906	vfs_unbusy(mp, p);
2907	return (0);
2908}
2909
2910/*
2911 * The syncer vnode is no longer referenced.
2912 */
2913static int
2914sync_inactive(ap)
2915	struct vop_inactive_args /* {
2916		struct vnode *a_vp;
2917		struct proc *a_p;
2918	} */ *ap;
2919{
2920
2921	vgone(ap->a_vp);
2922	return (0);
2923}
2924
2925/*
2926 * The syncer vnode is no longer needed and is being decommissioned.
2927 *
2928 * Modifications to the worklist must be protected at splbio().
2929 */
2930static int
2931sync_reclaim(ap)
2932	struct vop_reclaim_args /* {
2933		struct vnode *a_vp;
2934	} */ *ap;
2935{
2936	struct vnode *vp = ap->a_vp;
2937	int s;
2938
2939	s = splbio();
2940	vp->v_mount->mnt_syncer = NULL;
2941	if (vp->v_flag & VONWORKLST) {
2942		LIST_REMOVE(vp, v_synclist);
2943		vp->v_flag &= ~VONWORKLST;
2944	}
2945	splx(s);
2946
2947	return (0);
2948}
2949
2950/*
2951 * Print out a syncer vnode.
2952 */
2953static int
2954sync_print(ap)
2955	struct vop_print_args /* {
2956		struct vnode *a_vp;
2957	} */ *ap;
2958{
2959	struct vnode *vp = ap->a_vp;
2960
2961	printf("syncer vnode");
2962	if (vp->v_vnlock != NULL)
2963		lockmgr_printinfo(vp->v_vnlock);
2964	printf("\n");
2965	return (0);
2966}
2967
2968/*
2969 * Extract the dev_t from a VBLK or VCHR vnode.
2970 */
2971dev_t
2972vn_todev(vp)
2973	struct vnode *vp;
2974{
2975	if (vp->v_type != VBLK && vp->v_type != VCHR)
2976		return (NODEV);
2977	return (vp->v_rdev);
2978}
2979