vfs_export.c revision 43311
1/*
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
39 * $Id: vfs_subr.c,v 1.183 1999/01/21 08:29:05 dillon Exp $
40 */
41
42/*
43 * External virtual filesystem routines
44 */
45#include "opt_ddb.h"
46
47#include <sys/param.h>
48#include <sys/systm.h>
49#include <sys/conf.h>
50#include <sys/fcntl.h>
51#include <sys/kernel.h>
52#include <sys/proc.h>
53#include <sys/malloc.h>
54#include <sys/mount.h>
55#include <sys/socket.h>
56#include <sys/vnode.h>
57#include <sys/stat.h>
58#include <sys/buf.h>
59#include <sys/domain.h>
60#include <sys/dirent.h>
61#include <sys/vmmeter.h>
62
63#include <machine/limits.h>
64
65#include <vm/vm.h>
66#include <vm/vm_param.h>
67#include <vm/vm_prot.h>
68#include <vm/vm_object.h>
69#include <vm/vm_extern.h>
70#include <vm/pmap.h>
71#include <vm/vm_map.h>
72#include <vm/vm_page.h>
73#include <vm/vm_pager.h>
74#include <vm/vnode_pager.h>
75#include <vm/vm_zone.h>
76#include <sys/sysctl.h>
77
78#include <miscfs/specfs/specdev.h>
79
80static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
81
82static void	insmntque __P((struct vnode *vp, struct mount *mp));
83static void	vclean __P((struct vnode *vp, int flags, struct proc *p));
84static void	vfree __P((struct vnode *));
85static void	vgonel __P((struct vnode *vp, struct proc *p));
86static unsigned long	numvnodes;
87SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
88
89enum vtype iftovt_tab[16] = {
90	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
91	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
92};
93int vttoif_tab[9] = {
94	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
95	S_IFSOCK, S_IFIFO, S_IFMT,
96};
97
98static TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */
99struct tobefreelist vnode_tobefree_list;	/* vnodes queued to be freed */
100
101static u_long wantfreevnodes = 25;
102SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
103static u_long freevnodes = 0;
104SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
105
106int vfs_ioopt = 0;
107#ifdef ENABLE_VFS_IOOPT
108SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
109#endif
110
111struct mntlist mountlist;	/* mounted filesystem list */
112struct simplelock mountlist_slock;
113struct simplelock mntvnode_slock;
114int	nfs_mount_type = -1;
115#ifndef NULL_SIMPLELOCKS
116static struct simplelock mntid_slock;
117static struct simplelock vnode_free_list_slock;
118static struct simplelock spechash_slock;
119#endif
120struct nfs_public nfs_pub;	/* publicly exported FS */
121static vm_zone_t vnode_zone;
122
123/*
124 * The workitem queue.
125 */
126#define SYNCER_MAXDELAY		32
127static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
128time_t syncdelay =		30;
129int rushjob;				/* number of slots to run ASAP */
130
131static int syncer_delayno = 0;
132static long syncer_mask;
133LIST_HEAD(synclist, vnode);
134static struct synclist *syncer_workitem_pending;
135
136int desiredvnodes;
137SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "");
138
139static void	vfs_free_addrlist __P((struct netexport *nep));
140static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
141static int	vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
142				       struct export_args *argp));
143
144/*
145 * Initialize the vnode management data structures.
146 */
147void
148vntblinit()
149{
150
151	desiredvnodes = maxproc + cnt.v_page_count / 4;
152	simple_lock_init(&mntvnode_slock);
153	simple_lock_init(&mntid_slock);
154	simple_lock_init(&spechash_slock);
155	TAILQ_INIT(&vnode_free_list);
156	TAILQ_INIT(&vnode_tobefree_list);
157	simple_lock_init(&vnode_free_list_slock);
158	CIRCLEQ_INIT(&mountlist);
159	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
160	/*
161	 * Initialize the filesystem syncer.
162	 */
163	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
164		&syncer_mask);
165	syncer_maxdelay = syncer_mask + 1;
166}
167
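/*
 * Note on the syncer table sizing in vntblinit() above: hashinit()
 * allocates a power-of-two number of buckets and returns that size minus
 * one in syncer_mask, so with SYNCER_MAXDELAY of 32 the table gets 32
 * buckets, syncer_mask is 31, and syncer_maxdelay is recomputed as 32.
 */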
168/*
169 * Mark a mount point as busy. Used to synchronize access and to delay
170 * unmounting. Interlock is not released on failure.
171 */
172int
173vfs_busy(mp, flags, interlkp, p)
174	struct mount *mp;
175	int flags;
176	struct simplelock *interlkp;
177	struct proc *p;
178{
179	int lkflags;
180
181	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
182		if (flags & LK_NOWAIT)
183			return (ENOENT);
184		mp->mnt_kern_flag |= MNTK_MWAIT;
185		if (interlkp) {
186			simple_unlock(interlkp);
187		}
188		/*
189		 * Since all busy locks are shared except the exclusive
190		 * lock granted when unmounting, the only place that a
191		 * wakeup needs to be done is at the release of the
192		 * exclusive lock at the end of dounmount.
193		 */
194		tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
195		if (interlkp) {
196			simple_lock(interlkp);
197		}
198		return (ENOENT);
199	}
200	lkflags = LK_SHARED | LK_NOPAUSE;
201	if (interlkp)
202		lkflags |= LK_INTERLOCK;
203	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
204		panic("vfs_busy: unexpected lock failure");
205	return (0);
206}
207
208/*
209 * Free a busy filesystem.
210 */
211void
212vfs_unbusy(mp, p)
213	struct mount *mp;
214	struct proc *p;
215{
216
217	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
218}
219
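/*
 * The mountlist scans later in this file use vfs_busy()/vfs_unbusy()
 * roughly as (sketch):
 *
 *	if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p) == 0) {
 *		...examine the mount...
 *		vfs_unbusy(mp, p);
 *	}
 */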
220/*
221 * Lookup a filesystem type, and if found allocate and initialize
222 * a mount structure for it.
223 *
224 * Devname is usually updated by mount(8) after booting.
225 */
226int
227vfs_rootmountalloc(fstypename, devname, mpp)
228	char *fstypename;
229	char *devname;
230	struct mount **mpp;
231{
232	struct proc *p = curproc;	/* XXX */
233	struct vfsconf *vfsp;
234	struct mount *mp;
235
236	if (fstypename == NULL)
237		return (ENODEV);
238	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
239		if (!strcmp(vfsp->vfc_name, fstypename))
240			break;
241	if (vfsp == NULL)
242		return (ENODEV);
243	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
244	bzero((char *)mp, (u_long)sizeof(struct mount));
245	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
246	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
247	LIST_INIT(&mp->mnt_vnodelist);
248	mp->mnt_vfc = vfsp;
249	mp->mnt_op = vfsp->vfc_vfsops;
250	mp->mnt_flag = MNT_RDONLY;
251	mp->mnt_vnodecovered = NULLVP;
252	vfsp->vfc_refcount++;
253	mp->mnt_stat.f_type = vfsp->vfc_typenum;
254	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
255	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
256	mp->mnt_stat.f_mntonname[0] = '/';
257	mp->mnt_stat.f_mntonname[1] = 0;
258	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
259	*mpp = mp;
260	return (0);
261}
262
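/*
 * A filesystem's mountroot routine would typically call
 * vfs_rootmountalloc() along these lines (sketch; the device name string
 * is illustrative only):
 *
 *	if ((error = vfs_rootmountalloc("ufs", "root_device", &mp)) != 0)
 *		return (error);
 */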
263/*
264 * Find an appropriate filesystem to use for the root. If a filesystem
265 * has not been preselected, walk through the list of known filesystems
266 * trying those that have mountroot routines, and try them until one
267 * works or we have tried them all.
268 */
269#ifdef notdef	/* XXX JH */
270int
271lite2_vfs_mountroot()
272{
273	struct vfsconf *vfsp;
274	extern int (*lite2_mountroot) __P((void));
275	int error;
276
277	if (lite2_mountroot != NULL)
278		return ((*lite2_mountroot)());
279	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
280		if (vfsp->vfc_mountroot == NULL)
281			continue;
282		if ((error = (*vfsp->vfc_mountroot)()) == 0)
283			return (0);
284		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
285	}
286	return (ENODEV);
287}
288#endif
289
290/*
291 * Lookup a mount point by filesystem identifier.
292 */
293struct mount *
294vfs_getvfs(fsid)
295	fsid_t *fsid;
296{
297	register struct mount *mp;
298
299	simple_lock(&mountlist_slock);
300	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
301	    mp = mp->mnt_list.cqe_next) {
302		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
303		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
304			simple_unlock(&mountlist_slock);
305			return (mp);
306		}
307	}
308	simple_unlock(&mountlist_slock);
309	return ((struct mount *) 0);
310}
311
312/*
313 * Get a new unique fsid
314 */
315void
316vfs_getnewfsid(mp)
317	struct mount *mp;
318{
319	static u_short xxxfs_mntid;
320
321	fsid_t tfsid;
322	int mtype;
323
324	simple_lock(&mntid_slock);
325	mtype = mp->mnt_vfc->vfc_typenum;
326	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
327	mp->mnt_stat.f_fsid.val[1] = mtype;
328	if (xxxfs_mntid == 0)
329		++xxxfs_mntid;
330	tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
331	tfsid.val[1] = mtype;
332	if (mountlist.cqh_first != (void *)&mountlist) {
333		while (vfs_getvfs(&tfsid)) {
334			tfsid.val[0]++;
335			xxxfs_mntid++;
336		}
337	}
338	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
339	simple_unlock(&mntid_slock);
340}
341
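/*
 * For example, if nblkdev were 30, the first mount of a filesystem whose
 * type number is 1 would leave vfs_getnewfsid() above with an fsid of
 * (makedev(31, 1), 1); later mounts of the same type keep bumping the
 * minor number (and xxxfs_mntid) until vfs_getvfs() no longer finds a
 * match.
 */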
342/*
343 * Set vnode attributes to VNOVAL
344 */
345void
346vattr_null(vap)
347	register struct vattr *vap;
348{
349
350	vap->va_type = VNON;
351	vap->va_size = VNOVAL;
352	vap->va_bytes = VNOVAL;
353	vap->va_mode = VNOVAL;
354	vap->va_nlink = VNOVAL;
355	vap->va_uid = VNOVAL;
356	vap->va_gid = VNOVAL;
357	vap->va_fsid = VNOVAL;
358	vap->va_fileid = VNOVAL;
359	vap->va_blocksize = VNOVAL;
360	vap->va_rdev = VNOVAL;
361	vap->va_atime.tv_sec = VNOVAL;
362	vap->va_atime.tv_nsec = VNOVAL;
363	vap->va_mtime.tv_sec = VNOVAL;
364	vap->va_mtime.tv_nsec = VNOVAL;
365	vap->va_ctime.tv_sec = VNOVAL;
366	vap->va_ctime.tv_nsec = VNOVAL;
367	vap->va_flags = VNOVAL;
368	vap->va_gen = VNOVAL;
369	vap->va_vaflags = 0;
370}
371
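/*
 * After vattr_null(), a caller sets only the fields it wants changed, so
 * a setattr implementation can test each member against VNOVAL, e.g.
 * (sketch):
 *
 *	if (vap->va_size != VNOVAL)
 *		error = ...apply the new size...;
 */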
372/*
373 * Routines having to do with the management of the vnode table.
374 */
375extern vop_t **dead_vnodeop_p;
376
377/*
378 * Return the next vnode from the free list.
379 */
380int
381getnewvnode(tag, mp, vops, vpp)
382	enum vtagtype tag;
383	struct mount *mp;
384	vop_t **vops;
385	struct vnode **vpp;
386{
387	int s;
388	struct proc *p = curproc;	/* XXX */
389	struct vnode *vp, *tvp, *nvp;
390	vm_object_t object;
391	TAILQ_HEAD(freelst, vnode) vnode_tmp_list;
392
393	/*
394	 * We take the least recently used vnode from the freelist
395	 * if we can get it and it has no cached pages and no
396	 * namecache entries referring to it.
397	 * Otherwise we allocate a new vnode.
398	 */
399
400	s = splbio();
401	simple_lock(&vnode_free_list_slock);
402	TAILQ_INIT(&vnode_tmp_list);
403
404	for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) {
405		nvp = TAILQ_NEXT(vp, v_freelist);
406		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
407		if (vp->v_flag & VAGE) {
408			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
409		} else {
410			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
411		}
412		vp->v_flag &= ~(VTBFREE|VAGE);
413		vp->v_flag |= VFREE;
414		if (vp->v_usecount)
415			panic("tobe free vnode isn't");
416		freevnodes++;
417	}
418
419	if (wantfreevnodes && freevnodes < wantfreevnodes) {
420		vp = NULL;
421	} else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
422		/*
423		 * XXX: this is only here to be backwards compatible
424		 */
425		vp = NULL;
426	} else {
427		for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) {
428			nvp = TAILQ_NEXT(vp, v_freelist);
429			if (!simple_lock_try(&vp->v_interlock))
430				continue;
431			if (vp->v_usecount)
432				panic("free vnode isn't");
433
434			object = vp->v_object;
435			if (object && (object->resident_page_count || object->ref_count)) {
436				printf("object inconsistent state: RPC: %d, RC: %d\n",
437					object->resident_page_count, object->ref_count);
438				/* Don't recycle if it's caching some pages */
439				TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
440				TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist);
441				continue;
442			} else if (LIST_FIRST(&vp->v_cache_src)) {
443				/* Don't recycle if active in the namecache */
444				simple_unlock(&vp->v_interlock);
445				continue;
446			} else {
447				break;
448			}
449		}
450	}
451
452	for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) {
453		nvp = TAILQ_NEXT(tvp, v_freelist);
454		TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist);
455		TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist);
456		simple_unlock(&tvp->v_interlock);
457	}
458
459	if (vp) {
460		vp->v_flag |= VDOOMED;
461		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
462		freevnodes--;
463		simple_unlock(&vnode_free_list_slock);
464		cache_purge(vp);
465		vp->v_lease = NULL;
466		if (vp->v_type != VBAD) {
467			vgonel(vp, p);
468		} else {
469			simple_unlock(&vp->v_interlock);
470		}
471
472#ifdef INVARIANTS
473		{
474			int s;
475
476			if (vp->v_data)
477				panic("cleaned vnode isn't");
478			s = splbio();
479			if (vp->v_numoutput)
480				panic("Clean vnode has pending I/O's");
481			splx(s);
482		}
483#endif
484		vp->v_flag = 0;
485		vp->v_lastr = 0;
486		vp->v_lastw = 0;
487		vp->v_lasta = 0;
488		vp->v_cstart = 0;
489		vp->v_clen = 0;
490		vp->v_socket = 0;
491		vp->v_writecount = 0;	/* XXX */
492		vp->v_maxio = 0;
493	} else {
494		simple_unlock(&vnode_free_list_slock);
495		vp = (struct vnode *) zalloc(vnode_zone);
496		bzero((char *) vp, sizeof *vp);
497		simple_lock_init(&vp->v_interlock);
498		vp->v_dd = vp;
499		cache_purge(vp);
500		LIST_INIT(&vp->v_cache_src);
501		TAILQ_INIT(&vp->v_cache_dst);
502		numvnodes++;
503	}
504
505	TAILQ_INIT(&vp->v_cleanblkhd);
506	TAILQ_INIT(&vp->v_dirtyblkhd);
507	vp->v_type = VNON;
508	vp->v_tag = tag;
509	vp->v_op = vops;
510	insmntque(vp, mp);
511	*vpp = vp;
512	vp->v_usecount = 1;
513	vp->v_data = 0;
514	splx(s);
515
516	vfs_object_create(vp, p, p->p_ucred);
517	return (0);
518}
519
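/*
 * A filesystem's vget-style routine obtains a fresh vnode from
 * getnewvnode() roughly as follows (sketch, UFS names assumed):
 *
 *	error = getnewvnode(VT_UFS, mp, ffs_vnodeop_p, &vp);
 *	if (error)
 *		return (error);
 */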
520/*
521 * Move a vnode from one mount queue to another.
522 */
523static void
524insmntque(vp, mp)
525	register struct vnode *vp;
526	register struct mount *mp;
527{
528
529	simple_lock(&mntvnode_slock);
530	/*
531	 * Delete from old mount point vnode list, if on one.
532	 */
533	if (vp->v_mount != NULL)
534		LIST_REMOVE(vp, v_mntvnodes);
535	/*
536	 * Insert into list of vnodes for the new mount point, if available.
537	 */
538	if ((vp->v_mount = mp) == NULL) {
539		simple_unlock(&mntvnode_slock);
540		return;
541	}
542	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
543	simple_unlock(&mntvnode_slock);
544}
545
546/*
547 * Update outstanding I/O count and do wakeup if requested.
548 */
549void
550vwakeup(bp)
551	register struct buf *bp;
552{
553	register struct vnode *vp;
554
555	bp->b_flags &= ~B_WRITEINPROG;
556	if ((vp = bp->b_vp)) {
557		vp->v_numoutput--;
558		if (vp->v_numoutput < 0)
559			panic("vwakeup: neg numoutput");
560		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
561			vp->v_flag &= ~VBWAIT;
562			wakeup((caddr_t) &vp->v_numoutput);
563		}
564	}
565}
566
567/*
568 * Flush out and invalidate all buffers associated with a vnode.
569 * Called with the underlying object locked.
570 */
571int
572vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
573	register struct vnode *vp;
574	int flags;
575	struct ucred *cred;
576	struct proc *p;
577	int slpflag, slptimeo;
578{
579	register struct buf *bp;
580	struct buf *nbp, *blist;
581	int s, error;
582	vm_object_t object;
583
584	if (flags & V_SAVE) {
585		s = splbio();
586		while (vp->v_numoutput) {
587			vp->v_flag |= VBWAIT;
588			error = tsleep((caddr_t)&vp->v_numoutput,
589			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
590			if (error) {
591				splx(s);
592				return (error);
593			}
594		}
595		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
596			splx(s);
597			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
598				return (error);
599			s = splbio();
600			if (vp->v_numoutput > 0 ||
601			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
602				panic("vinvalbuf: dirty bufs");
603		}
604		splx(s);
605  	}
606	s = splbio();
607	for (;;) {
608		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
609		if (!blist)
610			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
611		if (!blist)
612			break;
613
614		for (bp = blist; bp; bp = nbp) {
615			nbp = TAILQ_NEXT(bp, b_vnbufs);
616			if (bp->b_flags & B_BUSY) {
617				bp->b_flags |= B_WANTED;
618				error = tsleep((caddr_t) bp,
619				    slpflag | (PRIBIO + 4), "vinvalbuf",
620				    slptimeo);
621				if (error) {
622					splx(s);
623					return (error);
624				}
625				break;
626			}
627			/*
628			 * XXX Since there are no node locks for NFS, I
629			 * believe there is a slight chance that a delayed
630			 * write will occur while sleeping just above, so
631			 * check for it.  Note that vfs_bio_awrite expects
632			 * buffers to reside on a queue, while VOP_BWRITE and
633			 * brelse do not.
634			 */
635			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
636				(flags & V_SAVE)) {
637
638				if (bp->b_vp == vp) {
639					if (bp->b_flags & B_CLUSTEROK) {
640						vfs_bio_awrite(bp);
641					} else {
642						bremfree(bp);
643						bp->b_flags |= (B_BUSY | B_ASYNC);
644						VOP_BWRITE(bp);
645					}
646				} else {
647					bremfree(bp);
648					bp->b_flags |= B_BUSY;
649					(void) VOP_BWRITE(bp);
650				}
651				break;
652			}
653			bremfree(bp);
654			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF | B_BUSY);
655			bp->b_flags &= ~B_ASYNC;
656			brelse(bp);
657		}
658	}
659
660	while (vp->v_numoutput > 0) {
661		vp->v_flag |= VBWAIT;
662		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
663	}
664
665	splx(s);
666
667	/*
668	 * Destroy the copy in the VM cache, too.
669	 */
670	simple_lock(&vp->v_interlock);
671	object = vp->v_object;
672	if (object != NULL) {
673		vm_object_page_remove(object, 0, 0,
674			(flags & V_SAVE) ? TRUE : FALSE);
675	}
676	simple_unlock(&vp->v_interlock);
677
678	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
679		panic("vinvalbuf: flush failed");
680	return (0);
681}
682
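/*
 * vclean() below, for example, flushes a vnode with
 *
 *	vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
 *
 * so dirty buffers are written before being released; a caller that does
 * not care about the data passes 0 for flags and the buffers are simply
 * discarded.
 */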
683/*
684 * Truncate a file's buffer and pages to a specified length.  This
685 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
686 * sync activity.
687 */
688int
689vtruncbuf(vp, cred, p, length, blksize)
690	register struct vnode *vp;
691	struct ucred *cred;
692	struct proc *p;
693	off_t length;
694	int blksize;
695{
696	register struct buf *bp;
697	struct buf *nbp;
698	int s, anyfreed;
699	int trunclbn;
700
701	/*
702	 * Round up to the *next* lbn.
703	 */
704	trunclbn = (length + blksize - 1) / blksize;
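	/*
	 * With a blksize of 512, for example, a length of 1000 rounds up
	 * to trunclbn 2, so buffers at lblkno 2 and beyond are released
	 * while the partially valid block 1 is preserved.
	 */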
705
706	s = splbio();
707restart:
708	anyfreed = 1;
709	for (;anyfreed;) {
710		anyfreed = 0;
711		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
712			nbp = TAILQ_NEXT(bp, b_vnbufs);
713			if (bp->b_lblkno >= trunclbn) {
714				if (bp->b_flags & B_BUSY) {
715					bp->b_flags |= B_WANTED;
716					tsleep(bp, PRIBIO + 4, "vtrb1", 0);
717					goto restart;
718				} else {
719					bremfree(bp);
720					bp->b_flags |= (B_BUSY | B_INVAL | B_RELBUF);
721					bp->b_flags &= ~B_ASYNC;
722					brelse(bp);
723					anyfreed = 1;
724				}
725				if (nbp && (((nbp->b_xflags & B_VNCLEAN) == 0)||
726					 (nbp->b_vp != vp) ||
727					 (nbp->b_flags & B_DELWRI))) {
728					goto restart;
729				}
730			}
731		}
732
733		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
734			nbp = TAILQ_NEXT(bp, b_vnbufs);
735			if (bp->b_lblkno >= trunclbn) {
736				if (bp->b_flags & B_BUSY) {
737					bp->b_flags |= B_WANTED;
738					tsleep(bp, PRIBIO + 4, "vtrb2", 0);
739					goto restart;
740				} else {
741					bremfree(bp);
742					bp->b_flags |= (B_BUSY | B_INVAL | B_RELBUF);
743					bp->b_flags &= ~B_ASYNC;
744					brelse(bp);
745					anyfreed = 1;
746				}
747				if (nbp && (((nbp->b_xflags & B_VNDIRTY) == 0)||
748					 (nbp->b_vp != vp) ||
749					 (nbp->b_flags & B_DELWRI) == 0)) {
750					goto restart;
751				}
752			}
753		}
754	}
755
756	if (length > 0) {
757restartsync:
758		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
759			nbp = TAILQ_NEXT(bp, b_vnbufs);
760			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
761				if (bp->b_flags & B_BUSY) {
762					bp->b_flags |= B_WANTED;
763					tsleep(bp, PRIBIO, "vtrb3", 0);
764				} else {
765					bremfree(bp);
766					bp->b_flags |= B_BUSY;
767					if (bp->b_vp == vp) {
768						bp->b_flags |= B_ASYNC;
769					} else {
770						bp->b_flags &= ~B_ASYNC;
771					}
772					VOP_BWRITE(bp);
773				}
774				goto restartsync;
775			}
776
777		}
778	}
779
780	while (vp->v_numoutput > 0) {
781		vp->v_flag |= VBWAIT;
782		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
783	}
784
785	splx(s);
786
787	vnode_pager_setsize(vp, length);
788
789	return (0);
790}
791
792/*
793 * Associate a buffer with a vnode.
794 */
795void
796bgetvp(vp, bp)
797	register struct vnode *vp;
798	register struct buf *bp;
799{
800	int s;
801
802	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
803
804	vhold(vp);
805	bp->b_vp = vp;
806	if (vp->v_type == VBLK || vp->v_type == VCHR)
807		bp->b_dev = vp->v_rdev;
808	else
809		bp->b_dev = NODEV;
810	/*
811	 * Insert onto list for new vnode.
812	 */
813	s = splbio();
814	bp->b_xflags |= B_VNCLEAN;
815	bp->b_xflags &= ~B_VNDIRTY;
816	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
817	splx(s);
818}
819
820/*
821 * Disassociate a buffer from a vnode.
822 */
823void
824brelvp(bp)
825	register struct buf *bp;
826{
827	struct vnode *vp;
828	struct buflists *listheadp;
829	int s;
830
831	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
832
833	/*
834	 * Delete from old vnode list, if on one.
835	 */
836	vp = bp->b_vp;
837	s = splbio();
838	if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) {
839		if (bp->b_xflags & B_VNDIRTY)
840			listheadp = &vp->v_dirtyblkhd;
841		else
842			listheadp = &vp->v_cleanblkhd;
843		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
844		bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN);
845	}
846	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
847		vp->v_flag &= ~VONWORKLST;
848		LIST_REMOVE(vp, v_synclist);
849	}
850	splx(s);
851	bp->b_vp = (struct vnode *) 0;
852	vdrop(vp);
853}
854
855/*
856 * The workitem queue.
857 *
858 * It is useful to delay writes of file data and filesystem metadata
859 * for tens of seconds so that quickly created and deleted files need
860 * not waste disk bandwidth being created and removed. To realize this,
861 * we append vnodes to a "workitem" queue. When running with a soft
862 * updates implementation, most pending metadata dependencies should
863 * not wait for more than a few seconds. Thus, mounted block devices
864 * are delayed only about half the time that file data is delayed.
865 * Similarly, directory updates are more critical, so they are delayed
866 * only about a third of the time that file data is delayed. Thus, there are
867 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
868 * one each second (driven off the filesystem syncer process). The
869 * syncer_delayno variable indicates the next queue that is to be processed.
870 * Items that need to be processed soon are placed in this queue:
871 *
872 *	syncer_workitem_pending[syncer_delayno]
873 *
874 * A delay of fifteen seconds is done by placing the request fifteen
875 * entries later in the queue:
876 *
877 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
878 *
879 */
880
881/*
882 * Add an item to the syncer work queue.
883 */
884void
885vn_syncer_add_to_worklist(vp, delay)
886	struct vnode *vp;
887	int delay;
888{
889	int s, slot;
890
891	s = splbio();
892
893	if (vp->v_flag & VONWORKLST) {
894		LIST_REMOVE(vp, v_synclist);
895	}
896
897	if (delay > syncer_maxdelay - 2)
898		delay = syncer_maxdelay - 2;
899	slot = (syncer_delayno + delay) & syncer_mask;
900
901	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
902	vp->v_flag |= VONWORKLST;
903	splx(s);
904}
905
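/*
 * Example of the slot arithmetic above: with syncer_maxdelay of 32 (mask
 * 31) and syncer_delayno currently 30, a delay of 15 maps to slot
 * (30 + 15) & 31 == 13, which the syncer, advancing one slot per second
 * and wrapping at 32, reaches 15 seconds from now.
 */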
906static void sched_sync __P((void));
907static struct	proc *updateproc;
908static struct kproc_desc up_kp = {
909	"syncer",
910	sched_sync,
911	&updateproc
912};
913SYSINIT_KT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
914
915/*
916 * System filesystem synchronizer daemon.
917 */
918void
919sched_sync(void)
920{
921	struct synclist *slp;
922	struct vnode *vp;
923	long starttime;
924	int s;
925	struct proc *p = updateproc;
926
927	for (;;) {
928		starttime = time_second;
929
930		/*
931		 * Push files whose dirty time has expired.
932		 */
933		s = splbio();
934		slp = &syncer_workitem_pending[syncer_delayno];
935		syncer_delayno += 1;
936		if (syncer_delayno == syncer_maxdelay)
937			syncer_delayno = 0;
938		splx(s);
939
940		while ((vp = LIST_FIRST(slp)) != NULL) {
941			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
942			(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
943			VOP_UNLOCK(vp, 0, p);
944			if (LIST_FIRST(slp) == vp) {
945				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
946				    vp->v_type != VBLK)
947					panic("sched_sync: fsync failed");
948				/*
949				 * Move ourselves to the back of the sync list.
950				 */
951				LIST_REMOVE(vp, v_synclist);
952				vn_syncer_add_to_worklist(vp, syncdelay);
953			}
954		}
955
956		/*
957		 * Do soft update processing.
958		 */
959		if (bioops.io_sync)
960			(*bioops.io_sync)(NULL);
961
962		/*
963		 * The variable rushjob allows the kernel to speed up the
964		 * processing of the filesystem syncer process. A rushjob
965		 * value of N tells the filesystem syncer to process the next
966		 * N seconds worth of work on its queue ASAP. Currently rushjob
967		 * is used by the soft update code to speed up the filesystem
968		 * syncer process when the incore state is getting so far
969		 * ahead of the disk that the kernel memory pool is being
970		 * threatened with exhaustion.
971		 */
972		if (rushjob > 0) {
973			rushjob -= 1;
974			continue;
975		}
976		/*
977		 * If it has taken us less than a second to process the
978		 * current work, then wait. Otherwise start right over
979		 * again. We can still lose time if any single round
980		 * takes more than two seconds, but it does not really
981		 * matter as we are just trying to generally pace the
982		 * filesystem activity.
983		 */
984		if (time_second == starttime)
985			tsleep(&lbolt, PPAUSE, "syncer", 0);
986	}
987}
988
989/*
990 * Associate a p-buffer with a vnode.
991 *
992 * Also sets B_PAGING flag to indicate that vnode is not fully associated
993 * with the buffer.  i.e. the bp has not been linked into the vnode or
994 * ref-counted.
995 */
996void
997pbgetvp(vp, bp)
998	register struct vnode *vp;
999	register struct buf *bp;
1000{
1001
1002	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1003
1004	bp->b_vp = vp;
1005	bp->b_flags |= B_PAGING;
1006	if (vp->v_type == VBLK || vp->v_type == VCHR)
1007		bp->b_dev = vp->v_rdev;
1008	else
1009		bp->b_dev = NODEV;
1010}
1011
1012/*
1013 * Disassociate a p-buffer from a vnode.
1014 */
1015void
1016pbrelvp(bp)
1017	register struct buf *bp;
1018{
1019
1020	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1021
1022#if !defined(MAX_PERF)
1023	/* XXX REMOVE ME */
1024	if (bp->b_vnbufs.tqe_next != NULL) {
1025		panic(
1026		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1027		    bp,
1028		    (int)bp->b_flags
1029		);
1030	}
1031#endif
1032	bp->b_vp = (struct vnode *) 0;
1033	bp->b_flags &= ~B_PAGING;
1034}
1035
1036void
1037pbreassignbuf(bp, newvp)
1038	struct buf *bp;
1039	struct vnode *newvp;
1040{
1041#if !defined(MAX_PERF)
1042	if ((bp->b_flags & B_PAGING) == 0) {
1043		panic(
1044		    "pbreassignbuf() on non phys bp %p",
1045		    bp
1046		);
1047	}
1048#endif
1049	bp->b_vp = newvp;
1050}
1051
1052/*
1053 * Reassign a buffer from one vnode to another.
1054 * Used to assign file specific control information
1055 * (indirect blocks) to the vnode to which they belong.
1056 */
1057void
1058reassignbuf(bp, newvp)
1059	register struct buf *bp;
1060	register struct vnode *newvp;
1061{
1062	struct buflists *listheadp;
1063	struct vnode *oldvp;
1064	int delay;
1065	int s;
1066
1067	if (newvp == NULL) {
1068		printf("reassignbuf: NULL");
1069		return;
1070	}
1071
1072#if !defined(MAX_PERF)
1073	/*
1074	 * B_PAGING flagged buffers cannot be reassigned because their vp
1075	 * is not fully linked in.
1076	 */
1077	if (bp->b_flags & B_PAGING)
1078		panic("cannot reassign paging buffer");
1079#endif
1080
1081	s = splbio();
1082	/*
1083	 * Delete from old vnode list, if on one.
1084	 */
1085	if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) {
1086		oldvp = bp->b_vp;
1087		if (bp->b_xflags & B_VNDIRTY)
1088			listheadp = &oldvp->v_dirtyblkhd;
1089		else
1090			listheadp = &oldvp->v_cleanblkhd;
1091		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
1092		bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN);
1093		vdrop(oldvp);
1094	}
1095	/*
1096	 * If dirty, put on list of dirty buffers; otherwise insert onto list
1097	 * of clean buffers.
1098	 */
1099	if (bp->b_flags & B_DELWRI) {
1100		struct buf *tbp;
1101
1102		listheadp = &newvp->v_dirtyblkhd;
1103		if ((newvp->v_flag & VONWORKLST) == 0) {
1104			switch (newvp->v_type) {
1105			case VDIR:
1106				delay = syncdelay / 3;
1107				break;
1108			case VBLK:
1109				if (newvp->v_specmountpoint != NULL) {
1110					delay = syncdelay / 2;
1111					break;
1112				}
1113				/* fall through */
1114			default:
1115				delay = syncdelay;
1116			}
1117			vn_syncer_add_to_worklist(newvp, delay);
1118		}
1119		bp->b_xflags |= B_VNDIRTY;
1120		tbp = TAILQ_FIRST(listheadp);
1121		if (tbp == NULL ||
1122		    (bp->b_lblkno >= 0 && tbp->b_lblkno > bp->b_lblkno)) {
1123			TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
1124		} else {
1125			if (bp->b_lblkno >= 0) {
1126				struct buf *ttbp;
1127				while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
1128				    (ttbp->b_lblkno < bp->b_lblkno)) {
1129					tbp = ttbp;
1130				}
1131				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1132			} else {
1133				TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
1134			}
1135		}
1136	} else {
1137		bp->b_xflags |= B_VNCLEAN;
1138		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
1139		if ((newvp->v_flag & VONWORKLST) &&
1140		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1141			newvp->v_flag &= ~VONWORKLST;
1142			LIST_REMOVE(newvp, v_synclist);
1143		}
1144	}
1145	bp->b_vp = newvp;
1146	vhold(bp->b_vp);
1147	splx(s);
1148}
1149
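/*
 * With the default syncdelay of 30 seconds, the selection above queues
 * dirty directories after roughly 10 seconds, mounted block devices after
 * roughly 15, and ordinary file data after the full 30.
 */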
1150/*
1151 * Create a vnode for a block device.
1152 * Used for mounting the root file system.
1153 */
1154int
1155bdevvp(dev, vpp)
1156	dev_t dev;
1157	struct vnode **vpp;
1158{
1159	register struct vnode *vp;
1160	struct vnode *nvp;
1161	int error;
1162
1163	/* XXX 255 is for mfs. */
1164	if (dev == NODEV || (major(dev) != 255 && (major(dev) >= nblkdev ||
1165	    bdevsw[major(dev)] == NULL))) {
1166		*vpp = NULLVP;
1167		return (ENXIO);
1168	}
1169	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
1170	if (error) {
1171		*vpp = NULLVP;
1172		return (error);
1173	}
1174	vp = nvp;
1175	vp->v_type = VBLK;
1176	if ((nvp = checkalias(vp, dev, (struct mount *)0)) != NULL) {
1177		vput(vp);
1178		vp = nvp;
1179	}
1180	*vpp = vp;
1181	return (0);
1182}
1183
1184/*
1185 * Check to see if the new vnode represents a special device
1186 * for which we already have a vnode (either because of
1187 * bdevvp() or because of a different vnode representing
1188 * the same block device). If such an alias exists, deallocate
1189 * the existing contents and return the aliased vnode. The
1190 * caller is responsible for filling it with its new contents.
1191 */
1192struct vnode *
1193checkalias(nvp, nvp_rdev, mp)
1194	register struct vnode *nvp;
1195	dev_t nvp_rdev;
1196	struct mount *mp;
1197{
1198	struct proc *p = curproc;	/* XXX */
1199	struct vnode *vp;
1200	struct vnode **vpp;
1201
1202	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1203		return (NULLVP);
1204
1205	vpp = &speclisth[SPECHASH(nvp_rdev)];
1206loop:
1207	simple_lock(&spechash_slock);
1208	for (vp = *vpp; vp; vp = vp->v_specnext) {
1209		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
1210			continue;
1211		/*
1212		 * Alias, but not in use, so flush it out.
1213		 * Only alias active device nodes.
1214		 * Not sure why we don't re-use this like we do below.
1215		 */
1216		simple_lock(&vp->v_interlock);
1217		if (vp->v_usecount == 0) {
1218			simple_unlock(&spechash_slock);
1219			vgonel(vp, p);
1220			goto loop;
1221		}
1222		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
1223			/*
1224			 * It disappeared, and we may have slept.
1225			 * Restart from the beginning
1226			 */
1227			simple_unlock(&spechash_slock);
1228			goto loop;
1229		}
1230		break;
1231	}
1232	/*
1233	 * It would be a lot clearer what is going on here if
1234	 * this had been expressed as:
1235	 * if ( vp && (vp->v_tag == VT_NULL))
1236	 * and the clauses had been swapped.
1237	 */
1238	if (vp == NULL || vp->v_tag != VT_NON) {
1239		/*
1240		 * Put the new vnode into the hash chain,
1241		 * and if there was an alias, connect them.
1242		 */
1243		MALLOC(nvp->v_specinfo, struct specinfo *,
1244		    sizeof(struct specinfo), M_VNODE, M_WAITOK);
1245		nvp->v_rdev = nvp_rdev;
1246		nvp->v_hashchain = vpp;
1247		nvp->v_specnext = *vpp;
1248		nvp->v_specmountpoint = NULL;
1249		simple_unlock(&spechash_slock);
1250		*vpp = nvp;
1251		if (vp != NULLVP) {
1252			nvp->v_flag |= VALIASED;
1253			vp->v_flag |= VALIASED;
1254			vput(vp);
1255		}
1256		return (NULLVP);
1257	}
1258	/*
1259	 * if ( vp && (vp->v_tag == VT_NULL))
1260	 * We have a vnode alias, but it is trashed.
1261	 * Make it look like it's newly allocated (by getnewvnode()).
1262	 * The caller should use this instead.
1263	 */
1264	simple_unlock(&spechash_slock);
1265	VOP_UNLOCK(vp, 0, p);
1266	simple_lock(&vp->v_interlock);
1267	vclean(vp, 0, p);
1268	vp->v_op = nvp->v_op;
1269	vp->v_tag = nvp->v_tag;
1270	nvp->v_type = VNON;
1271	insmntque(vp, mp);
1272	return (vp);
1273}
1274
1275/*
1276 * Grab a particular vnode from the free list, increment its
1277 * reference count and lock it. The vnode lock bit is set if the
1278 * vnode is being eliminated in vgone. The process is awakened
1279 * when the transition is completed, and an error returned to
1280 * indicate that the vnode is no longer usable (possibly having
1281 * been changed to a new file system type).
1282 */
1283int
1284vget(vp, flags, p)
1285	register struct vnode *vp;
1286	int flags;
1287	struct proc *p;
1288{
1289	int error;
1290
1291	/*
1292	 * If the vnode is in the process of being cleaned out for
1293	 * another use, we wait for the cleaning to finish and then
1294	 * return failure. Cleaning is determined by checking that
1295	 * the VXLOCK flag is set.
1296	 */
1297	if ((flags & LK_INTERLOCK) == 0) {
1298		simple_lock(&vp->v_interlock);
1299	}
1300	if (vp->v_flag & VXLOCK) {
1301		vp->v_flag |= VXWANT;
1302		simple_unlock(&vp->v_interlock);
1303		tsleep((caddr_t)vp, PINOD, "vget", 0);
1304		return (ENOENT);
1305	}
1306
1307	vp->v_usecount++;
1308
1309	if (VSHOULDBUSY(vp))
1310		vbusy(vp);
1311	if (flags & LK_TYPE_MASK) {
1312		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
1313			/*
1314			 * must expand vrele here because we do not want
1315			 * to call VOP_INACTIVE if the reference count
1316			 * drops back to zero since it was never really
1317			 * active. We must remove it from the free list
1318			 * before sleeping so that multiple processes do
1319			 * not try to recycle it.
1320			 */
1321			simple_lock(&vp->v_interlock);
1322			vp->v_usecount--;
1323			if (VSHOULDFREE(vp))
1324				vfree(vp);
1325			simple_unlock(&vp->v_interlock);
1326		}
1327		return (error);
1328	}
1329	simple_unlock(&vp->v_interlock);
1330	return (0);
1331}
1332
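/*
 * A nonzero return from vget() means the vnode was reclaimed while we
 * slept; checkalias() above, for instance, simply restarts its hash-chain
 * scan in that case, e.g.:
 *
 *	if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p))
 *		goto loop;
 */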
1333void
1334vref(struct vnode *vp)
1335{
1336	simple_lock(&vp->v_interlock);
1337	vp->v_usecount++;
1338	simple_unlock(&vp->v_interlock);
1339}
1340
1341/*
1342 * Vnode put/release.
1343 * If count drops to zero, call inactive routine and return to freelist.
1344 */
1345void
1346vrele(vp)
1347	struct vnode *vp;
1348{
1349	struct proc *p = curproc;	/* XXX */
1350
1351	KASSERT(vp != NULL, ("vrele: null vp"));
1352
1353	simple_lock(&vp->v_interlock);
1354
1355	if (vp->v_usecount > 1) {
1356
1357		vp->v_usecount--;
1358		simple_unlock(&vp->v_interlock);
1359
1360		return;
1361	}
1362
1363	if (vp->v_usecount == 1) {
1364
1365		vp->v_usecount--;
1366		if (VSHOULDFREE(vp))
1367			vfree(vp);
1368	/*
1369	 * If we are doing a vput, the node is already locked, and we must
1370	 * call VOP_INACTIVE with the node locked.  So, in the case of
1371	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1372	 */
1373		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
1374			VOP_INACTIVE(vp, p);
1375		}
1376
1377	} else {
1378#ifdef DIAGNOSTIC
1379		vprint("vrele: negative ref count", vp);
1380		simple_unlock(&vp->v_interlock);
1381#endif
1382		panic("vrele: negative ref cnt");
1383	}
1384}
1385
1386void
1387vput(vp)
1388	struct vnode *vp;
1389{
1390	struct proc *p = curproc;	/* XXX */
1391
1392	KASSERT(vp != NULL, ("vput: null vp"));
1393
1394	simple_lock(&vp->v_interlock);
1395
1396	if (vp->v_usecount > 1) {
1397
1398		vp->v_usecount--;
1399		VOP_UNLOCK(vp, LK_INTERLOCK, p);
1400		return;
1401
1402	}
1403
1404	if (vp->v_usecount == 1) {
1405
1406		vp->v_usecount--;
1407		if (VSHOULDFREE(vp))
1408			vfree(vp);
1409	/*
1410	 * If we are doing a vput, the node is already locked, and we must
1411	 * call VOP_INACTIVE with the node locked.  So, in the case of
1412	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1413	 */
1414		simple_unlock(&vp->v_interlock);
1415		VOP_INACTIVE(vp, p);
1416
1417	} else {
1418#ifdef DIAGNOSTIC
1419		vprint("vput: negative ref count", vp);
1420#endif
1421		panic("vput: negative ref cnt");
1422	}
1423}
1424
1425/*
1426 * Somebody doesn't want the vnode recycled.
1427 */
1428void
1429vhold(vp)
1430	register struct vnode *vp;
1431{
1432	int s;
1433
1434  	s = splbio();
1435	vp->v_holdcnt++;
1436	if (VSHOULDBUSY(vp))
1437		vbusy(vp);
1438	splx(s);
1439}
1440
1441/*
1442 * One less who cares about this vnode.
1443 */
1444void
1445vdrop(vp)
1446	register struct vnode *vp;
1447{
1448	int s;
1449
1450	s = splbio();
1451	if (vp->v_holdcnt <= 0)
1452		panic("vdrop: holdcnt");
1453	vp->v_holdcnt--;
1454	if (VSHOULDFREE(vp))
1455		vfree(vp);
1456	splx(s);
1457}
1458
1459/*
1460 * Remove any vnodes in the vnode table belonging to mount point mp.
1461 *
1462 * If MNT_NOFORCE is specified, there should not be any active ones,
1463 * return error if any are found (nb: this is a user error, not a
1464 * system error). If MNT_FORCE is specified, detach any active vnodes
1465 * that are found.
1466 */
1467#ifdef DIAGNOSTIC
1468static int busyprt = 0;		/* print out busy vnodes */
1469SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
1470#endif
1471
1472int
1473vflush(mp, skipvp, flags)
1474	struct mount *mp;
1475	struct vnode *skipvp;
1476	int flags;
1477{
1478	struct proc *p = curproc;	/* XXX */
1479	struct vnode *vp, *nvp;
1480	int busy = 0;
1481
1482	simple_lock(&mntvnode_slock);
1483loop:
1484	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
1485		/*
1486		 * Make sure this vnode wasn't reclaimed in getnewvnode().
1487		 * Start over if it has (it won't be on the list anymore).
1488		 */
1489		if (vp->v_mount != mp)
1490			goto loop;
1491		nvp = vp->v_mntvnodes.le_next;
1492		/*
1493		 * Skip over a selected vnode.
1494		 */
1495		if (vp == skipvp)
1496			continue;
1497
1498		simple_lock(&vp->v_interlock);
1499		/*
1500		 * Skip over vnodes marked VSYSTEM.
1501		 */
1502		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
1503			simple_unlock(&vp->v_interlock);
1504			continue;
1505		}
1506		/*
1507		 * If WRITECLOSE is set, only flush out regular file vnodes
1508		 * open for writing.
1509		 */
1510		if ((flags & WRITECLOSE) &&
1511		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
1512			simple_unlock(&vp->v_interlock);
1513			continue;
1514		}
1515
1516		/*
1517		 * With v_usecount == 0, all we need to do is clear out the
1518		 * vnode data structures and we are done.
1519		 */
1520		if (vp->v_usecount == 0) {
1521			simple_unlock(&mntvnode_slock);
1522			vgonel(vp, p);
1523			simple_lock(&mntvnode_slock);
1524			continue;
1525		}
1526
1527		/*
1528		 * If FORCECLOSE is set, forcibly close the vnode. For block
1529		 * or character devices, revert to an anonymous device. For
1530		 * all other files, just kill them.
1531		 */
1532		if (flags & FORCECLOSE) {
1533			simple_unlock(&mntvnode_slock);
1534			if (vp->v_type != VBLK && vp->v_type != VCHR) {
1535				vgonel(vp, p);
1536			} else {
1537				vclean(vp, 0, p);
1538				vp->v_op = spec_vnodeop_p;
1539				insmntque(vp, (struct mount *) 0);
1540			}
1541			simple_lock(&mntvnode_slock);
1542			continue;
1543		}
1544#ifdef DIAGNOSTIC
1545		if (busyprt)
1546			vprint("vflush: busy vnode", vp);
1547#endif
1548		simple_unlock(&vp->v_interlock);
1549		busy++;
1550	}
1551	simple_unlock(&mntvnode_slock);
1552	if (busy)
1553		return (EBUSY);
1554	return (0);
1555}
1556
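/*
 * An unmount path would call vflush() along these lines (sketch; the
 * exact flags depend on the caller):
 *
 *	if ((error = vflush(mp, NULLVP, FORCECLOSE)) != 0)
 *		return (error);
 */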
1557/*
1558 * Disassociate the underlying file system from a vnode.
1559 */
1560static void
1561vclean(vp, flags, p)
1562	struct vnode *vp;
1563	int flags;
1564	struct proc *p;
1565{
1566	int active;
1567	vm_object_t obj;
1568
1569	/*
1570	 * Check to see if the vnode is in use. If so we have to reference it
1571	 * before we clean it out so that its count cannot fall to zero and
1572	 * generate a race against ourselves to recycle it.
1573	 */
1574	if ((active = vp->v_usecount))
1575		vp->v_usecount++;
1576
1577	/*
1578	 * Prevent the vnode from being recycled or brought into use while we
1579	 * clean it out.
1580	 */
1581	if (vp->v_flag & VXLOCK)
1582		panic("vclean: deadlock");
1583	vp->v_flag |= VXLOCK;
1584	/*
1585	 * Even if the count is zero, the VOP_INACTIVE routine may still
1586	 * have the object locked while it cleans it out. The VOP_LOCK
1587	 * ensures that the VOP_INACTIVE routine is done with its work.
1588	 * For active vnodes, it ensures that no other activity can
1589	 * occur while the underlying object is being cleaned out.
1590	 */
1591	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
1592
1593	/*
1594	 * Clean out any buffers associated with the vnode.
1595	 */
1596	vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
1597	if ((obj = vp->v_object) != NULL) {
1598		if (obj->ref_count == 0) {
1599			/*
1600			 * This is a normal way of shutting down the object/vnode
1601			 * association.
1602			 */
1603			vm_object_terminate(obj);
1604		} else {
1605			/*
1606			 * Woe to the process that tries to page now :-).
1607			 */
1608			vm_pager_deallocate(obj);
1609		}
1610	}
1611
1612	/*
1613	 * If purging an active vnode, it must be closed and
1614	 * deactivated before being reclaimed. Note that the
1615	 * VOP_INACTIVE will unlock the vnode.
1616	 */
1617	if (active) {
1618		if (flags & DOCLOSE)
1619			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
1620		VOP_INACTIVE(vp, p);
1621	} else {
1622		/*
1623		 * Any other processes trying to obtain this lock must first
1624		 * wait for VXLOCK to clear, then call the new lock operation.
1625		 */
1626		VOP_UNLOCK(vp, 0, p);
1627	}
1628	/*
1629	 * Reclaim the vnode.
1630	 */
1631	if (VOP_RECLAIM(vp, p))
1632		panic("vclean: cannot reclaim");
1633
1634	if (active)
1635		vrele(vp);
1636
1637	cache_purge(vp);
1638	if (vp->v_vnlock) {
1639#if 0 /* This is the only place we have LK_DRAINED in the entire kernel ??? */
1640#ifdef DIAGNOSTIC
1641		if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
1642			vprint("vclean: lock not drained", vp);
1643#endif
1644#endif
1645		FREE(vp->v_vnlock, M_VNODE);
1646		vp->v_vnlock = NULL;
1647	}
1648
1649	if (VSHOULDFREE(vp))
1650		vfree(vp);
1651
1652	/*
1653	 * Done with purge, notify sleepers of the grim news.
1654	 */
1655	vp->v_op = dead_vnodeop_p;
1656	vn_pollgone(vp);
1657	vp->v_tag = VT_NON;
1658	vp->v_flag &= ~VXLOCK;
1659	if (vp->v_flag & VXWANT) {
1660		vp->v_flag &= ~VXWANT;
1661		wakeup((caddr_t) vp);
1662	}
1663}
1664
1665/*
1666 * Eliminate all activity associated with the requested vnode
1667 * and with all vnodes aliased to the requested vnode.
1668 */
1669int
1670vop_revoke(ap)
1671	struct vop_revoke_args /* {
1672		struct vnode *a_vp;
1673		int a_flags;
1674	} */ *ap;
1675{
1676	struct vnode *vp, *vq;
1677	struct proc *p = curproc;	/* XXX */
1678
1679	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
1680
1681	vp = ap->a_vp;
1682	simple_lock(&vp->v_interlock);
1683
1684	if (vp->v_flag & VALIASED) {
1685		/*
1686		 * If a vgone (or vclean) is already in progress,
1687		 * wait until it is done and return.
1688		 */
1689		if (vp->v_flag & VXLOCK) {
1690			vp->v_flag |= VXWANT;
1691			simple_unlock(&vp->v_interlock);
1692			tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
1693			return (0);
1694		}
1695		/*
1696		 * Ensure that vp will not be vgone'd while we
1697		 * are eliminating its aliases.
1698		 */
1699		vp->v_flag |= VXLOCK;
1700		simple_unlock(&vp->v_interlock);
1701		while (vp->v_flag & VALIASED) {
1702			simple_lock(&spechash_slock);
1703			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1704				if (vq->v_rdev != vp->v_rdev ||
1705				    vq->v_type != vp->v_type || vp == vq)
1706					continue;
1707				simple_unlock(&spechash_slock);
1708				vgone(vq);
1709				break;
1710			}
1711			if (vq == NULLVP) {
1712				simple_unlock(&spechash_slock);
1713			}
1714		}
1715		/*
1716		 * Remove the lock so that vgone below will
1717		 * really eliminate the vnode after which time
1718		 * vgone will awaken any sleepers.
1719		 */
1720		simple_lock(&vp->v_interlock);
1721		vp->v_flag &= ~VXLOCK;
1722		if (vp->v_flag & VXWANT) {
1723			vp->v_flag &= ~VXWANT;
1724			wakeup(vp);
1725		}
1726	}
1727	vgonel(vp, p);
1728	return (0);
1729}
1730
1731/*
1732 * Recycle an unused vnode to the front of the free list.
1733 * Release the passed interlock if the vnode will be recycled.
1734 */
1735int
1736vrecycle(vp, inter_lkp, p)
1737	struct vnode *vp;
1738	struct simplelock *inter_lkp;
1739	struct proc *p;
1740{
1741
1742	simple_lock(&vp->v_interlock);
1743	if (vp->v_usecount == 0) {
1744		if (inter_lkp) {
1745			simple_unlock(inter_lkp);
1746		}
1747		vgonel(vp, p);
1748		return (1);
1749	}
1750	simple_unlock(&vp->v_interlock);
1751	return (0);
1752}
1753
1754/*
1755 * Eliminate all activity associated with a vnode
1756 * in preparation for reuse.
1757 */
1758void
1759vgone(vp)
1760	register struct vnode *vp;
1761{
1762	struct proc *p = curproc;	/* XXX */
1763
1764	simple_lock(&vp->v_interlock);
1765	vgonel(vp, p);
1766}
1767
1768/*
1769 * vgone, with the vp interlock held.
1770 */
1771static void
1772vgonel(vp, p)
1773	struct vnode *vp;
1774	struct proc *p;
1775{
1776	int s;
1777	struct vnode *vq;
1778	struct vnode *vx;
1779
1780	/*
1781	 * If a vgone (or vclean) is already in progress,
1782	 * wait until it is done and return.
1783	 */
1784	if (vp->v_flag & VXLOCK) {
1785		vp->v_flag |= VXWANT;
1786		simple_unlock(&vp->v_interlock);
1787		tsleep((caddr_t)vp, PINOD, "vgone", 0);
1788		return;
1789	}
1790
1791	/*
1792	 * Clean out the filesystem specific data.
1793	 */
1794	vclean(vp, DOCLOSE, p);
1795	simple_lock(&vp->v_interlock);
1796
1797	/*
1798	 * Delete from old mount point vnode list, if on one.
1799	 */
1800	if (vp->v_mount != NULL)
1801		insmntque(vp, (struct mount *)0);
1802	/*
1803	 * If special device, remove it from special device alias list
1804	 * if it is on one.
1805	 */
1806	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
1807		simple_lock(&spechash_slock);
1808		if (*vp->v_hashchain == vp) {
1809			*vp->v_hashchain = vp->v_specnext;
1810		} else {
1811			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1812				if (vq->v_specnext != vp)
1813					continue;
1814				vq->v_specnext = vp->v_specnext;
1815				break;
1816			}
1817			if (vq == NULL)
1818				panic("missing bdev");
1819		}
1820		if (vp->v_flag & VALIASED) {
1821			vx = NULL;
1822			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1823				if (vq->v_rdev != vp->v_rdev ||
1824				    vq->v_type != vp->v_type)
1825					continue;
1826				if (vx)
1827					break;
1828				vx = vq;
1829			}
1830			if (vx == NULL)
1831				panic("missing alias");
1832			if (vq == NULL)
1833				vx->v_flag &= ~VALIASED;
1834			vp->v_flag &= ~VALIASED;
1835		}
1836		simple_unlock(&spechash_slock);
1837		FREE(vp->v_specinfo, M_VNODE);
1838		vp->v_specinfo = NULL;
1839	}
1840
1841	/*
1842	 * If it is on the freelist and not already at the head,
1843	 * move it to the head of the list. The test of the back
1844	 * pointer and the reference count of zero is because
1845	 * it will be removed from the free list by getnewvnode,
1846	 * but will not have its reference count incremented until
1847	 * after calling vgone. If the reference count were
1848	 * incremented first, vgone would (incorrectly) try to
1849	 * close the previous instance of the underlying object.
1850	 */
1851	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
1852		s = splbio();
1853		simple_lock(&vnode_free_list_slock);
1854		if (vp->v_flag & VFREE) {
1855			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1856		} else if (vp->v_flag & VTBFREE) {
1857			TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
1858			vp->v_flag &= ~VTBFREE;
1859			freevnodes++;
1860		} else
1861			freevnodes++;
1862		vp->v_flag |= VFREE;
1863		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1864		simple_unlock(&vnode_free_list_slock);
1865		splx(s);
1866	}
1867
1868	vp->v_type = VBAD;
1869	simple_unlock(&vp->v_interlock);
1870}
1871
1872/*
1873 * Lookup a vnode by device number.
1874 */
1875int
1876vfinddev(dev, type, vpp)
1877	dev_t dev;
1878	enum vtype type;
1879	struct vnode **vpp;
1880{
1881	register struct vnode *vp;
1882	int rc = 0;
1883
1884	simple_lock(&spechash_slock);
1885	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
1886		if (dev != vp->v_rdev || type != vp->v_type)
1887			continue;
1888		*vpp = vp;
1889		rc = 1;
1890		break;
1891	}
1892	simple_unlock(&spechash_slock);
1893	return (rc);
1894}
1895
1896/*
1897 * Calculate the total number of references to a special device.
1898 */
1899int
1900vcount(vp)
1901	register struct vnode *vp;
1902{
1903	struct vnode *vq, *vnext;
1904	int count;
1905
1906loop:
1907	if ((vp->v_flag & VALIASED) == 0)
1908		return (vp->v_usecount);
1909	simple_lock(&spechash_slock);
1910	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
1911		vnext = vq->v_specnext;
1912		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
1913			continue;
1914		/*
1915		 * Alias, but not in use, so flush it out.
1916		 */
1917		if (vq->v_usecount == 0 && vq != vp) {
1918			simple_unlock(&spechash_slock);
1919			vgone(vq);
1920			goto loop;
1921		}
1922		count += vq->v_usecount;
1923	}
1924	simple_unlock(&spechash_slock);
1925	return (count);
1926}
1927/*
1928 * Print out a description of a vnode.
1929 */
1930static char *typename[] =
1931{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
1932
1933void
1934vprint(label, vp)
1935	char *label;
1936	register struct vnode *vp;
1937{
1938	char buf[96];
1939
1940	if (label != NULL)
1941		printf("%s: %p: ", label, (void *)vp);
1942	else
1943		printf("%p: ", (void *)vp);
1944	printf("type %s, usecount %d, writecount %d, refcount %d,",
1945	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
1946	    vp->v_holdcnt);
1947	buf[0] = '\0';
1948	if (vp->v_flag & VROOT)
1949		strcat(buf, "|VROOT");
1950	if (vp->v_flag & VTEXT)
1951		strcat(buf, "|VTEXT");
1952	if (vp->v_flag & VSYSTEM)
1953		strcat(buf, "|VSYSTEM");
1954	if (vp->v_flag & VXLOCK)
1955		strcat(buf, "|VXLOCK");
1956	if (vp->v_flag & VXWANT)
1957		strcat(buf, "|VXWANT");
1958	if (vp->v_flag & VBWAIT)
1959		strcat(buf, "|VBWAIT");
1960	if (vp->v_flag & VALIASED)
1961		strcat(buf, "|VALIASED");
1962	if (vp->v_flag & VDOOMED)
1963		strcat(buf, "|VDOOMED");
1964	if (vp->v_flag & VFREE)
1965		strcat(buf, "|VFREE");
1966	if (vp->v_flag & VOBJBUF)
1967		strcat(buf, "|VOBJBUF");
1968	if (buf[0] != '\0')
1969		printf(" flags (%s)", &buf[1]);
1970	if (vp->v_data == NULL) {
1971		printf("\n");
1972	} else {
1973		printf("\n\t");
1974		VOP_PRINT(vp);
1975	}
1976}
1977
1978#ifdef DDB
1979#include <ddb/ddb.h>
1980/*
1981 * List all of the locked vnodes in the system.
1982 * Called when debugging the kernel.
1983 */
1984DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
1985{
1986	struct proc *p = curproc;	/* XXX */
1987	struct mount *mp, *nmp;
1988	struct vnode *vp;
1989
1990	printf("Locked vnodes\n");
1991	simple_lock(&mountlist_slock);
1992	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
1993		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
1994			nmp = mp->mnt_list.cqe_next;
1995			continue;
1996		}
1997		for (vp = mp->mnt_vnodelist.lh_first;
1998		     vp != NULL;
1999		     vp = vp->v_mntvnodes.le_next) {
2000			if (VOP_ISLOCKED(vp))
2001				vprint((char *)0, vp);
2002		}
2003		simple_lock(&mountlist_slock);
2004		nmp = mp->mnt_list.cqe_next;
2005		vfs_unbusy(mp, p);
2006	}
2007	simple_unlock(&mountlist_slock);
2008}
2009#endif
2010
2011/*
2012 * Top level filesystem related information gathering.
2013 */
2014static int	sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS);
2015
2016static int
2017vfs_sysctl SYSCTL_HANDLER_ARGS
2018{
2019	int *name = (int *)arg1 - 1;	/* XXX */
2020	u_int namelen = arg2 + 1;	/* XXX */
2021	struct vfsconf *vfsp;
2022
2023#if 1 || defined(COMPAT_PRELITE2)
2024	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2025	if (namelen == 1)
2026		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2027#endif
2028
2029#ifdef notyet
2030	/* all sysctl names at this level are at least name and field */
2031	if (namelen < 2)
2032		return (ENOTDIR);		/* overloaded */
2033	if (name[0] != VFS_GENERIC) {
2034		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2035			if (vfsp->vfc_typenum == name[0])
2036				break;
2037		if (vfsp == NULL)
2038			return (EOPNOTSUPP);
2039		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
2040		    oldp, oldlenp, newp, newlen, p));
2041	}
2042#endif
2043	switch (name[1]) {
2044	case VFS_MAXTYPENUM:
2045		if (namelen != 2)
2046			return (ENOTDIR);
2047		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2048	case VFS_CONF:
2049		if (namelen != 3)
2050			return (ENOTDIR);	/* overloaded */
2051		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2052			if (vfsp->vfc_typenum == name[2])
2053				break;
2054		if (vfsp == NULL)
2055			return (EOPNOTSUPP);
2056		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
2057	}
2058	return (EOPNOTSUPP);
2059}
2060
2061SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
2062	"Generic filesystem");
2063
2064#if 1 || defined(COMPAT_PRELITE2)
2065
2066static int
2067sysctl_ovfs_conf SYSCTL_HANDLER_ARGS
2068{
2069	int error;
2070	struct vfsconf *vfsp;
2071	struct ovfsconf ovfs;
2072
2073	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
2074		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
2075		strcpy(ovfs.vfc_name, vfsp->vfc_name);
2076		ovfs.vfc_index = vfsp->vfc_typenum;
2077		ovfs.vfc_refcount = vfsp->vfc_refcount;
2078		ovfs.vfc_flags = vfsp->vfc_flags;
2079		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2080		if (error)
2081			return error;
2082	}
2083	return 0;
2084}
2085
2086#endif /* 1 || COMPAT_PRELITE2 */
2087
2088#if 0
2089#define KINFO_VNODESLOP	10
2090/*
2091 * Dump vnode list (via sysctl).
2092 * Copyout address of vnode followed by vnode.
2093 */
2094/* ARGSUSED */
2095static int
2096sysctl_vnode SYSCTL_HANDLER_ARGS
2097{
2098	struct proc *p = curproc;	/* XXX */
2099	struct mount *mp, *nmp;
2100	struct vnode *nvp, *vp;
2101	int error;
2102
2103#define VPTRSZ	sizeof (struct vnode *)
2104#define VNODESZ	sizeof (struct vnode)
2105
2106	req->lock = 0;
2107	if (!req->oldptr) /* Make an estimate */
2108		return (SYSCTL_OUT(req, 0,
2109			(numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
2110
2111	simple_lock(&mountlist_slock);
2112	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
2113		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
2114			nmp = mp->mnt_list.cqe_next;
2115			continue;
2116		}
2117again:
2118		simple_lock(&mntvnode_slock);
2119		for (vp = mp->mnt_vnodelist.lh_first;
2120		     vp != NULL;
2121		     vp = nvp) {
2122			/*
2123			 * Check that the vp is still associated with
2124			 * this filesystem.  RACE: could have been
2125			 * recycled onto the same filesystem.
2126			 */
2127			if (vp->v_mount != mp) {
2128				simple_unlock(&mntvnode_slock);
2129				goto again;
2130			}
2131			nvp = vp->v_mntvnodes.le_next;
2132			simple_unlock(&mntvnode_slock);
2133			if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
2134			    (error = SYSCTL_OUT(req, vp, VNODESZ)))
2135				return (error);
2136			simple_lock(&mntvnode_slock);
2137		}
2138		simple_unlock(&mntvnode_slock);
2139		simple_lock(&mountlist_slock);
2140		nmp = mp->mnt_list.cqe_next;
2141		vfs_unbusy(mp, p);
2142	}
2143	simple_unlock(&mountlist_slock);
2144
2145	return (0);
2146}
2147#endif
2148
2149/*
2150 * XXX
2151 * Exporting the vnode list on large systems causes them to crash.
2152 * Exporting the vnode list on medium systems causes sysctl to coredump.
2153 */
2154#if 0
2155SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
2156	0, 0, sysctl_vnode, "S,vnode", "");
2157#endif
2158
2159/*
2160 * Check to see if a filesystem is mounted on a block device.
2161 */
2162int
2163vfs_mountedon(vp)
2164	struct vnode *vp;
2165{
2166	struct vnode *vq;
2167	int error = 0;
2168
2169	if (vp->v_specmountpoint != NULL)
2170		return (EBUSY);
2171	if (vp->v_flag & VALIASED) {
2172		simple_lock(&spechash_slock);
2173		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
2174			if (vq->v_rdev != vp->v_rdev ||
2175			    vq->v_type != vp->v_type)
2176				continue;
2177			if (vq->v_specmountpoint != NULL) {
2178				error = EBUSY;
2179				break;
2180			}
2181		}
2182		simple_unlock(&spechash_slock);
2183	}
2184	return (error);
2185}
2186
2187/*
2188 * Unmount all filesystems. The list is traversed in reverse order
2189 * of mounting to avoid dependencies.
2190 */
2191void
2192vfs_unmountall()
2193{
2194	struct mount *mp, *nmp;
2195	struct proc *p;
2196	int error;
2197
2198	if (curproc != NULL)
2199		p = curproc;
2200	else
2201		p = initproc;	/* XXX XXX should this be proc0? */
2202	/*
2203	 * Since this only runs when rebooting, it is not interlocked.
2204	 */
2205	for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
2206		nmp = mp->mnt_list.cqe_prev;
2207		error = dounmount(mp, MNT_FORCE, p);
2208		if (error) {
2209			printf("unmount of %s failed (",
2210			    mp->mnt_stat.f_mntonname);
2211			if (error == EBUSY)
2212				printf("BUSY)\n");
2213			else
2214				printf("%d)\n", error);
2215		}
2216	}
2217}
2218
2219/*
2220 * Build network address lists (radix trees) and hang them off the mount
2221 * point.  Called by vfs_export() to set up the lists of export addresses.
2222 */
2223static int
2224vfs_hang_addrlist(mp, nep, argp)
2225	struct mount *mp;
2226	struct netexport *nep;
2227	struct export_args *argp;
2228{
2229	register struct netcred *np;
2230	register struct radix_node_head *rnh;
2231	register int i;
2232	struct radix_node *rn;
2233	struct sockaddr *saddr, *smask = 0;
2234	struct domain *dom;
2235	int error;
2236
2237	if (argp->ex_addrlen == 0) {
2238		if (mp->mnt_flag & MNT_DEFEXPORTED)
2239			return (EPERM);
2240		np = &nep->ne_defexported;
2241		np->netc_exflags = argp->ex_flags;
2242		np->netc_anon = argp->ex_anon;
2243		np->netc_anon.cr_ref = 1;
2244		mp->mnt_flag |= MNT_DEFEXPORTED;
2245		return (0);
2246	}
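	/*
	 * Allocate the netcred and the socket address (plus optional mask)
	 * as a single chunk; the sockaddr storage immediately follows the
	 * netcred structure (np + 1).
	 */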
2247	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
2248	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
2249	bzero((caddr_t) np, i);
2250	saddr = (struct sockaddr *) (np + 1);
2251	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
2252		goto out;
2253	if (saddr->sa_len > argp->ex_addrlen)
2254		saddr->sa_len = argp->ex_addrlen;
2255	if (argp->ex_masklen) {
2256		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
2257		error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
2258		if (error)
2259			goto out;
2260		if (smask->sa_len > argp->ex_masklen)
2261			smask->sa_len = argp->ex_masklen;
2262	}
2263	i = saddr->sa_family;
2264	if ((rnh = nep->ne_rtable[i]) == 0) {
2265		/*
2266		 * It seems silly to initialize every AF when most are not used,
2267		 * so do so on demand here.
2268		 */
2269		for (dom = domains; dom; dom = dom->dom_next)
2270			if (dom->dom_family == i && dom->dom_rtattach) {
2271				dom->dom_rtattach((void **) &nep->ne_rtable[i],
2272				    dom->dom_rtoffset);
2273				break;
2274			}
2275		if ((rnh = nep->ne_rtable[i]) == 0) {
2276			error = ENOBUFS;
2277			goto out;
2278		}
2279	}
2280	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
2281	    np->netc_rnodes);
2282	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
2283		error = EPERM;
2284		goto out;
2285	}
2286	np->netc_exflags = argp->ex_flags;
2287	np->netc_anon = argp->ex_anon;
2288	np->netc_anon.cr_ref = 1;
2289	return (0);
2290out:
2291	free(np, M_NETADDR);
2292	return (error);
2293}
2294
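/*
 * Walker callback used by vfs_free_addrlist(): remove the radix node
 * from its tree and free the netcred that contains it.
 */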
2295/* ARGSUSED */
2296static int
2297vfs_free_netcred(rn, w)
2298	struct radix_node *rn;
2299	void *w;
2300{
2301	register struct radix_node_head *rnh = (struct radix_node_head *) w;
2302
2303	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
2304	free((caddr_t) rn, M_NETADDR);
2305	return (0);
2306}
2307
2308/*
2309 * Free the net address lists (radix trees) that are hanging off the mount points.
2310 */
2311static void
2312vfs_free_addrlist(nep)
2313	struct netexport *nep;
2314{
2315	register int i;
2316	register struct radix_node_head *rnh;
2317
2318	for (i = 0; i <= AF_MAX; i++)
2319		if ((rnh = nep->ne_rtable[i])) {
2320			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
2321			    (caddr_t) rnh);
2322			free((caddr_t) rnh, M_RTABLE);
2323			nep->ne_rtable[i] = 0;
2324		}
2325}
2326
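/*
 * Update the export information for a mount point.  Depending on
 * argp->ex_flags this deletes the existing export lists (MNT_DELEXPORT)
 * and/or installs new ones (MNT_EXPORTED), handling the WebNFS public
 * filesystem (MNT_EXPUBLIC) along the way.
 */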
2327int
2328vfs_export(mp, nep, argp)
2329	struct mount *mp;
2330	struct netexport *nep;
2331	struct export_args *argp;
2332{
2333	int error;
2334
2335	if (argp->ex_flags & MNT_DELEXPORT) {
2336		if (mp->mnt_flag & MNT_EXPUBLIC) {
2337			vfs_setpublicfs(NULL, NULL, NULL);
2338			mp->mnt_flag &= ~MNT_EXPUBLIC;
2339		}
2340		vfs_free_addrlist(nep);
2341		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
2342	}
2343	if (argp->ex_flags & MNT_EXPORTED) {
2344		if (argp->ex_flags & MNT_EXPUBLIC) {
2345			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
2346				return (error);
2347			mp->mnt_flag |= MNT_EXPUBLIC;
2348		}
2349		if ((error = vfs_hang_addrlist(mp, nep, argp)))
2350			return (error);
2351		mp->mnt_flag |= MNT_EXPORTED;
2352	}
2353	return (0);
2354}
2355
2356
2357/*
2358 * Set the publicly exported filesystem (WebNFS). Currently, only
2359 * one public filesystem is allowed by the spec (RFC 2054 and RFC 2055).
2360 */
2361int
2362vfs_setpublicfs(mp, nep, argp)
2363	struct mount *mp;
2364	struct netexport *nep;
2365	struct export_args *argp;
2366{
2367	int error;
2368	struct vnode *rvp;
2369	char *cp;
2370
2371	/*
2372	 * mp == NULL -> invalidate the current info, the FS is
2373	 * no longer exported. May be called from either vfs_export
2374	 * or unmount, so check if it hasn't already been done.
2375	 */
2376	if (mp == NULL) {
2377		if (nfs_pub.np_valid) {
2378			nfs_pub.np_valid = 0;
2379			if (nfs_pub.np_index != NULL) {
2380				FREE(nfs_pub.np_index, M_TEMP);
2381				nfs_pub.np_index = NULL;
2382			}
2383		}
2384		return (0);
2385	}
2386
2387	/*
2388	 * Only one allowed at a time.
2389	 */
2390	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
2391		return (EBUSY);
2392
2393	/*
2394	 * Get real filehandle for root of exported FS.
2395	 */
2396	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
2397	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
2398
2399	if ((error = VFS_ROOT(mp, &rvp)))
2400		return (error);
2401
2402	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
2403		return (error);
2404
2405	vput(rvp);
2406
2407	/*
2408	 * If an indexfile was specified, pull it in.
2409	 */
2410	if (argp->ex_indexfile != NULL) {
2411		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
2412		    M_WAITOK);
2413		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
2414		    MAXNAMLEN, (size_t *)0);
2415		if (!error) {
2416			/*
2417			 * Check for illegal filenames.
2418			 */
2419			for (cp = nfs_pub.np_index; *cp; cp++) {
2420				if (*cp == '/') {
2421					error = EINVAL;
2422					break;
2423				}
2424			}
2425		}
2426		if (error) {
2427			FREE(nfs_pub.np_index, M_TEMP);
2428			return (error);
2429		}
2430	}
2431
2432	nfs_pub.np_mount = mp;
2433	nfs_pub.np_valid = 1;
2434	return (0);
2435}
2436
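/*
 * Look up the export permissions (netcred) that apply to the given
 * network address.  If no specific address matches, the default entry
 * is used when MNT_DEFEXPORTED is set; NULL is returned if the
 * filesystem is not exported to that address.
 */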
2437struct netcred *
2438vfs_export_lookup(mp, nep, nam)
2439	register struct mount *mp;
2440	struct netexport *nep;
2441	struct sockaddr *nam;
2442{
2443	register struct netcred *np;
2444	register struct radix_node_head *rnh;
2445	struct sockaddr *saddr;
2446
2447	np = NULL;
2448	if (mp->mnt_flag & MNT_EXPORTED) {
2449		/*
2450		 * Lookup in the export list first.
2451		 */
2452		if (nam != NULL) {
2453			saddr = nam;
2454			rnh = nep->ne_rtable[saddr->sa_family];
2455			if (rnh != NULL) {
2456				np = (struct netcred *)
2457					(*rnh->rnh_matchaddr)((caddr_t)saddr,
2458							      rnh);
2459				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
2460					np = NULL;
2461			}
2462		}
2463		/*
2464		 * If no address match, use the default if it exists.
2465		 */
2466		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
2467			np = &nep->ne_defexported;
2468	}
2469	return (np);
2470}
2471
2472/*
2473 * Perform msync on all vnodes under a mount point.
2474 * The mount point must be locked.
2475 */
2476void
2477vfs_msync(struct mount *mp, int flags) {
2478	struct vnode *vp, *nvp;
2479	struct vm_object *obj;
2480	int anyio, tries;
2481
2482	tries = 5;
2483loop:
2484	anyio = 0;
2485	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {
2486
2487		nvp = vp->v_mntvnodes.le_next;
2488
2489		if (vp->v_mount != mp) {
2490			goto loop;
2491		}
2492
2493		if (vp->v_flag & VXLOCK)	/* XXX: what if MNT_WAIT? */
2494			continue;
2495
2496		if (flags != MNT_WAIT) {
2497			obj = vp->v_object;
2498			if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
2499				continue;
2500			if (VOP_ISLOCKED(vp))
2501				continue;
2502		}
2503
2504		simple_lock(&vp->v_interlock);
2505		if (vp->v_object &&
2506		   (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
2507			if (!vget(vp,
2508				LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
2509				if (vp->v_object) {
2510					vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : 0);
2511					anyio = 1;
2512				}
2513				vput(vp);
2514			}
2515		} else {
2516			simple_unlock(&vp->v_interlock);
2517		}
2518	}
2519	if (anyio && (--tries > 0))
2520		goto loop;
2521}
2522
2523/*
2524 * Create the VM object needed for VMIO and mmap support.  This
2525 * is done for all VREG files in the system.  Some filesystems can also
2526 * take advantage of the additional metadata buffering provided by the
2527 * VMIO code by making the device node VMIO mode as well.
2528 *
2529 * vp must be locked when vfs_object_create is called.
2530 */
2531int
2532vfs_object_create(vp, p, cred)
2533	struct vnode *vp;
2534	struct proc *p;
2535	struct ucred *cred;
2536{
2537	struct vattr vat;
2538	vm_object_t object;
2539	int error = 0;
2540
2541	if ((vp->v_type != VREG) && (vp->v_type != VBLK))
2542		return 0;
2543
2544retry:
2545	if ((object = vp->v_object) == NULL) {
2546		if (vp->v_type == VREG) {
2547			if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
2548				goto retn;
2549			object = vnode_pager_alloc(vp, vat.va_size, 0, 0);
2550		} else if (major(vp->v_rdev) < nblkdev &&
2551		    bdevsw[major(vp->v_rdev)] != NULL) {
2552			/*
2553			 * This simply allocates the biggest object possible
2554			 * for a VBLK vnode.  This should be fixed, but doesn't
2555			 * cause any problems (yet).
2556			 */
2557			object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0);
2558		}
2559		object->ref_count--;
2560		vp->v_usecount--;
2561	} else {
2562		if (object->flags & OBJ_DEAD) {
2563			VOP_UNLOCK(vp, 0, p);
2564			tsleep(object, PVM, "vodead", 0);
2565			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
2566			goto retry;
2567		}
2568	}
2569
2570	if (vp->v_object)
2571		vp->v_flag |= VOBJBUF;
2572
2573retn:
2574	return error;
2575}
2576
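/*
 * Place an inactive vnode on the free list, removing it from the
 * to-be-freed list if necessary.  Vnodes marked VAGE are inserted at
 * the head so they are recycled first; the vnode is marked VFREE.
 */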
2577static void
2578vfree(vp)
2579	struct vnode *vp;
2580{
2581	int s;
2582
2583	s = splbio();
2584	simple_lock(&vnode_free_list_slock);
2585	if (vp->v_flag & VTBFREE) {
2586		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
2587		vp->v_flag &= ~VTBFREE;
2588	}
2589	if (vp->v_flag & VAGE) {
2590		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2591	} else {
2592		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
2593	}
2594	freevnodes++;
2595	simple_unlock(&vnode_free_list_slock);
2596	vp->v_flag &= ~VAGE;
2597	vp->v_flag |= VFREE;
2598	splx(s);
2599}
2600
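/*
 * Remove a vnode from the free (or to-be-freed) list when it becomes
 * busy again, clearing VFREE and VAGE.
 */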
2601void
2602vbusy(vp)
2603	struct vnode *vp;
2604{
2605	int s;
2606
2607	s = splbio();
2608	simple_lock(&vnode_free_list_slock);
2609	if (vp->v_flag & VTBFREE) {
2610		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
2611		vp->v_flag &= ~VTBFREE;
2612	} else {
2613		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2614		freevnodes--;
2615	}
2616	simple_unlock(&vnode_free_list_slock);
2617	vp->v_flag &= ~(VFREE|VAGE);
2618	splx(s);
2619}
2620
2621/*
2622 * Record a process's interest in events which might happen to
2623 * a vnode.  Because poll uses the historic select-style interface
2624 * internally, this routine serves as both the ``check for any
2625 * pending events'' and the ``record my interest in future events''
2626 * functions.  (These are done together, while the lock is held,
2627 * to avoid race conditions.)
2628 */
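/*
 * A filesystem's VOP_POLL routine would typically just return the result
 * of this function, e.g. (illustrative sketch only, not code from this
 * file):
 *
 *	return (vn_pollrecord(ap->a_vp, ap->a_p, ap->a_events));
 */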
2629int
2630vn_pollrecord(vp, p, events)
2631	struct vnode *vp;
2632	struct proc *p;
2633	short events;
2634{
2635	simple_lock(&vp->v_pollinfo.vpi_lock);
2636	if (vp->v_pollinfo.vpi_revents & events) {
2637		/*
2638		 * This leaves events we are not interested
2639		 * in available for the other process which
2640		 * presumably had requested them
2641		 * (otherwise they would never have been
2642		 * recorded).
2643		 */
2644		events &= vp->v_pollinfo.vpi_revents;
2645		vp->v_pollinfo.vpi_revents &= ~events;
2646
2647		simple_unlock(&vp->v_pollinfo.vpi_lock);
2648		return events;
2649	}
2650	vp->v_pollinfo.vpi_events |= events;
2651	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
2652	simple_unlock(&vp->v_pollinfo.vpi_lock);
2653	return 0;
2654}
2655
2656/*
2657 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
2658 * it is possible for us to miss an event due to race conditions, but
2659 * that condition is expected to be rare, so for the moment it is the
2660 * preferred interface.
2661 */
2662void
2663vn_pollevent(vp, events)
2664	struct vnode *vp;
2665	short events;
2666{
2667	simple_lock(&vp->v_pollinfo.vpi_lock);
2668	if (vp->v_pollinfo.vpi_events & events) {
2669		/*
2670		 * We clear vpi_events so that we don't
2671		 * call selwakeup() twice if two events are
2672		 * posted before the polling process(es) is
2673		 * awakened.  This also ensures that we take at
2674		 * most one selwakeup() if the polling process
2675		 * is no longer interested.  However, it does
2676		 * mean that only one event can be noticed at
2677		 * a time.  (Perhaps we should only clear those
2678		 * event bits which we note?) XXX
2679		 */
2680		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
2681		vp->v_pollinfo.vpi_revents |= events;
2682		selwakeup(&vp->v_pollinfo.vpi_selinfo);
2683	}
2684	simple_unlock(&vp->v_pollinfo.vpi_lock);
2685}
2686
2687/*
2688 * Wake up anyone polling on vp because it is being revoked.
2689 * This depends on dead_poll() returning POLLHUP for correct
2690 * behavior.
2691 */
2692void
2693vn_pollgone(vp)
2694	struct vnode *vp;
2695{
2696	simple_lock(&vp->v_pollinfo.vpi_lock);
2697	if (vp->v_pollinfo.vpi_events) {
2698		vp->v_pollinfo.vpi_events = 0;
2699		selwakeup(&vp->v_pollinfo.vpi_selinfo);
2700	}
2701	simple_unlock(&vp->v_pollinfo.vpi_lock);
2702}
2703
2704
2705
2706/*
2707 * Routine to create and manage a filesystem syncer vnode.
2708 */
2709#define sync_close ((int (*) __P((struct  vop_close_args *)))nullop)
2710static int	sync_fsync __P((struct  vop_fsync_args *));
2711static int	sync_inactive __P((struct  vop_inactive_args *));
2712static int	sync_reclaim  __P((struct  vop_reclaim_args *));
2713#define sync_lock ((int (*) __P((struct  vop_lock_args *)))vop_nolock)
2714#define sync_unlock ((int (*) __P((struct  vop_unlock_args *)))vop_nounlock)
2715static int	sync_print __P((struct vop_print_args *));
2716#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)
2717
2718static vop_t **sync_vnodeop_p;
2719static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
2720	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
2721	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
2722	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
2723	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
2724	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
2725	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
2726	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
2727	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
2728	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
2729	{ NULL, NULL }
2730};
2731static struct vnodeopv_desc sync_vnodeop_opv_desc =
2732	{ &sync_vnodeop_p, sync_vnodeop_entries };
2733
2734VNODEOP_SET(sync_vnodeop_opv_desc);
2735
2736/*
2737 * Create a new filesystem syncer vnode for the specified mount point.
2738 */
2739int
2740vfs_allocate_syncvnode(mp)
2741	struct mount *mp;
2742{
2743	struct vnode *vp;
2744	static long start, incr, next;
2745	int error;
2746
2747	/* Allocate a new vnode */
2748	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
2749		mp->mnt_syncer = NULL;
2750		return (error);
2751	}
2752	vp->v_type = VNON;
2753	/*
2754	 * Place the vnode onto the syncer worklist. We attempt to
2755	 * scatter them about on the list so that they will go off
2756	 * at evenly distributed times even if all the filesystems
2757	 * are mounted at once.
2758	 */
2759	next += incr;
2760	if (next == 0 || next > syncer_maxdelay) {
2761		start /= 2;
2762		incr /= 2;
2763		if (start == 0) {
2764			start = syncer_maxdelay / 2;
2765			incr = syncer_maxdelay;
2766		}
2767		next = start;
2768	}
2769	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
2770	mp->mnt_syncer = vp;
2771	return (0);
2772}
2773
2774/*
2775 * Do a lazy sync of the filesystem.
2776 */
2777static int
2778sync_fsync(ap)
2779	struct vop_fsync_args /* {
2780		struct vnode *a_vp;
2781		struct ucred *a_cred;
2782		int a_waitfor;
2783		struct proc *a_p;
2784	} */ *ap;
2785{
2786	struct vnode *syncvp = ap->a_vp;
2787	struct mount *mp = syncvp->v_mount;
2788	struct proc *p = ap->a_p;
2789	int asyncflag;
2790
2791	/*
2792	 * We only need to do something if this is a lazy evaluation.
2793	 */
2794	if (ap->a_waitfor != MNT_LAZY)
2795		return (0);
2796
2797	/*
2798	 * Move ourselves to the back of the sync list.
2799	 */
2800	vn_syncer_add_to_worklist(syncvp, syncdelay);
2801
2802	/*
2803	 * Walk the list of vnodes pushing all that are dirty and
2804	 * not already on the sync list.
2805	 */
2806	simple_lock(&mountlist_slock);
2807	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
2808		simple_unlock(&mountlist_slock);
2809		return (0);
2810	}
2811	asyncflag = mp->mnt_flag & MNT_ASYNC;
2812	mp->mnt_flag &= ~MNT_ASYNC;
2813	vfs_msync(mp, MNT_NOWAIT);
2814	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
2815	if (asyncflag)
2816		mp->mnt_flag |= MNT_ASYNC;
2817	vfs_unbusy(mp, p);
2818	return (0);
2819}
2820
2821/*
2822 * The syncer vnode is no longer referenced.
2823 */
2824static int
2825sync_inactive(ap)
2826	struct vop_inactive_args /* {
2827		struct vnode *a_vp;
2828		struct proc *a_p;
2829	} */ *ap;
2830{
2831
2832	vgone(ap->a_vp);
2833	return (0);
2834}
2835
2836/*
2837 * The syncer vnode is no longer needed and is being decommissioned.
2838 */
2839static int
2840sync_reclaim(ap)
2841	struct vop_reclaim_args /* {
2842		struct vnode *a_vp;
2843	} */ *ap;
2844{
2845	struct vnode *vp = ap->a_vp;
2846
2847	vp->v_mount->mnt_syncer = NULL;
2848	if (vp->v_flag & VONWORKLST) {
2849		LIST_REMOVE(vp, v_synclist);
2850		vp->v_flag &= ~VONWORKLST;
2851	}
2852
2853	return (0);
2854}
2855
2856/*
2857 * Print out a syncer vnode.
2858 */
2859static int
2860sync_print(ap)
2861	struct vop_print_args /* {
2862		struct vnode *a_vp;
2863	} */ *ap;
2864{
2865	struct vnode *vp = ap->a_vp;
2866
2867	printf("syncer vnode");
2868	if (vp->v_vnlock != NULL)
2869		lockmgr_printinfo(vp->v_vnlock);
2870	printf("\n");
2871	return (0);
2872}
2873