vfs_subr.c revision 34577
1/*
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
39 * $Id: vfs_subr.c,v 1.139 1998/03/14 02:55:01 tegge Exp $
40 */
41
42/*
43 * External virtual filesystem routines
44 */
45#include "opt_ddb.h"
46#include "opt_devfs.h"
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/kernel.h>
51#include <sys/proc.h>
52#include <sys/malloc.h>
53#include <sys/mount.h>
54#include <sys/vnode.h>
55#include <sys/stat.h>
56#include <sys/buf.h>
57#include <sys/poll.h>
58#include <sys/domain.h>
59#include <sys/dirent.h>
60#include <sys/vmmeter.h>
61
62#include <machine/limits.h>
63
64#include <vm/vm.h>
65#include <vm/vm_object.h>
66#include <vm/vm_extern.h>
67#include <vm/pmap.h>
68#include <vm/vm_map.h>
69#include <vm/vm_pager.h>
70#include <vm/vnode_pager.h>
71#include <vm/vm_zone.h>
72#include <sys/sysctl.h>
73
74#include <miscfs/specfs/specdev.h>
75
76static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
77
78static void	insmntque __P((struct vnode *vp, struct mount *mp));
79#ifdef DDB
80static void	printlockedvnodes __P((void));
81#endif
82static void	vclean __P((struct vnode *vp, int flags, struct proc *p));
83static void	vfree __P((struct vnode *));
84static void	vgonel __P((struct vnode *vp, struct proc *p));
85static unsigned long	numvnodes;
86SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
87
88enum vtype iftovt_tab[16] = {
89	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
90	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
91};
92int vttoif_tab[9] = {
93	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
94	S_IFSOCK, S_IFIFO, S_IFMT,
95};
96
97/*
98 * Insq/Remq for the vnode usage lists.
99 */
100#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
101#define	bufremvn(bp) {							\
102	LIST_REMOVE(bp, b_vnbufs);					\
103	(bp)->b_vnbufs.le_next = NOLIST;				\
104}
105
106static TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */
107struct tobefreelist vnode_tobefree_list;	/* vnodes waiting to be freed */
108
109static u_long wantfreevnodes = 25;
110SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
111static u_long freevnodes = 0;
112SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
113
114int vfs_ioopt = 0;
115#ifdef REALLYBADBUG
116SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
117#endif
118
119struct mntlist mountlist;	/* mounted filesystem list */
120struct simplelock mountlist_slock;
121static struct simplelock mntid_slock;
122struct simplelock mntvnode_slock;
123static struct simplelock vnode_free_list_slock;
124static struct simplelock spechash_slock;
125struct nfs_public nfs_pub;	/* publicly exported FS */
126static vm_zone_t vnode_zone;
127
128/*
129 * The workitem queue.
130 */
131#define SYNCER_MAXDELAY		32
132int syncer_maxdelay =		SYNCER_MAXDELAY;	/* maximum delay time */
133time_t syncdelay =		30;
134int rushjob;				/* number of slots to run ASAP */
135
136static int syncer_delayno = 0;
137static long syncer_mask;
138LIST_HEAD(synclist, vnode);
139static struct synclist *syncer_workitem_pending;
140
141int desiredvnodes;
142SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "");
143
144static void	vfs_free_addrlist __P((struct netexport *nep));
145static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
146static int	vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
147				       struct export_args *argp));
148
149/*
150 * Initialize the vnode management data structures.
151 */
152void
153vntblinit()
154{
155
156	desiredvnodes = maxproc + cnt.v_page_count / 4;
157	simple_lock_init(&mntvnode_slock);
158	simple_lock_init(&mntid_slock);
159	simple_lock_init(&spechash_slock);
160	TAILQ_INIT(&vnode_free_list);
161	TAILQ_INIT(&vnode_tobefree_list);
162	simple_lock_init(&vnode_free_list_slock);
163	CIRCLEQ_INIT(&mountlist);
164	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
165	/*
166	 * Initialize the filesystem syncer.
167	 */
168	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
169		&syncer_mask);
170	syncer_maxdelay = syncer_mask + 1;
171}
172
173/*
174 * Mark a mount point as busy. Used to synchronize access and to delay
175 * unmounting. Interlock is not released on failure.
176 */
177int
178vfs_busy(mp, flags, interlkp, p)
179	struct mount *mp;
180	int flags;
181	struct simplelock *interlkp;
182	struct proc *p;
183{
184	int lkflags;
185
186	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
187		if (flags & LK_NOWAIT)
188			return (ENOENT);
189		mp->mnt_kern_flag |= MNTK_MWAIT;
190		if (interlkp) {
191			simple_unlock(interlkp);
192		}
193		/*
194		 * Since all busy locks are shared except the exclusive
195		 * lock granted when unmounting, the only place that a
196		 * wakeup needs to be done is at the release of the
197		 * exclusive lock at the end of dounmount.
198		 */
199		tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
200		if (interlkp) {
201			simple_lock(interlkp);
202		}
203		return (ENOENT);
204	}
205	lkflags = LK_SHARED | LK_NOPAUSE;
206	if (interlkp)
207		lkflags |= LK_INTERLOCK;
208	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
209		panic("vfs_busy: unexpected lock failure");
210	return (0);
211}
212
213/*
214 * Free a busy filesystem.
215 */
216void
217vfs_unbusy(mp, p)
218	struct mount *mp;
219	struct proc *p;
220{
221
222	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
223}
224
225/*
226 * Lookup a filesystem type, and if found allocate and initialize
227 * a mount structure for it.
228 *
229 * Devname is usually updated by mount(8) after booting.
230 */
231int
232vfs_rootmountalloc(fstypename, devname, mpp)
233	char *fstypename;
234	char *devname;
235	struct mount **mpp;
236{
237	struct proc *p = curproc;	/* XXX */
238	struct vfsconf *vfsp;
239	struct mount *mp;
240
241	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
242		if (!strcmp(vfsp->vfc_name, fstypename))
243			break;
244	if (vfsp == NULL)
245		return (ENODEV);
246	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
247	bzero((char *)mp, (u_long)sizeof(struct mount));
248	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
249	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
250	LIST_INIT(&mp->mnt_vnodelist);
251	mp->mnt_vfc = vfsp;
252	mp->mnt_op = vfsp->vfc_vfsops;
253	mp->mnt_flag = MNT_RDONLY;
254	mp->mnt_vnodecovered = NULLVP;
255	vfsp->vfc_refcount++;
256	mp->mnt_stat.f_type = vfsp->vfc_typenum;
257	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
258	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
259	mp->mnt_stat.f_mntonname[0] = '/';
260	mp->mnt_stat.f_mntonname[1] = 0;
261	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
262	*mpp = mp;
263	return (0);
264}
265
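/*
 * Hedged usage sketch (not part of this file): a filesystem's mountroot
 * routine is the usual caller of vfs_rootmountalloc().  Assuming an
 * FFS-style ffs_mountroot(), the sequence looks roughly like:
 *
 *	struct mount *mp;
 *	int error;
 *
 *	if ((error = vfs_rootmountalloc("ufs", "root_device", &mp)) != 0)
 *		return (error);
 *	if ((error = ffs_mountfs(rootvp, mp, p)) != 0) {
 *		mp->mnt_vfc->vfc_refcount--;
 *		vfs_unbusy(mp, p);
 *		free(mp, M_MOUNT);
 *		return (error);
 *	}
 *	simple_lock(&mountlist_slock);
 *	CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
 *	simple_unlock(&mountlist_slock);
 *
 * The "ufs"/"root_device" names and ffs_mountfs() are FFS-flavoured
 * assumptions; the point is that the mount returned above is already
 * busied (LK_NOWAIT) and marked read-only, and the caller finishes the job.
 */
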
266/*
267 * Find an appropriate filesystem to use for the root. If a filesystem
268 * has not been preselected, walk through the list of known filesystems
269 * trying those that have mountroot routines, and try them until one
270 * works or we have tried them all.
271 */
272#ifdef notdef	/* XXX JH */
273int
274lite2_vfs_mountroot()
275{
276	struct vfsconf *vfsp;
277	extern int (*lite2_mountroot) __P((void));
278	int error;
279
280	if (lite2_mountroot != NULL)
281		return ((*lite2_mountroot)());
282	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
283		if (vfsp->vfc_mountroot == NULL)
284			continue;
285		if ((error = (*vfsp->vfc_mountroot)()) == 0)
286			return (0);
287		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
288	}
289	return (ENODEV);
290}
291#endif
292
293/*
294 * Lookup a mount point by filesystem identifier.
295 */
296struct mount *
297vfs_getvfs(fsid)
298	fsid_t *fsid;
299{
300	register struct mount *mp;
301
302	simple_lock(&mountlist_slock);
303	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
304	    mp = mp->mnt_list.cqe_next) {
305		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
306		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
307			simple_unlock(&mountlist_slock);
308			return (mp);
309		}
310	}
311	simple_unlock(&mountlist_slock);
312	return ((struct mount *) 0);
313}
314
315/*
316 * Get a new unique fsid
317 */
318void
319vfs_getnewfsid(mp)
320	struct mount *mp;
321{
322	static u_short xxxfs_mntid;
323
324	fsid_t tfsid;
325	int mtype;
326
327	simple_lock(&mntid_slock);
328	mtype = mp->mnt_vfc->vfc_typenum;
329	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
330	mp->mnt_stat.f_fsid.val[1] = mtype;
331	if (xxxfs_mntid == 0)
332		++xxxfs_mntid;
333	tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
334	tfsid.val[1] = mtype;
335	if (mountlist.cqh_first != (void *)&mountlist) {
336		while (vfs_getvfs(&tfsid)) {
337			tfsid.val[0]++;
338			xxxfs_mntid++;
339		}
340	}
341	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
342	simple_unlock(&mntid_slock);
343}
344
345/*
346 * Set vnode attributes to VNOVAL
347 */
348void
349vattr_null(vap)
350	register struct vattr *vap;
351{
352
353	vap->va_type = VNON;
354	vap->va_size = VNOVAL;
355	vap->va_bytes = VNOVAL;
356	vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
357	    vap->va_fsid = vap->va_fileid =
358	    vap->va_blocksize = vap->va_rdev =
359	    vap->va_atime.tv_sec = vap->va_atime.tv_nsec =
360	    vap->va_mtime.tv_sec = vap->va_mtime.tv_nsec =
361	    vap->va_ctime.tv_sec = vap->va_ctime.tv_nsec =
362	    vap->va_flags = vap->va_gen = VNOVAL;
363	vap->va_vaflags = 0;
364}
365
366/*
367 * Routines having to do with the management of the vnode table.
368 */
369extern vop_t **dead_vnodeop_p;
370
371/*
372 * Return the next vnode from the free list.
373 */
374int
375getnewvnode(tag, mp, vops, vpp)
376	enum vtagtype tag;
377	struct mount *mp;
378	vop_t **vops;
379	struct vnode **vpp;
380{
381	int s;
382	struct proc *p = curproc;	/* XXX */
383	struct vnode *vp, *tvp, *nvp;
384	vm_object_t object;
385	TAILQ_HEAD(freelst, vnode) vnode_tmp_list;
386
387	/*
388	 * We take the least recently used vnode from the freelist
389	 * if we can get it and it has no cached pages and no
390	 * namecache entries refer to it.
391	 * Otherwise we allocate a new vnode.
392	 */
393
394	s = splbio();
395	simple_lock(&vnode_free_list_slock);
396	TAILQ_INIT(&vnode_tmp_list);
397
398	for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) {
399		nvp = TAILQ_NEXT(vp, v_freelist);
400		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
401		if (vp->v_flag & VAGE) {
402			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
403		} else {
404			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
405		}
406		vp->v_flag &= ~(VTBFREE|VAGE);
407		vp->v_flag |= VFREE;
408		if (vp->v_usecount)
409			panic("tobe free vnode isn't");
410		freevnodes++;
411	}
412
413	if (wantfreevnodes && freevnodes < wantfreevnodes) {
414		vp = NULL;
415	} else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
416		/*
417		 * XXX: this is only here to be backwards compatible
418		 */
419		vp = NULL;
420	} else {
421		for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) {
422
423			nvp = TAILQ_NEXT(vp, v_freelist);
424
425			if (!simple_lock_try(&vp->v_interlock))
426				continue;
427			if (vp->v_usecount)
428				panic("free vnode isn't");
429
430			object = vp->v_object;
431			if (object && (object->resident_page_count || object->ref_count)) {
432				printf("object inconsistent state: RPC: %d, RC: %d\n",
433					object->resident_page_count, object->ref_count);
434				/* Don't recycle if it's caching some pages */
435				TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
436				TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist);
437				continue;
438			} else if (LIST_FIRST(&vp->v_cache_src)) {
439				/* Don't recycle if active in the namecache */
440				simple_unlock(&vp->v_interlock);
441				continue;
442			} else {
443				break;
444			}
445		}
446	}
447
448	for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) {
449		nvp = TAILQ_NEXT(tvp, v_freelist);
450		TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist);
451		TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist);
452		simple_unlock(&tvp->v_interlock);
453	}
454
455	if (vp) {
456		vp->v_flag |= VDOOMED;
457		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
458		freevnodes--;
459		simple_unlock(&vnode_free_list_slock);
460		cache_purge(vp);
461		vp->v_lease = NULL;
462		if (vp->v_type != VBAD) {
463			vgonel(vp, p);
464		} else {
465			simple_unlock(&vp->v_interlock);
466		}
467
468#ifdef DIAGNOSTIC
469		{
470			int s;
471
472			if (vp->v_data)
473				panic("cleaned vnode isn't");
474			s = splbio();
475			if (vp->v_numoutput)
476				panic("Clean vnode has pending I/O's");
477			splx(s);
478		}
479#endif
480		vp->v_flag = 0;
481		vp->v_lastr = 0;
482		vp->v_lastw = 0;
483		vp->v_lasta = 0;
484		vp->v_cstart = 0;
485		vp->v_clen = 0;
486		vp->v_socket = 0;
487		vp->v_writecount = 0;	/* XXX */
488		vp->v_maxio = 0;
489	} else {
490		simple_unlock(&vnode_free_list_slock);
491		vp = (struct vnode *) zalloc(vnode_zone);
492		bzero((char *) vp, sizeof *vp);
493		simple_lock_init(&vp->v_interlock);
494		vp->v_dd = vp;
495		cache_purge(vp);
496		LIST_INIT(&vp->v_cache_src);
497		TAILQ_INIT(&vp->v_cache_dst);
498		numvnodes++;
499	}
500
501	vp->v_type = VNON;
502	vp->v_tag = tag;
503	vp->v_op = vops;
504	insmntque(vp, mp);
505	*vpp = vp;
506	vp->v_usecount = 1;
507	vp->v_data = 0;
508	splx(s);
509
510	vfs_object_create(vp, p, p->p_ucred, TRUE);
511	return (0);
512}
513
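/*
 * Hedged usage sketch (assumed caller, not from this file): a filesystem's
 * VFS_VGET-style routine obtains a fresh vnode here and then attaches its
 * own per-file data, roughly:
 *
 *	struct vnode *vp;
 *	int error;
 *
 *	if ((error = getnewvnode(VT_UFS, mp, ffs_vnodeop_p, &vp)) != 0)
 *		return (error);
 *	vp->v_data = ip;
 *	ip->i_vnode = vp;
 *
 * where ip is the filesystem-private inode.  VT_UFS, ffs_vnodeop_p and the
 * inode fields are FFS-flavoured assumptions; note that the vnode returned
 * above already holds one reference (v_usecount == 1).
 */
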
514/*
515 * Move a vnode from one mount queue to another.
516 */
517static void
518insmntque(vp, mp)
519	register struct vnode *vp;
520	register struct mount *mp;
521{
522
523	simple_lock(&mntvnode_slock);
524	/*
525	 * Delete from old mount point vnode list, if on one.
526	 */
527	if (vp->v_mount != NULL)
528		LIST_REMOVE(vp, v_mntvnodes);
529	/*
530	 * Insert into list of vnodes for the new mount point, if available.
531	 */
532	if ((vp->v_mount = mp) == NULL) {
533		simple_unlock(&mntvnode_slock);
534		return;
535	}
536	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
537	simple_unlock(&mntvnode_slock);
538}
539
540/*
541 * Update outstanding I/O count and do wakeup if requested.
542 */
543void
544vwakeup(bp)
545	register struct buf *bp;
546{
547	register struct vnode *vp;
548
549	bp->b_flags &= ~B_WRITEINPROG;
550	if ((vp = bp->b_vp)) {
551		vp->v_numoutput--;
552		if (vp->v_numoutput < 0)
553			panic("vwakeup: neg numoutput");
554		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
555			vp->v_flag &= ~VBWAIT;
556			wakeup((caddr_t) &vp->v_numoutput);
557		}
558	}
559}
560
561/*
562 * Flush out and invalidate all buffers associated with a vnode.
563 * Called with the underlying object locked.
564 */
565int
566vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
567	register struct vnode *vp;
568	int flags;
569	struct ucred *cred;
570	struct proc *p;
571	int slpflag, slptimeo;
572{
573	register struct buf *bp;
574	struct buf *nbp, *blist;
575	int s, error;
576	vm_object_t object;
577
578	if ((flags & V_SAVE) && vp->v_dirtyblkhd.lh_first != NULL) {
579		if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)))
580			return (error);
581		if (vp->v_dirtyblkhd.lh_first != NULL)
582			panic("vinvalbuf: dirty bufs");
583	}
584
585	s = splbio();
586	for (;;) {
587		if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA))
588			while (blist && blist->b_lblkno < 0)
589				blist = blist->b_vnbufs.le_next;
590		if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
591		    (flags & V_SAVEMETA))
592			while (blist && blist->b_lblkno < 0)
593				blist = blist->b_vnbufs.le_next;
594		if (!blist)
595			break;
596
597		for (bp = blist; bp; bp = nbp) {
598			nbp = bp->b_vnbufs.le_next;
599			if ((flags & V_SAVEMETA) && bp->b_lblkno < 0)
600				continue;
601			if (bp->b_flags & B_BUSY) {
602				bp->b_flags |= B_WANTED;
603				error = tsleep((caddr_t) bp,
604				    slpflag | (PRIBIO + 1), "vinvalbuf",
605				    slptimeo);
606				if (error) {
607					splx(s);
608					return (error);
609				}
610				break;
611			}
612			bremfree(bp);
613			bp->b_flags |= B_BUSY;
614			/*
615			 * XXX Since there are no node locks for NFS, I
616			 * believe there is a slight chance that a delayed
617			 * write will occur while sleeping just above, so
618			 * check for it.
619			 */
620			if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
621				if (bp->b_vp == vp) {
622					if (bp->b_flags & B_CLUSTEROK) {
623						vfs_bio_awrite(bp);
624					} else {
625						bp->b_flags |= B_ASYNC;
626						VOP_BWRITE(bp);
627					}
628				} else {
629					(void) VOP_BWRITE(bp);
630				}
631				break;
632			}
633			bp->b_flags |= (B_INVAL|B_NOCACHE|B_RELBUF);
634			brelse(bp);
635		}
636	}
637
638	while (vp->v_numoutput > 0) {
639		vp->v_flag |= VBWAIT;
640		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
641	}
642
643	splx(s);
644
645	/*
646	 * Destroy the copy in the VM cache, too.
647	 */
648	simple_lock(&vp->v_interlock);
649	object = vp->v_object;
650	if (object != NULL) {
651		if (flags & V_SAVEMETA)
652			vm_object_page_remove(object, 0, object->size,
653				(flags & V_SAVE) ? TRUE : FALSE);
654		else
655			vm_object_page_remove(object, 0, 0,
656				(flags & V_SAVE) ? TRUE : FALSE);
657	}
658	simple_unlock(&vp->v_interlock);
659
660	if (!(flags & V_SAVEMETA) &&
661	    (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first))
662		panic("vinvalbuf: flush failed");
663	return (0);
664}
665
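/*
 * Hedged usage sketch (assumed callers, not from this file): a filesystem
 * that is about to discard a vnode's backing store, e.g. in a reclaim or
 * truncate-to-zero path, flushes or invalidates its buffers first:
 *
 *	error = vinvalbuf(vp, V_SAVE, cred, p, 0, 0);
 *	error = vinvalbuf(vp, 0, cred, p, PCATCH, hz);
 *
 * The first form writes dirty data before invalidating; the second throws
 * everything away and sleeps interruptibly with a one second timeout.
 * vclean() below uses the V_SAVE form with NOCRED when it disassociates a
 * vnode from its filesystem.
 */
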
666/*
667 * Associate a buffer with a vnode.
668 */
669void
670bgetvp(vp, bp)
671	register struct vnode *vp;
672	register struct buf *bp;
673{
674	int s;
675
676#if defined(DIAGNOSTIC)
677	if (bp->b_vp)
678		panic("bgetvp: not free");
679#endif
680	vhold(vp);
681	bp->b_vp = vp;
682	if (vp->v_type == VBLK || vp->v_type == VCHR)
683		bp->b_dev = vp->v_rdev;
684	else
685		bp->b_dev = NODEV;
686	/*
687	 * Insert onto list for new vnode.
688	 */
689	s = splbio();
690	bufinsvn(bp, &vp->v_cleanblkhd);
691	splx(s);
692}
693
694/*
695 * Disassociate a buffer from a vnode.
696 */
697void
698brelvp(bp)
699	register struct buf *bp;
700{
701	struct vnode *vp;
702	int s;
703
704#if defined(DIAGNOSTIC)
705	if (bp->b_vp == (struct vnode *) 0)
706		panic("brelvp: NULL");
707#endif
708
709	/*
710	 * Delete from old vnode list, if on one.
711	 */
712	vp = bp->b_vp;
713	s = splbio();
714	if (bp->b_vnbufs.le_next != NOLIST)
715		bufremvn(bp);
716	if ((vp->v_flag & VONWORKLST) && (LIST_FIRST(&vp->v_dirtyblkhd) == NULL)) {
717		vp->v_flag &= ~VONWORKLST;
718		LIST_REMOVE(vp, v_synclist);
719	}
720	splx(s);
721	bp->b_vp = (struct vnode *) 0;
722	vdrop(vp);
723}
724
725/*
726 * The workitem queue.
727 *
728 * It is useful to delay writes of file data and filesystem metadata
729 * for tens of seconds so that quickly created and deleted files need
730 * not waste disk bandwidth being created and removed. To realize this,
731 * we append vnodes to a "workitem" queue. When running with a soft
732 * updates implementation, most pending metadata dependencies should
733 * not wait for more than a few seconds. Thus, device vnodes that have
734 * filesystems mounted on them are delayed only about half the time that
735 * file data is delayed. Similarly, directory updates are more critical,
736 * so they are delayed only about a third the time that file data is
737 * delayed. Thus, there are SYNCER_MAXDELAY queues that are processed
738 * round-robin at a rate of one each second (driven off the filesystem
739 * syncer process). The syncer_delayno variable indicates the next queue to be processed.
740 * Items that need to be processed soon are placed in this queue:
741 *
742 *	syncer_workitem_pending[syncer_delayno]
743 *
744 * A delay of fifteen seconds is done by placing the request fifteen
745 * entries later in the queue:
746 *
747 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
748 *
749 */
750
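/*
 * Illustrative sketch (an assumed caller, not part of the original
 * comment): a filesystem hands a dirtied vnode to the syncer with the
 * delay class it wants, and the slot arithmetic described above places it
 * that many seconds ahead of the current scan position:
 *
 *	vn_syncer_add_to_worklist(vp, syncdelay);
 *	vn_syncer_add_to_worklist(dvp, syncdelay / 3);
 *
 * The first call is the file-data case (about 30 seconds); the second is
 * the directory case.  With syncer_maxdelay == 32, a delay of 15 while
 * syncer_delayno == 30 maps to slot (30 + 15) & 31 == 13, i.e. fifteen
 * one-second ticks from now.  reassignbuf() below is where these delay
 * classes are actually chosen.
 */
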
751/*
752 * Add an item to the syncer work queue.
753 */
754void
755vn_syncer_add_to_worklist(vp, delay)
756	struct vnode *vp;
757	int delay;
758{
759	int s, slot;
760
761	s = splbio();
762
763	if (vp->v_flag & VONWORKLST) {
764		LIST_REMOVE(vp, v_synclist);
765	}
766
767	if (delay > syncer_maxdelay - 2)
768		delay = syncer_maxdelay - 2;
769	slot = (syncer_delayno + delay) & syncer_mask;
770
771	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
772	vp->v_flag |= VONWORKLST;
773	splx(s);
774}
775
776static void sched_sync __P((void));
777static struct	proc *updateproc;
778static struct kproc_desc up_kp = {
779	"syncer",
780	sched_sync,
781	&updateproc
782};
783SYSINIT_KT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
784
785/*
786 * System filesystem synchronizer daemon.
787 */
788void
789sched_sync(void)
790{
791	struct synclist *slp;
792	struct vnode *vp;
793	long starttime;
794	int s;
795	struct proc *p = updateproc;
796
797	for (;;) {
798		starttime = time.tv_sec;
799
800		/*
801		 * Push files whose dirty time has expired.
802		 */
803		s = splbio();
804		slp = &syncer_workitem_pending[syncer_delayno];
805		syncer_delayno += 1;
806		if (syncer_delayno == syncer_maxdelay)
807			syncer_delayno = 0;
808		splx(s);
809
810		while ((vp = LIST_FIRST(slp)) != NULL) {
811			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
812			(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
813			VOP_UNLOCK(vp, 0, p);
814			if (LIST_FIRST(slp) == vp) {
815				if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL &&
816				    vp->v_type != VBLK)
817					panic("sched_sync: fsync failed");
818				/*
819				 * Move ourselves to the back of the sync list.
820				 */
821				LIST_REMOVE(vp, v_synclist);
822				vn_syncer_add_to_worklist(vp, syncdelay);
823			}
824		}
825
826		/*
827		 * Do soft update processing.
828		 */
829		if (bioops.io_sync)
830			(*bioops.io_sync)(NULL);
831
832		/*
833		 * The variable rushjob allows the kernel to speed up the
834		 * processing of the filesystem syncer process. A rushjob
835		 * value of N tells the filesystem syncer to process the next
836		 * N seconds worth of work on its queue ASAP. Currently rushjob
837		 * is used by the soft update code to speed up the filesystem
838		 * syncer process when the incore state is getting so far
839		 * ahead of the disk that the kernel memory pool is being
840		 * threatened with exhaustion.
841		 */
842		if (rushjob > 0) {
843			rushjob -= 1;
844			continue;
845		}
846		/*
847		 * If it has taken us less than a second to process the
848		 * current work, then wait. Otherwise start right over
849		 * again. We can still lose time if any single round
850		 * takes more than two seconds, but it does not really
851		 * matter as we are just trying to generally pace the
852		 * filesystem activity.
853		 */
854		if (time.tv_sec == starttime)
855			tsleep(&lbolt, PPAUSE, "syncer", 0);
856	}
857}
858
859/*
860 * Associate a p-buffer with a vnode.
861 */
862void
863pbgetvp(vp, bp)
864	register struct vnode *vp;
865	register struct buf *bp;
866{
867#if defined(DIAGNOSTIC)
868	if (bp->b_vp)
869		panic("pbgetvp: not free");
870#endif
871	bp->b_vp = vp;
872	if (vp->v_type == VBLK || vp->v_type == VCHR)
873		bp->b_dev = vp->v_rdev;
874	else
875		bp->b_dev = NODEV;
876}
877
878/*
879 * Disassociate a p-buffer from a vnode.
880 */
881void
882pbrelvp(bp)
883	register struct buf *bp;
884{
885
886#if defined(DIAGNOSTIC)
887	if (bp->b_vp == (struct vnode *) 0)
888		panic("pbrelvp: NULL");
889#endif
890
891	bp->b_vp = (struct vnode *) 0;
892}
893
894/*
895 * Reassign a buffer from one vnode to another.
896 * Used to assign file specific control information
897 * (indirect blocks) to the vnode to which they belong.
898 */
899void
900reassignbuf(bp, newvp)
901	register struct buf *bp;
902	register struct vnode *newvp;
903{
904	struct buflists *listheadp;
905	int delay;
906	int s;
907
908	if (newvp == NULL) {
909		printf("reassignbuf: NULL\n");
910		return;
911	}
912
913	s = splbio();
914	/*
915	 * Delete from old vnode list, if on one.
916	 */
917	if (bp->b_vnbufs.le_next != NOLIST) {
918		bufremvn(bp);
919		vdrop(bp->b_vp);
920	}
921	/*
922	 * If dirty, put on list of dirty buffers; otherwise insert onto list
923	 * of clean buffers.
924	 */
925	if (bp->b_flags & B_DELWRI) {
926		struct buf *tbp;
927
928		listheadp = &newvp->v_dirtyblkhd;
929		if ((newvp->v_flag & VONWORKLST) == 0) {
930			switch (newvp->v_type) {
931			case VDIR:
932				delay = syncdelay / 3;
933				break;
934			case VBLK:
935				if (newvp->v_specmountpoint != NULL) {
936					delay = syncdelay / 2;
937					break;
938				}
939				/* fall through */
940			default:
941				delay = syncdelay;
942			}
943			vn_syncer_add_to_worklist(newvp, delay);
944		}
945		tbp = listheadp->lh_first;
946		if (!tbp || (tbp->b_lblkno > bp->b_lblkno)) {
947			bufinsvn(bp, listheadp);
948		} else {
949			while (tbp->b_vnbufs.le_next &&
950			    (tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) {
951				tbp = tbp->b_vnbufs.le_next;
952			}
953			LIST_INSERT_AFTER(tbp, bp, b_vnbufs);
954		}
955	} else {
956		bufinsvn(bp, &newvp->v_cleanblkhd);
957		if ((newvp->v_flag & VONWORKLST) &&
958			LIST_FIRST(&newvp->v_dirtyblkhd) == NULL) {
959			newvp->v_flag &= ~VONWORKLST;
960			LIST_REMOVE(newvp, v_synclist);
961		}
962	}
963	bp->b_vp = newvp;
964	vhold(bp->b_vp);
965	splx(s);
966}
967
968#ifndef DEVFS_ROOT
969/*
970 * Create a vnode for a block device.
971 * Used for mounting the root file system.
972 */
973int
974bdevvp(dev, vpp)
975	dev_t dev;
976	struct vnode **vpp;
977{
978	register struct vnode *vp;
979	struct vnode *nvp;
980	int error;
981
982	if (dev == NODEV)
983		return (0);
984	error = getnewvnode(VT_NON, (struct mount *) 0, spec_vnodeop_p, &nvp);
985	if (error) {
986		*vpp = 0;
987		return (error);
988	}
989	vp = nvp;
990	vp->v_type = VBLK;
991	if ((nvp = checkalias(vp, dev, (struct mount *) 0))) {
992		vput(vp);
993		vp = nvp;
994	}
995	*vpp = vp;
996	return (0);
997}
998#endif /* !DEVFS_ROOT */
999
1000/*
1001 * Check to see if the new vnode represents a special device
1002 * for which we already have a vnode (either because of
1003 * bdevvp() or because of a different vnode representing
1004 * the same block device). If such an alias exists, deallocate
1005 * the existing contents and return the aliased vnode. The
1006 * caller is responsible for filling it with its new contents.
1007 */
1008struct vnode *
1009checkalias(nvp, nvp_rdev, mp)
1010	register struct vnode *nvp;
1011	dev_t nvp_rdev;
1012	struct mount *mp;
1013{
1014	struct proc *p = curproc;	/* XXX */
1015	struct vnode *vp;
1016	struct vnode **vpp;
1017
1018	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1019		return (NULLVP);
1020
1021	vpp = &speclisth[SPECHASH(nvp_rdev)];
1022loop:
1023	simple_lock(&spechash_slock);
1024	for (vp = *vpp; vp; vp = vp->v_specnext) {
1025		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
1026			continue;
1027		/*
1028		 * Alias, but not in use, so flush it out.
1029		 */
1030		simple_lock(&vp->v_interlock);
1031		if (vp->v_usecount == 0) {
1032			simple_unlock(&spechash_slock);
1033			vgonel(vp, p);
1034			goto loop;
1035		}
1036		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
1037			simple_unlock(&spechash_slock);
1038			goto loop;
1039		}
1040		break;
1041	}
1042	if (vp == NULL || vp->v_tag != VT_NON) {
1043		MALLOC(nvp->v_specinfo, struct specinfo *,
1044		    sizeof(struct specinfo), M_VNODE, M_WAITOK);
1045		nvp->v_rdev = nvp_rdev;
1046		nvp->v_hashchain = vpp;
1047		nvp->v_specnext = *vpp;
1048		nvp->v_specmountpoint = NULL;
1049		simple_unlock(&spechash_slock);
1050		*vpp = nvp;
1051		if (vp != NULLVP) {
1052			nvp->v_flag |= VALIASED;
1053			vp->v_flag |= VALIASED;
1054			vput(vp);
1055		}
1056		return (NULLVP);
1057	}
1058	simple_unlock(&spechash_slock);
1059	VOP_UNLOCK(vp, 0, p);
1060	simple_lock(&vp->v_interlock);
1061	vclean(vp, 0, p);
1062	vp->v_op = nvp->v_op;
1063	vp->v_tag = nvp->v_tag;
1064	nvp->v_type = VNON;
1065	insmntque(vp, mp);
1066	return (vp);
1067}
1068
1069/*
1070 * Grab a particular vnode from the free list, increment its
1071 * reference count and lock it. The vnode lock bit is set if the
1072 * vnode is being eliminated in vgone. The process is awakened
1073 * when the transition is completed, and an error returned to
1074 * indicate that the vnode is no longer usable (possibly having
1075 * been changed to a new file system type).
1076 */
1077int
1078vget(vp, flags, p)
1079	register struct vnode *vp;
1080	int flags;
1081	struct proc *p;
1082{
1083	int error;
1084
1085	/*
1086	 * If the vnode is in the process of being cleaned out for
1087	 * another use, we wait for the cleaning to finish and then
1088	 * return failure. Cleaning is determined by checking that
1089	 * the VXLOCK flag is set.
1090	 */
1091	if ((flags & LK_INTERLOCK) == 0) {
1092		simple_lock(&vp->v_interlock);
1093	}
1094	if (vp->v_flag & VXLOCK) {
1095		vp->v_flag |= VXWANT;
1096		simple_unlock(&vp->v_interlock);
1097		tsleep((caddr_t)vp, PINOD, "vget", 0);
1098		return (ENOENT);
1099	}
1100
1101	vp->v_usecount++;
1102
1103	if (VSHOULDBUSY(vp))
1104		vbusy(vp);
1105	if (flags & LK_TYPE_MASK) {
1106		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
1107			/*
1108			 * must expand vrele here because we do not want
1109			 * to call VOP_INACTIVE if the reference count
1110			 * drops back to zero since it was never really
1111			 * active. We must remove it from the free list
1112			 * before sleeping so that multiple processes do
1113			 * not try to recycle it.
1114			 */
1115			simple_lock(&vp->v_interlock);
1116			vp->v_usecount--;
1117			if (VSHOULDFREE(vp))
1118				vfree(vp);
1119			simple_unlock(&vp->v_interlock);
1120		}
1121		return (error);
1122	}
1123	simple_unlock(&vp->v_interlock);
1124	return (0);
1125}
1126
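/*
 * Hedged usage sketch (assumed caller, not from this file): filesystem
 * hash lookups rely on the ENOENT-on-VXLOCK behaviour above to retry when
 * a vnode is reclaimed out from under them, roughly:
 *
 *	loop:
 *		vp = hash_lookup(dev, ino);
 *		if (vp != NULL) {
 *			if (vget(vp, LK_EXCLUSIVE, p))
 *				goto loop;
 *			return (vp);
 *		}
 *
 * A failed vget() means the vnode was being recycled, so the caller simply
 * looks it up again.  hash_lookup() is a stand-in for the per-filesystem
 * inode hash (e.g. ufs_ihashget()); the pattern, not the name, is the point.
 */
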
1127void
1128vref(struct vnode *vp)
1129{
1130	simple_lock(&vp->v_interlock);
1131	vp->v_usecount++;
1132	simple_unlock(&vp->v_interlock);
1133}
1134
1135/*
1136 * Vnode put/release.
1137 * If count drops to zero, call inactive routine and return to freelist.
1138 */
1139void
1140vrele(vp)
1141	struct vnode *vp;
1142{
1143	struct proc *p = curproc;	/* XXX */
1144
1145#ifdef DIAGNOSTIC
1146	if (vp == NULL)
1147		panic("vrele: null vp");
1148#endif
1149	simple_lock(&vp->v_interlock);
1150
1151	if (vp->v_usecount > 1) {
1152
1153		vp->v_usecount--;
1154		simple_unlock(&vp->v_interlock);
1155
1156		return;
1157	}
1158
1159	if (vp->v_usecount == 1) {
1160
1161		vp->v_usecount--;
1162
1163		if (VSHOULDFREE(vp))
1164			vfree(vp);
1165	/*
1166	 * If we are doing a vput, the node is already locked, and we must
1167	 * call VOP_INACTIVE with the node locked.  So, in the case of
1168	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1169	 */
1170		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
1171			VOP_INACTIVE(vp, p);
1172		}
1173
1174	} else {
1175#ifdef DIAGNOSTIC
1176		vprint("vrele: negative ref count", vp);
1177		simple_unlock(&vp->v_interlock);
1178#endif
1179		panic("vrele: negative ref cnt");
1180	}
1181}
1182
1183void
1184vput(vp)
1185	struct vnode *vp;
1186{
1187	struct proc *p = curproc;	/* XXX */
1188
1189#ifdef DIAGNOSTIC
1190	if (vp == NULL)
1191		panic("vput: null vp");
1192#endif
1193
1194	simple_lock(&vp->v_interlock);
1195
1196	if (vp->v_usecount > 1) {
1197
1198		vp->v_usecount--;
1199		VOP_UNLOCK(vp, LK_INTERLOCK, p);
1200		return;
1201
1202	}
1203
1204	if (vp->v_usecount == 1) {
1205
1206		vp->v_usecount--;
1207		if (VSHOULDFREE(vp))
1208			vfree(vp);
1209	/*
1210	 * If we are doing a vput, the node is already locked, and we must
1211	 * call VOP_INACTIVE with the node locked.  So, in the case of
1212	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1213	 */
1214		simple_unlock(&vp->v_interlock);
1215		VOP_INACTIVE(vp, p);
1216
1217	} else {
1218#ifdef DIAGNOSTIC
1219		vprint("vput: negative ref count", vp);
1220#endif
1221		panic("vput: negative ref cnt");
1222	}
1223}
1224
1225/*
1226 * Somebody doesn't want the vnode recycled.
1227 */
1228void
1229vhold(vp)
1230	register struct vnode *vp;
1231{
1232	int s;
1233
1234  	s = splbio();
1235	vp->v_holdcnt++;
1236	if (VSHOULDBUSY(vp))
1237		vbusy(vp);
1238	splx(s);
1239}
1240
1241/*
1242 * One less who cares about this vnode.
1243 */
1244void
1245vdrop(vp)
1246	register struct vnode *vp;
1247{
1248	int s;
1249
1250	s = splbio();
1251	if (vp->v_holdcnt <= 0)
1252		panic("vdrop: holdcnt");
1253	vp->v_holdcnt--;
1254	if (VSHOULDFREE(vp))
1255		vfree(vp);
1256	splx(s);
1257}
1258
1259/*
1260 * Remove any vnodes in the vnode table belonging to mount point mp.
1261 *
1262 * If MNT_NOFORCE is specified, there should not be any active ones,
1263 * return error if any are found (nb: this is a user error, not a
1264 * system error). If MNT_FORCE is specified, detach any active vnodes
1265 * that are found.
1266 */
1267#ifdef DIAGNOSTIC
1268static int busyprt = 0;		/* print out busy vnodes */
1269SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
1270#endif
1271
1272int
1273vflush(mp, skipvp, flags)
1274	struct mount *mp;
1275	struct vnode *skipvp;
1276	int flags;
1277{
1278	struct proc *p = curproc;	/* XXX */
1279	struct vnode *vp, *nvp;
1280	int busy = 0;
1281
1282	simple_lock(&mntvnode_slock);
1283loop:
1284	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
1285		/*
1286		 * Make sure this vnode wasn't reclaimed in getnewvnode().
1287		 * Start over if it has (it won't be on the list anymore).
1288		 */
1289		if (vp->v_mount != mp)
1290			goto loop;
1291		nvp = vp->v_mntvnodes.le_next;
1292		/*
1293		 * Skip over a selected vnode.
1294		 */
1295		if (vp == skipvp)
1296			continue;
1297
1298		simple_lock(&vp->v_interlock);
1299		/*
1300		 * Skip over vnodes marked VSYSTEM.
1301		 */
1302		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
1303			simple_unlock(&vp->v_interlock);
1304			continue;
1305		}
1306		/*
1307		 * If WRITECLOSE is set, only flush out regular file vnodes
1308		 * open for writing.
1309		 */
1310		if ((flags & WRITECLOSE) &&
1311		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
1312			simple_unlock(&vp->v_interlock);
1313			continue;
1314		}
1315
1316		/*
1317		 * With v_usecount == 0, all we need to do is clear out the
1318		 * vnode data structures and we are done.
1319		 */
1320		if (vp->v_usecount == 0) {
1321			simple_unlock(&mntvnode_slock);
1322			vgonel(vp, p);
1323			simple_lock(&mntvnode_slock);
1324			continue;
1325		}
1326
1327		/*
1328		 * If FORCECLOSE is set, forcibly close the vnode. For block
1329		 * or character devices, revert to an anonymous device. For
1330		 * all other files, just kill them.
1331		 */
1332		if (flags & FORCECLOSE) {
1333			simple_unlock(&mntvnode_slock);
1334			if (vp->v_type != VBLK && vp->v_type != VCHR) {
1335				vgonel(vp, p);
1336			} else {
1337				vclean(vp, 0, p);
1338				vp->v_op = spec_vnodeop_p;
1339				insmntque(vp, (struct mount *) 0);
1340			}
1341			simple_lock(&mntvnode_slock);
1342			continue;
1343		}
1344#ifdef DIAGNOSTIC
1345		if (busyprt)
1346			vprint("vflush: busy vnode", vp);
1347#endif
1348		simple_unlock(&vp->v_interlock);
1349		busy++;
1350	}
1351	simple_unlock(&mntvnode_slock);
1352	if (busy)
1353		return (EBUSY);
1354	return (0);
1355}
1356
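/*
 * Hedged usage sketch (assumed caller, not from this file): vflush() is
 * normally driven from a filesystem's unmount routine, which skips nothing
 * (or only its own special vnodes) and escalates to FORCECLOSE for forced
 * unmounts:
 *
 *	flags = SKIPSYSTEM;
 *	if (mntflags & MNT_FORCE)
 *		flags |= FORCECLOSE;
 *	if ((error = vflush(mp, NULLVP, flags)) != 0)
 *		return (error);
 *
 * The exact flag mix varies per filesystem; WRITECLOSE is the variant used
 * when downgrading a read-write mount to read-only rather than unmounting.
 */
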
1357/*
1358 * Disassociate the underlying file system from a vnode.
1359 */
1360static void
1361vclean(vp, flags, p)
1362	struct vnode *vp;
1363	int flags;
1364	struct proc *p;
1365{
1366	int active;
1367	vm_object_t obj;
1368
1369	/*
1370	 * Check to see if the vnode is in use. If so we have to reference it
1371	 * before we clean it out so that its count cannot fall to zero and
1372	 * generate a race against ourselves to recycle it.
1373	 */
1374	if ((active = vp->v_usecount))
1375		vp->v_usecount++;
1376
1377	/*
1378	 * Prevent the vnode from being recycled or brought into use while we
1379	 * clean it out.
1380	 */
1381	if (vp->v_flag & VXLOCK)
1382		panic("vclean: deadlock");
1383	vp->v_flag |= VXLOCK;
1384	/*
1385	 * Even if the count is zero, the VOP_INACTIVE routine may still
1386	 * have the object locked while it cleans it out. The VOP_LOCK
1387	 * ensures that the VOP_INACTIVE routine is done with its work.
1388	 * For active vnodes, it ensures that no other activity can
1389	 * occur while the underlying object is being cleaned out.
1390	 */
1391	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
1392
1393	/*
1394	 * Clean out any buffers associated with the vnode.
1395	 */
1396	vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
1397	if ((obj = vp->v_object) != NULL) {
1398		if (obj->ref_count == 0) {
1399			/*
1400			 * This is a normal way of shutting down the object/vnode
1401			 * association.
1402			 */
1403			vm_object_terminate(obj);
1404		} else {
1405			/*
1406			 * Woe to the process that tries to page now :-).
1407			 */
1408			vm_pager_deallocate(obj);
1409		}
1410	}
1411
1412	/*
1413	 * If purging an active vnode, it must be closed and
1414	 * deactivated before being reclaimed. Note that the
1415	 * VOP_INACTIVE will unlock the vnode.
1416	 */
1417	if (active) {
1418		if (flags & DOCLOSE)
1419			VOP_CLOSE(vp, IO_NDELAY, NOCRED, p);
1420		VOP_INACTIVE(vp, p);
1421	} else {
1422		/*
1423		 * Any other processes trying to obtain this lock must first
1424		 * wait for VXLOCK to clear, then call the new lock operation.
1425		 */
1426		VOP_UNLOCK(vp, 0, p);
1427	}
1428	/*
1429	 * Reclaim the vnode.
1430	 */
1431	if (VOP_RECLAIM(vp, p))
1432		panic("vclean: cannot reclaim");
1433
1434	if (active)
1435		vrele(vp);
1436
1437	cache_purge(vp);
1438	if (vp->v_vnlock) {
1439#if 0 /* This is the only place we have LK_DRAINED in the entire kernel ??? */
1440#ifdef DIAGNOSTIC
1441		if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
1442			vprint("vclean: lock not drained", vp);
1443#endif
1444#endif
1445		FREE(vp->v_vnlock, M_VNODE);
1446		vp->v_vnlock = NULL;
1447	}
1448
1449	if (VSHOULDFREE(vp))
1450		vfree(vp);
1451
1452	/*
1453	 * Done with purge, notify sleepers of the grim news.
1454	 */
1455	vp->v_op = dead_vnodeop_p;
1456	vn_pollgone(vp);
1457	vp->v_tag = VT_NON;
1458	vp->v_flag &= ~VXLOCK;
1459	if (vp->v_flag & VXWANT) {
1460		vp->v_flag &= ~VXWANT;
1461		wakeup((caddr_t) vp);
1462	}
1463}
1464
1465/*
1466 * Eliminate all activity associated with the requested vnode
1467 * and with all vnodes aliased to the requested vnode.
1468 */
1469int
1470vop_revoke(ap)
1471	struct vop_revoke_args /* {
1472		struct vnode *a_vp;
1473		int a_flags;
1474	} */ *ap;
1475{
1476	struct vnode *vp, *vq;
1477	struct proc *p = curproc;	/* XXX */
1478
1479#ifdef DIAGNOSTIC
1480	if ((ap->a_flags & REVOKEALL) == 0)
1481		panic("vop_revoke");
1482#endif
1483
1484	vp = ap->a_vp;
1485	simple_lock(&vp->v_interlock);
1486
1487	if (vp->v_flag & VALIASED) {
1488		/*
1489		 * If a vgone (or vclean) is already in progress,
1490		 * wait until it is done and return.
1491		 */
1492		if (vp->v_flag & VXLOCK) {
1493			vp->v_flag |= VXWANT;
1494			simple_unlock(&vp->v_interlock);
1495			tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
1496			return (0);
1497		}
1498		/*
1499		 * Ensure that vp will not be vgone'd while we
1500		 * are eliminating its aliases.
1501		 */
1502		vp->v_flag |= VXLOCK;
1503		simple_unlock(&vp->v_interlock);
1504		while (vp->v_flag & VALIASED) {
1505			simple_lock(&spechash_slock);
1506			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1507				if (vq->v_rdev != vp->v_rdev ||
1508				    vq->v_type != vp->v_type || vp == vq)
1509					continue;
1510				simple_unlock(&spechash_slock);
1511				vgone(vq);
1512				break;
1513			}
1514			if (vq == NULLVP) {
1515				simple_unlock(&spechash_slock);
1516			}
1517		}
1518		/*
1519		 * Remove the lock so that vgone below will
1520		 * really eliminate the vnode after which time
1521		 * vgone will awaken any sleepers.
1522		 */
1523		simple_lock(&vp->v_interlock);
1524		vp->v_flag &= ~VXLOCK;
1525		if (vp->v_flag & VXWANT) {
1526			vp->v_flag &= ~VXWANT;
1527			wakeup(vp);
1528		}
1529	}
1530	vgonel(vp, p);
1531	return (0);
1532}
1533
1534/*
1535 * Recycle an unused vnode to the front of the free list.
1536 * Release the passed interlock if the vnode will be recycled.
1537 */
1538int
1539vrecycle(vp, inter_lkp, p)
1540	struct vnode *vp;
1541	struct simplelock *inter_lkp;
1542	struct proc *p;
1543{
1544
1545	simple_lock(&vp->v_interlock);
1546	if (vp->v_usecount == 0) {
1547		if (inter_lkp) {
1548			simple_unlock(inter_lkp);
1549		}
1550		vgonel(vp, p);
1551		return (1);
1552	}
1553	simple_unlock(&vp->v_interlock);
1554	return (0);
1555}
1556
1557/*
1558 * Eliminate all activity associated with a vnode
1559 * in preparation for reuse.
1560 */
1561void
1562vgone(vp)
1563	register struct vnode *vp;
1564{
1565	struct proc *p = curproc;	/* XXX */
1566
1567	simple_lock(&vp->v_interlock);
1568	vgonel(vp, p);
1569}
1570
1571/*
1572 * vgone, with the vp interlock held.
1573 */
1574static void
1575vgonel(vp, p)
1576	struct vnode *vp;
1577	struct proc *p;
1578{
1579	int s;
1580	struct vnode *vq;
1581	struct vnode *vx;
1582
1583	/*
1584	 * If a vgone (or vclean) is already in progress,
1585	 * wait until it is done and return.
1586	 */
1587	if (vp->v_flag & VXLOCK) {
1588		vp->v_flag |= VXWANT;
1589		simple_unlock(&vp->v_interlock);
1590		tsleep((caddr_t)vp, PINOD, "vgone", 0);
1591		return;
1592	}
1593
1594	/*
1595	 * Clean out the filesystem specific data.
1596	 */
1597	vclean(vp, DOCLOSE, p);
1598	simple_lock(&vp->v_interlock);
1599
1600	/*
1601	 * Delete from old mount point vnode list, if on one.
1602	 */
1603	if (vp->v_mount != NULL)
1604		insmntque(vp, (struct mount *)0);
1605	/*
1606	 * If special device, remove it from special device alias list
1607	 * if it is on one.
1608	 */
1609	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
1610		simple_lock(&spechash_slock);
1611		if (*vp->v_hashchain == vp) {
1612			*vp->v_hashchain = vp->v_specnext;
1613		} else {
1614			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1615				if (vq->v_specnext != vp)
1616					continue;
1617				vq->v_specnext = vp->v_specnext;
1618				break;
1619			}
1620			if (vq == NULL)
1621				panic("missing bdev");
1622		}
1623		if (vp->v_flag & VALIASED) {
1624			vx = NULL;
1625			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1626				if (vq->v_rdev != vp->v_rdev ||
1627				    vq->v_type != vp->v_type)
1628					continue;
1629				if (vx)
1630					break;
1631				vx = vq;
1632			}
1633			if (vx == NULL)
1634				panic("missing alias");
1635			if (vq == NULL)
1636				vx->v_flag &= ~VALIASED;
1637			vp->v_flag &= ~VALIASED;
1638		}
1639		simple_unlock(&spechash_slock);
1640		FREE(vp->v_specinfo, M_VNODE);
1641		vp->v_specinfo = NULL;
1642	}
1643
1644	/*
1645	 * If it is on the freelist and not already at the head,
1646	 * move it to the head of the list. The test of the back
1647	 * pointer and the reference count of zero is because
1648	 * it will be removed from the free list by getnewvnode,
1649	 * but will not have its reference count incremented until
1650	 * after calling vgone. If the reference count were
1651	 * incremented first, vgone would (incorrectly) try to
1652	 * close the previous instance of the underlying object.
1653	 */
1654	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
1655		s = splbio();
1656		simple_lock(&vnode_free_list_slock);
1657		if (vp->v_flag & VFREE) {
1658			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1659		} else if (vp->v_flag & VTBFREE) {
1660			TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
1661			vp->v_flag &= ~VTBFREE;
1662			freevnodes++;
1663		} else
1664			freevnodes++;
1665		vp->v_flag |= VFREE;
1666		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1667		simple_unlock(&vnode_free_list_slock);
1668		splx(s);
1669	}
1670
1671	vp->v_type = VBAD;
1672	simple_unlock(&vp->v_interlock);
1673}
1674
1675/*
1676 * Lookup a vnode by device number.
1677 */
1678int
1679vfinddev(dev, type, vpp)
1680	dev_t dev;
1681	enum vtype type;
1682	struct vnode **vpp;
1683{
1684	register struct vnode *vp;
1685	int rc = 0;
1686
1687	simple_lock(&spechash_slock);
1688	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
1689		if (dev != vp->v_rdev || type != vp->v_type)
1690			continue;
1691		*vpp = vp;
1692		rc = 1;
1693		break;
1694	}
1695	simple_unlock(&spechash_slock);
1696	return (rc);
1697}
1698
1699/*
1700 * Calculate the total number of references to a special device.
1701 */
1702int
1703vcount(vp)
1704	register struct vnode *vp;
1705{
1706	struct vnode *vq, *vnext;
1707	int count;
1708
1709loop:
1710	if ((vp->v_flag & VALIASED) == 0)
1711		return (vp->v_usecount);
1712	simple_lock(&spechash_slock);
1713	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
1714		vnext = vq->v_specnext;
1715		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
1716			continue;
1717		/*
1718		 * Alias, but not in use, so flush it out.
1719		 */
1720		if (vq->v_usecount == 0 && vq != vp) {
1721			simple_unlock(&spechash_slock);
1722			vgone(vq);
1723			goto loop;
1724		}
1725		count += vq->v_usecount;
1726	}
1727	simple_unlock(&spechash_slock);
1728	return (count);
1729}
1730/*
1731 * Print out a description of a vnode.
1732 */
1733static char *typename[] =
1734{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
1735
1736void
1737vprint(label, vp)
1738	char *label;
1739	register struct vnode *vp;
1740{
1741	char buf[96];
1742
1743	if (label != NULL)
1744		printf("%s: %p: ", label, (void *)vp);
1745	else
1746		printf("%p: ", (void *)vp);
1747	printf("type %s, usecount %d, writecount %d, refcount %ld,",
1748	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
1749	    vp->v_holdcnt);
1750	buf[0] = '\0';
1751	if (vp->v_flag & VROOT)
1752		strcat(buf, "|VROOT");
1753	if (vp->v_flag & VTEXT)
1754		strcat(buf, "|VTEXT");
1755	if (vp->v_flag & VSYSTEM)
1756		strcat(buf, "|VSYSTEM");
1757	if (vp->v_flag & VXLOCK)
1758		strcat(buf, "|VXLOCK");
1759	if (vp->v_flag & VXWANT)
1760		strcat(buf, "|VXWANT");
1761	if (vp->v_flag & VBWAIT)
1762		strcat(buf, "|VBWAIT");
1763	if (vp->v_flag & VALIASED)
1764		strcat(buf, "|VALIASED");
1765	if (vp->v_flag & VDOOMED)
1766		strcat(buf, "|VDOOMED");
1767	if (vp->v_flag & VFREE)
1768		strcat(buf, "|VFREE");
1769	if (vp->v_flag & VOBJBUF)
1770		strcat(buf, "|VOBJBUF");
1771	if (buf[0] != '\0')
1772		printf(" flags (%s)", &buf[1]);
1773	if (vp->v_data == NULL) {
1774		printf("\n");
1775	} else {
1776		printf("\n\t");
1777		VOP_PRINT(vp);
1778	}
1779}
1780
1781#ifdef DDB
1782/*
1783 * List all of the locked vnodes in the system.
1784 * Called when debugging the kernel.
1785 */
1786static void
1787printlockedvnodes()
1788{
1789	struct proc *p = curproc;	/* XXX */
1790	struct mount *mp, *nmp;
1791	struct vnode *vp;
1792
1793	printf("Locked vnodes\n");
1794	simple_lock(&mountlist_slock);
1795	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
1796		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
1797			nmp = mp->mnt_list.cqe_next;
1798			continue;
1799		}
1800		for (vp = mp->mnt_vnodelist.lh_first;
1801		     vp != NULL;
1802		     vp = vp->v_mntvnodes.le_next) {
1803			if (VOP_ISLOCKED(vp))
1804				vprint((char *)0, vp);
1805		}
1806		simple_lock(&mountlist_slock);
1807		nmp = mp->mnt_list.cqe_next;
1808		vfs_unbusy(mp, p);
1809	}
1810	simple_unlock(&mountlist_slock);
1811}
1812#endif
1813
1814/*
1815 * Top level filesystem related information gathering.
1816 */
1817static int	sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS);
1818
1819static int
1820vfs_sysctl SYSCTL_HANDLER_ARGS
1821{
1822	int *name = (int *)arg1 - 1;	/* XXX */
1823	u_int namelen = arg2 + 1;	/* XXX */
1824	struct vfsconf *vfsp;
1825
1826#ifndef NO_COMPAT_PRELITE2
1827	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
1828	if (namelen == 1)
1829		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
1830#endif
1831
1832#ifdef notyet
1833	/* all sysctl names at this level are at least name and field */
1834	if (namelen < 2)
1835		return (ENOTDIR);		/* overloaded */
1836	if (name[0] != VFS_GENERIC) {
1837		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
1838			if (vfsp->vfc_typenum == name[0])
1839				break;
1840		if (vfsp == NULL)
1841			return (EOPNOTSUPP);
1842		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
1843		    oldp, oldlenp, newp, newlen, p));
1844	}
1845#endif
1846	switch (name[1]) {
1847	case VFS_MAXTYPENUM:
1848		if (namelen != 2)
1849			return (ENOTDIR);
1850		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
1851	case VFS_CONF:
1852		if (namelen != 3)
1853			return (ENOTDIR);	/* overloaded */
1854		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
1855			if (vfsp->vfc_typenum == name[2])
1856				break;
1857		if (vfsp == NULL)
1858			return (EOPNOTSUPP);
1859		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
1860	}
1861	return (EOPNOTSUPP);
1862}
1863
1864SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
1865	"Generic filesystem");
1866
1867#ifndef NO_COMPAT_PRELITE2
1868
1869static int
1870sysctl_ovfs_conf SYSCTL_HANDLER_ARGS
1871{
1872	int error;
1873	struct vfsconf *vfsp;
1874	struct ovfsconf ovfs;
1875
1876	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
1877		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
1878		strcpy(ovfs.vfc_name, vfsp->vfc_name);
1879		ovfs.vfc_index = vfsp->vfc_typenum;
1880		ovfs.vfc_refcount = vfsp->vfc_refcount;
1881		ovfs.vfc_flags = vfsp->vfc_flags;
1882		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
1883		if (error)
1884			return error;
1885	}
1886	return 0;
1887}
1888
1889#endif /* !NO_COMPAT_PRELITE2 */
1890
1891static volatile int kinfo_vdebug = 1;
1892
1893#if 0
1894#define KINFO_VNODESLOP	10
1895/*
1896 * Dump vnode list (via sysctl).
1897 * Copyout address of vnode followed by vnode.
1898 */
1899/* ARGSUSED */
1900static int
1901sysctl_vnode SYSCTL_HANDLER_ARGS
1902{
1903	struct proc *p = curproc;	/* XXX */
1904	struct mount *mp, *nmp;
1905	struct vnode *nvp, *vp;
1906	int error;
1907
1908#define VPTRSZ	sizeof (struct vnode *)
1909#define VNODESZ	sizeof (struct vnode)
1910
1911	req->lock = 0;
1912	if (!req->oldptr) /* Make an estimate */
1913		return (SYSCTL_OUT(req, 0,
1914			(numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
1915
1916	simple_lock(&mountlist_slock);
1917	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
1918		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
1919			nmp = mp->mnt_list.cqe_next;
1920			continue;
1921		}
1922again:
1923		simple_lock(&mntvnode_slock);
1924		for (vp = mp->mnt_vnodelist.lh_first;
1925		     vp != NULL;
1926		     vp = nvp) {
1927			/*
1928			 * Check that the vp is still associated with
1929			 * this filesystem.  RACE: could have been
1930			 * recycled onto the same filesystem.
1931			 */
1932			if (vp->v_mount != mp) {
1933				simple_unlock(&mntvnode_slock);
1934				if (kinfo_vdebug)
1935					printf("kinfo: vp changed\n");
1936				goto again;
1937			}
1938			nvp = vp->v_mntvnodes.le_next;
1939			simple_unlock(&mntvnode_slock);
1940			if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
1941			    (error = SYSCTL_OUT(req, vp, VNODESZ)))
1942				return (error);
1943			simple_lock(&mntvnode_slock);
1944		}
1945		simple_unlock(&mntvnode_slock);
1946		simple_lock(&mountlist_slock);
1947		nmp = mp->mnt_list.cqe_next;
1948		vfs_unbusy(mp, p);
1949	}
1950	simple_unlock(&mountlist_slock);
1951
1952	return (0);
1953}
1954#endif
1955
1956/*
1957 * XXX
1958 * Exporting the vnode list on large systems causes them to crash.
1959 * Exporting the vnode list on medium systems causes sysctl to coredump.
1960 */
1961#if 0
1962SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
1963	0, 0, sysctl_vnode, "S,vnode", "");
1964#endif
1965
1966/*
1967 * Check to see if a filesystem is mounted on a block device.
1968 */
1969int
1970vfs_mountedon(vp)
1971	struct vnode *vp;
1972{
1973	struct vnode *vq;
1974	int error = 0;
1975
1976	if (vp->v_specmountpoint != NULL)
1977		return (EBUSY);
1978	if (vp->v_flag & VALIASED) {
1979		simple_lock(&spechash_slock);
1980		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
1981			if (vq->v_rdev != vp->v_rdev ||
1982			    vq->v_type != vp->v_type)
1983				continue;
1984			if (vq->v_specmountpoint != NULL) {
1985				error = EBUSY;
1986				break;
1987			}
1988		}
1989		simple_unlock(&spechash_slock);
1990	}
1991	return (error);
1992}
1993
1994/*
1995 * Unmount all filesystems. The list is traversed in reverse order
1996 * of mounting to avoid dependencies.
1997 */
1998void
1999vfs_unmountall()
2000{
2001	struct mount *mp, *nmp;
2002	struct proc *p = initproc;	/* XXX XXX should this be proc0? */
2003	int error;
2004
2005	/*
2006	 * Since this only runs when rebooting, it is not interlocked.
2007	 */
2008	for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
2009		nmp = mp->mnt_list.cqe_prev;
2010		error = dounmount(mp, MNT_FORCE, p);
2011		if (error) {
2012			printf("unmount of %s failed (",
2013			    mp->mnt_stat.f_mntonname);
2014			if (error == EBUSY)
2015				printf("BUSY)\n");
2016			else
2017				printf("%d)\n", error);
2018		}
2019	}
2020}
2021
2022/*
2023 * Build hash lists of net addresses and hang them off the mount point.
2024 * Called by ufs_mount() to set up the lists of export addresses.
2025 */
2026static int
2027vfs_hang_addrlist(mp, nep, argp)
2028	struct mount *mp;
2029	struct netexport *nep;
2030	struct export_args *argp;
2031{
2032	register struct netcred *np;
2033	register struct radix_node_head *rnh;
2034	register int i;
2035	struct radix_node *rn;
2036	struct sockaddr *saddr, *smask = 0;
2037	struct domain *dom;
2038	int error;
2039
2040	if (argp->ex_addrlen == 0) {
2041		if (mp->mnt_flag & MNT_DEFEXPORTED)
2042			return (EPERM);
2043		np = &nep->ne_defexported;
2044		np->netc_exflags = argp->ex_flags;
2045		np->netc_anon = argp->ex_anon;
2046		np->netc_anon.cr_ref = 1;
2047		mp->mnt_flag |= MNT_DEFEXPORTED;
2048		return (0);
2049	}
2050	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
2051	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
2052	bzero((caddr_t) np, i);
2053	saddr = (struct sockaddr *) (np + 1);
2054	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
2055		goto out;
2056	if (saddr->sa_len > argp->ex_addrlen)
2057		saddr->sa_len = argp->ex_addrlen;
2058	if (argp->ex_masklen) {
2059		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
2060		error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
2061		if (error)
2062			goto out;
2063		if (smask->sa_len > argp->ex_masklen)
2064			smask->sa_len = argp->ex_masklen;
2065	}
2066	i = saddr->sa_family;
2067	if ((rnh = nep->ne_rtable[i]) == 0) {
2068		/*
2069		 * Seems silly to initialize every AF when most are not used,
2070		 * do so on demand here
2071		 */
2072		for (dom = domains; dom; dom = dom->dom_next)
2073			if (dom->dom_family == i && dom->dom_rtattach) {
2074				dom->dom_rtattach((void **) &nep->ne_rtable[i],
2075				    dom->dom_rtoffset);
2076				break;
2077			}
2078		if ((rnh = nep->ne_rtable[i]) == 0) {
2079			error = ENOBUFS;
2080			goto out;
2081		}
2082	}
2083	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
2084	    np->netc_rnodes);
2085	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
2086		error = EPERM;
2087		goto out;
2088	}
2089	np->netc_exflags = argp->ex_flags;
2090	np->netc_anon = argp->ex_anon;
2091	np->netc_anon.cr_ref = 1;
2092	return (0);
2093out:
2094	free(np, M_NETADDR);
2095	return (error);
2096}
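/*
 * Note on the allocation above (sketch): the netcred, the export
 * address, and the optional mask are carved out of one malloc()ed
 * block, laid out as
 *
 *	+----------------+------------------+------------------+
 *	| struct netcred | sockaddr (addr)  | sockaddr (mask)  |
 *	+----------------+------------------+------------------+
 *	np               np + 1             (caddr_t)(np + 1)
 *	                                       + ex_addrlen
 *
 * which is why a single free(np, M_NETADDR) suffices on the error path.
 */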
2097
2098/* ARGSUSED */
2099static int
2100vfs_free_netcred(rn, w)
2101	struct radix_node *rn;
2102	void *w;
2103{
2104	register struct radix_node_head *rnh = (struct radix_node_head *) w;
2105
2106	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
2107	free((caddr_t) rn, M_NETADDR);
2108	return (0);
2109}
2110
2111/*
2112 * Free the net address hash lists that are hanging off the mount points.
2113 */
2114static void
2115vfs_free_addrlist(nep)
2116	struct netexport *nep;
2117{
2118	register int i;
2119	register struct radix_node_head *rnh;
2120
2121	for (i = 0; i <= AF_MAX; i++)
2122		if ((rnh = nep->ne_rtable[i])) {
2123			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
2124			    (caddr_t) rnh);
2125			free((caddr_t) rnh, M_RTABLE);
2126			nep->ne_rtable[i] = 0;
2127		}
2128}
2129
2130int
2131vfs_export(mp, nep, argp)
2132	struct mount *mp;
2133	struct netexport *nep;
2134	struct export_args *argp;
2135{
2136	int error;
2137
2138	if (argp->ex_flags & MNT_DELEXPORT) {
2139		if (mp->mnt_flag & MNT_EXPUBLIC) {
2140			vfs_setpublicfs(NULL, NULL, NULL);
2141			mp->mnt_flag &= ~MNT_EXPUBLIC;
2142		}
2143		vfs_free_addrlist(nep);
2144		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
2145	}
2146	if (argp->ex_flags & MNT_EXPORTED) {
2147		if (argp->ex_flags & MNT_EXPUBLIC) {
2148			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
2149				return (error);
2150			mp->mnt_flag |= MNT_EXPUBLIC;
2151		}
2152		if ((error = vfs_hang_addrlist(mp, nep, argp)))
2153			return (error);
2154		mp->mnt_flag |= MNT_EXPORTED;
2155	}
2156	return (0);
2157}
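/*
 * Example (sketch only): export requests originate in userland (e.g.
 * mountd(8)) and reach vfs_export() through a filesystem's mount
 * routine.  To export a filesystem read-only to one IPv4 host, the
 * caller would fill in roughly the following; note that ex_addr and
 * ex_mask remain user-space pointers, because vfs_hang_addrlist()
 * copyin()s them.  Variable names here are made up.
 */
#if 0
	struct export_args ea;
	struct sockaddr_in sin;		/* lives in the caller's space */

	bzero(&ea, sizeof(ea));
	bzero(&sin, sizeof(sin));
	sin.sin_len = sizeof(sin);
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = inet_addr("192.168.1.2");
	ea.ex_flags = MNT_EXPORTED | MNT_EXRDONLY;
	ea.ex_addr = (struct sockaddr *)&sin;
	ea.ex_addrlen = sizeof(sin);
	ea.ex_masklen = 0;		/* single host, no mask */
#endif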
2158
2159
2160/*
2161 * Set the publicly exported filesystem (WebNFS). Currently, only
2162 * one public filesystem is allowed by the spec (RFC 2054 and RFC 2055).
2163 */
2164int
2165vfs_setpublicfs(mp, nep, argp)
2166	struct mount *mp;
2167	struct netexport *nep;
2168	struct export_args *argp;
2169{
2170	int error;
2171	struct vnode *rvp;
2172	char *cp;
2173
2174	/*
2175	 * mp == NULL -> invalidate the current info, the FS is
2176	 * no longer exported. May be called from either vfs_export
2177	 * or unmount, so check if it hasn't already been done.
2178	 */
2179	if (mp == NULL) {
2180		if (nfs_pub.np_valid) {
2181			nfs_pub.np_valid = 0;
2182			if (nfs_pub.np_index != NULL) {
2183				FREE(nfs_pub.np_index, M_TEMP);
2184				nfs_pub.np_index = NULL;
2185			}
2186		}
2187		return (0);
2188	}
2189
2190	/*
2191	 * Only one allowed at a time.
2192	 */
2193	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
2194		return (EBUSY);
2195
2196	/*
2197	 * Get real filehandle for root of exported FS.
2198	 */
2199	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
2200	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
2201
2202	if ((error = VFS_ROOT(mp, &rvp)))
2203		return (error);
2204
2205	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
2206		return (error);
2207
2208	vput(rvp);
2209
2210	/*
2211	 * If an indexfile was specified, pull it in.
2212	 */
2213	if (argp->ex_indexfile != NULL) {
2214		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
2215		    M_WAITOK);
2216		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
2217		    MAXNAMLEN, (size_t *)0);
2218		if (!error) {
2219			/*
2220			 * Check for illegal filenames.
2221			 */
2222			for (cp = nfs_pub.np_index; *cp; cp++) {
2223				if (*cp == '/') {
2224					error = EINVAL;
2225					break;
2226				}
2227			}
2228		}
2229		if (error) {
2230			FREE(nfs_pub.np_index, M_TEMP);
2231			return (error);
2232		}
2233	}
2234
2235	nfs_pub.np_mount = mp;
2236	nfs_pub.np_valid = 1;
2237	return (0);
2238}
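/*
 * Example (sketch only): to publish a filesystem for WebNFS, the export
 * request simply carries MNT_EXPUBLIC and, optionally, an index file
 * name, e.g. (userland view, values made up):
 *
 *	ea.ex_flags |= MNT_EXPUBLIC;
 *	ea.ex_indexfile = "index.html";
 *
 * vfs_export() above then routes the request through vfs_setpublicfs().
 */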
2239
2240struct netcred *
2241vfs_export_lookup(mp, nep, nam)
2242	register struct mount *mp;
2243	struct netexport *nep;
2244	struct sockaddr *nam;
2245{
2246	register struct netcred *np;
2247	register struct radix_node_head *rnh;
2248	struct sockaddr *saddr;
2249
2250	np = NULL;
2251	if (mp->mnt_flag & MNT_EXPORTED) {
2252		/*
2253		 * Lookup in the export list first.
2254		 */
2255		if (nam != NULL) {
2256			saddr = nam;
2257			rnh = nep->ne_rtable[saddr->sa_family];
2258			if (rnh != NULL) {
2259				np = (struct netcred *)
2260					(*rnh->rnh_matchaddr)((caddr_t)saddr,
2261							      rnh);
2262				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
2263					np = NULL;
2264			}
2265		}
2266		/*
2267		 * If no address match, use the default if it exists.
2268		 */
2269		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
2270			np = &nep->ne_defexported;
2271	}
2272	return (np);
2273}
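/*
 * Example (sketch only): an NFS file-handle-to-vnode routine typically
 * verifies the client before handing back a vnode, along the lines of
 * the fragment below.  "nep" and "nam" follow the conventions of the
 * *_fhtovp() routines; the surrounding function is hypothetical.
 */
#if 0
	struct netcred *np;

	np = vfs_export_lookup(mp, nep, nam);
	if (np == NULL)
		return (EACCES);	/* client is not in the export list */
	/* np->netc_exflags and np->netc_anon now describe the client. */
#endif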
2274
2275/*
2276 * Perform msync on all vnodes under a mount point.
2277 * The mount point must be locked.
2278 */
2279void
2280vfs_msync(struct mount *mp, int flags) {
2281	struct vnode *vp, *nvp;
2282	int anyio, tries;
2283
2284	tries = 5;
2285loop:
2286	anyio = 0;
2287	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {
2288
2289		nvp = vp->v_mntvnodes.le_next;
2290
2291		if (vp->v_mount != mp) {
2292			goto loop;
2293		}
2294
2295		if ((vp->v_flag & VXLOCK) ||
2296			(VOP_ISLOCKED(vp) && (flags != MNT_WAIT))) {
2297			continue;
2298		}
2299
2300		simple_lock(&vp->v_interlock);
2301		if (vp->v_object &&
2302		   (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
2303			if (!vget(vp,
2304				LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
2305				if (vp->v_object) {
2306					vm_object_page_clean(vp->v_object, 0, 0, TRUE);
2307					anyio = 1;
2308				}
2309				vput(vp);
2310			}
2311		} else {
2312			simple_unlock(&vp->v_interlock);
2313		}
2314	}
2315	if (anyio && (--tries > 0))
2316		goto loop;
2317}
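/*
 * Example (sketch only): periodic syncing typically invokes this as
 *
 *	vfs_msync(mp, MNT_NOWAIT);
 *
 * for each mounted filesystem.  With MNT_NOWAIT the loop above skips
 * vnodes that are currently locked, while MNT_WAIT makes it consider
 * them as well.
 */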
2318
2319/*
2320 * Create the VM object needed for VMIO and mmap support.  This
2321 * is done for all VREG files in the system.  Some filesystems may
2322 * want to take advantage of the additional metadata buffering that
2323 * the VMIO code provides by making the device node VMIO as well.
2324 *
2325 * If !waslocked, this must be called with the vnode interlock held.
2326 */
2327int
2328vfs_object_create(vp, p, cred, waslocked)
2329	struct vnode *vp;
2330	struct proc *p;
2331	struct ucred *cred;
2332	int waslocked;
2333{
2334	struct vattr vat;
2335	vm_object_t object;
2336	int error = 0;
2337
2338	if ((vp->v_type != VREG) && (vp->v_type != VBLK)) {
2339		if (!waslocked)
2340			simple_unlock(&vp->v_interlock);
2341		return 0;
2342	}
2343
2344	if (!waslocked)
2345		vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY, p);
2346
2347retry:
2348	if ((object = vp->v_object) == NULL) {
2349		if (vp->v_type == VREG) {
2350			if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
2351				goto retn;
2352			object = vnode_pager_alloc(vp,
2353				OFF_TO_IDX(round_page(vat.va_size)), 0, 0);
2354		} else if (major(vp->v_rdev) < nblkdev) {
2355			/*
2356			 * This simply allocates the biggest object possible
2357			 * for a VBLK vnode.  This should be fixed, but doesn't
2358			 * cause any problems (yet).
2359			 */
2360			object = vnode_pager_alloc(vp, INT_MAX, 0, 0);
2361		}
2362		object->ref_count--;
2363		vp->v_usecount--;
2364	} else {
2365		if (object->flags & OBJ_DEAD) {
2366			VOP_UNLOCK(vp, 0, p);
2367			tsleep(object, PVM, "vodead", 0);
2368			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
2369			goto retry;
2370		}
2371	}
2372
2373	if (vp->v_object) {
2374		vp->v_flag |= VOBJBUF;
2375	}
2376
2377retn:
2378	if (!waslocked) {
2379		simple_lock(&vp->v_interlock);
2380		VOP_UNLOCK(vp, LK_INTERLOCK, p);
2381	}
2382
2383	return error;
2384}
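/*
 * Example (sketch only): open paths typically ensure a VM object exists
 * once the vnode has been locked, roughly as below.  The surrounding
 * code and the use of p->p_ucred are hypothetical.
 */
#if 0
	/* vp is already locked here, so pass waslocked == 1. */
	if (vp->v_type == VREG && vp->v_object == NULL)
		error = vfs_object_create(vp, p, p->p_ucred, 1);
#endif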
2385
2386static void
2387vfree(vp)
2388	struct vnode *vp;
2389{
2390	int s;
2391
2392	s = splbio();
2393	simple_lock(&vnode_free_list_slock);
2394	if (vp->v_flag & VTBFREE) {
2395		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
2396		vp->v_flag &= ~VTBFREE;
2397	}
2398	if (vp->v_flag & VAGE) {
2399		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2400	} else {
2401		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
2402	}
2403	freevnodes++;
2404	simple_unlock(&vnode_free_list_slock);
2405	vp->v_flag &= ~VAGE;
2406	vp->v_flag |= VFREE;
2407	splx(s);
2408}
2409
2410void
2411vbusy(vp)
2412	struct vnode *vp;
2413{
2414	int s;
2415
2416	s = splbio();
2417	simple_lock(&vnode_free_list_slock);
2418	if (vp->v_flag & VTBFREE) {
2419		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
2420		vp->v_flag &= ~VTBFREE;
2421	} else {
2422		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2423		freevnodes--;
2424	}
2425	simple_unlock(&vnode_free_list_slock);
2426	vp->v_flag &= ~(VFREE|VAGE);
2427	splx(s);
2428}
2429
2430/*
2431 * Record a process's interest in events which might happen to
2432 * a vnode.  Because poll uses the historic select-style interface
2433 * internally, this routine serves as both the ``check for any
2434 * pending events'' and the ``record my interest in future events''
2435 * functions.  (These are done together, while the lock is held,
2436 * to avoid race conditions.)
2437 */
2438int
2439vn_pollrecord(vp, p, events)
2440	struct vnode *vp;
2441	struct proc *p;
2442	short events;
2443{
2444	simple_lock(&vp->v_pollinfo.vpi_lock);
2445	if (vp->v_pollinfo.vpi_revents & events) {
2446		/*
2447		 * This leaves events we are not interested
2448		 * in available for the other process which
2449		 * presumably had requested them
2450		 * (otherwise they would never have been
2451		 * recorded).
2452		 */
2453		events &= vp->v_pollinfo.vpi_revents;
2454		vp->v_pollinfo.vpi_revents &= ~events;
2455
2456		simple_unlock(&vp->v_pollinfo.vpi_lock);
2457		return events;
2458	}
2459	vp->v_pollinfo.vpi_events |= events;
2460	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
2461	simple_unlock(&vp->v_pollinfo.vpi_lock);
2462	return 0;
2463}
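/*
 * Example (sketch only): a filesystem's VOP_POLL implementation that
 * supports the full record/check protocol can usually just delegate to
 * vn_pollrecord().  The function foo_poll is hypothetical.
 */
#if 0
static int
foo_poll(ap)
	struct vop_poll_args /* {
		struct vnode *a_vp;
		int a_events;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{
	return (vn_pollrecord(ap->a_vp, ap->a_p, ap->a_events));
}
#endif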
2464
2465/*
2466 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
2467 * it is possible for us to miss an event due to race conditions, but
2468 * that condition is expected to be rare, so for the moment it is the
2469 * preferred interface.
2470 */
2471void
2472vn_pollevent(vp, events)
2473	struct vnode *vp;
2474	short events;
2475{
2476	simple_lock(&vp->v_pollinfo.vpi_lock);
2477	if (vp->v_pollinfo.vpi_events & events) {
2478		/*
2479		 * We clear vpi_events so that we don't
2480		 * call selwakeup() twice if two events are
2481		 * posted before the polling process(es) is
2482		 * awakened.  This also ensures that we take at
2483		 * most one selwakeup() if the polling process
2484		 * is no longer interested.  However, it does
2485		 * mean that only one event can be noticed at
2486		 * a time.  (Perhaps we should only clear those
2487		 * event bits which we note?) XXX
2488		 */
2489		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
2490		vp->v_pollinfo.vpi_revents |= events;
2491		selwakeup(&vp->v_pollinfo.vpi_selinfo);
2492	}
2493	simple_unlock(&vp->v_pollinfo.vpi_lock);
2494}
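/*
 * Example (sketch only): code that makes new data available on a vnode
 * would normally announce it through the VN_POLLEVENT() wrapper rather
 * than calling vn_pollevent() directly, since the macro skips the call
 * when nobody has registered interest in the vnode, e.g.
 *
 *	VN_POLLEVENT(vp, POLLIN | POLLRDNORM);
 */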
2495
2496/*
2497 * Wake up anyone polling on vp because it is being revoked.
2498 * This depends on dead_poll() returning POLLHUP for correct
2499 * behavior.
2500 */
2501void
2502vn_pollgone(vp)
2503	struct vnode *vp;
2504{
2505	simple_lock(&vp->v_pollinfo.vpi_lock);
2506	if (vp->v_pollinfo.vpi_events) {
2507		vp->v_pollinfo.vpi_events = 0;
2508		selwakeup(&vp->v_pollinfo.vpi_selinfo);
2509	}
2510	simple_unlock(&vp->v_pollinfo.vpi_lock);
2511}
2512
2513
2514
2515/*
2516 * Routine to create and manage a filesystem syncer vnode.
2517 */
2518#define sync_close ((int (*) __P((struct  vop_close_args *)))nullop)
2519int	sync_fsync __P((struct  vop_fsync_args *));
2520int	sync_inactive __P((struct  vop_inactive_args *));
2521int	sync_reclaim  __P((struct  vop_reclaim_args *));
2522#define sync_lock ((int (*) __P((struct  vop_lock_args *)))vop_nolock)
2523#define sync_unlock ((int (*) __P((struct  vop_unlock_args *)))vop_nounlock)
2524int	sync_print __P((struct vop_print_args *));
2525#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)
2526
2527vop_t **sync_vnodeop_p;
2528struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
2529	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
2530	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
2531	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
2532	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
2533	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
2534	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
2535	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
2536	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
2537	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
2538	{ NULL, NULL }
2539};
2540struct vnodeopv_desc sync_vnodeop_opv_desc =
2541	{ &sync_vnodeop_p, sync_vnodeop_entries };
2542
2543VNODEOP_SET(sync_vnodeop_opv_desc);
2544
2545/*
2546 * Create a new filesystem syncer vnode for the specified mount point.
2547 */
2548int
2549vfs_allocate_syncvnode(mp)
2550	struct mount *mp;
2551{
2552	struct vnode *vp;
2553	static long start, incr, next;
2554	int error;
2555
2556	/* Allocate a new vnode */
2557	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
2558		mp->mnt_syncer = NULL;
2559		return (error);
2560	}
2561	vp->v_type = VNON;
2562	/*
2563	 * Place the vnode onto the syncer worklist. We attempt to
2564	 * scatter them about on the list so that they will go off
2565	 * at evenly distributed times even if all the filesystems
2566	 * are mounted at once.
2567	 */
2568	next += incr;
2569	if (next == 0 || next > syncer_maxdelay) {
2570		start /= 2;
2571		incr /= 2;
2572		if (start == 0) {
2573			start = syncer_maxdelay / 2;
2574			incr = syncer_maxdelay;
2575		}
2576		next = start;
2577	}
2578	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
2579	mp->mnt_syncer = vp;
2580	return (0);
2581}
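/*
 * Worked example of the scatter logic above (assuming a syncer_maxdelay
 * of 32 and the initial state start == incr == next == 0): successive
 * calls pick next = 16, 8, 24, 4, 12, 20, 28, 2, 6, ... (the worklist
 * slot actually used is next % syncdelay).  Each new syncer vnode thus
 * lands roughly halfway between previously used slots, keeping the
 * per-filesystem syncs spread across the syncer wheel.
 */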
2582
2583/*
2584 * Do a lazy sync of the filesystem.
2585 */
2586int
2587sync_fsync(ap)
2588	struct vop_fsync_args /* {
2589		struct vnode *a_vp;
2590		struct ucred *a_cred;
2591		int a_waitfor;
2592		struct proc *a_p;
2593	} */ *ap;
2594{
2595	struct vnode *syncvp = ap->a_vp;
2596	struct mount *mp = syncvp->v_mount;
2597	struct proc *p = ap->a_p;
2598	int asyncflag;
2599
2600	/*
2601	 * We only need to do something if this is a lazy evaluation.
2602	 */
2603	if (ap->a_waitfor != MNT_LAZY)
2604		return (0);
2605
2606	/*
2607	 * Move ourselves to the back of the sync list.
2608	 */
2609	vn_syncer_add_to_worklist(syncvp, syncdelay);
2610
2611	/*
2612	 * Walk the list of vnodes pushing all that are dirty and
2613	 * not already on the sync list.
2614	 */
2615	simple_lock(&mountlist_slock);
2616	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0)
2617		return (0);
2618	asyncflag = mp->mnt_flag & MNT_ASYNC;
2619	mp->mnt_flag &= ~MNT_ASYNC;
2620	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
2621	if (asyncflag)
2622		mp->mnt_flag |= MNT_ASYNC;
2623	vfs_unbusy(mp, p);
2624	return (0);
2625}
2626
2627/*
2628 * The syncer vnode is no longer referenced.
2629 */
2630int
2631sync_inactive(ap)
2632	struct vop_inactive_args /* {
2633		struct vnode *a_vp;
2634		struct proc *a_p;
2635	} */ *ap;
2636{
2637
2638	vgone(ap->a_vp);
2639	return (0);
2640}
2641
2642/*
2643 * The syncer vnode is no longer needed and is being decommissioned.
2644 */
2645int
2646sync_reclaim(ap)
2647	struct vop_reclaim_args /* {
2648		struct vnode *a_vp;
2649	} */ *ap;
2650{
2651	struct vnode *vp = ap->a_vp;
2652
2653	vp->v_mount->mnt_syncer = NULL;
2654	if (vp->v_flag & VONWORKLST) {
2655		LIST_REMOVE(vp, v_synclist);
2656		vp->v_flag &= ~VONWORKLST;
2657	}
2658
2659	return (0);
2660}
2661
2662/*
2663 * Print out a syncer vnode.
2664 */
2665int
2666sync_print(ap)
2667	struct vop_print_args /* {
2668		struct vnode *a_vp;
2669	} */ *ap;
2670{
2671	struct vnode *vp = ap->a_vp;
2672
2673	printf("syncer vnode");
2674	if (vp->v_vnlock != NULL)
2675		lockmgr_printinfo(vp->v_vnlock);
2676	printf("\n");
2677	return (0);
2678}
2679