/*	$OpenBSD: vfs_subr.c,v 1.18 1998/01/11 02:10:44 csapuntz Exp $	*/
/*	$NetBSD: vfs_subr.c,v 1.53 1996/04/22 01:39:13 christos Exp $	*/

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * External virtual filesystem routines
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/time.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/syscallargs.h>

#include <vm/vm.h>
#include <sys/sysctl.h>

#include <miscfs/specfs/specdev.h>

enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int	vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};
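
/*
 * These tables presumably back the IFTOVT() and VTTOIF() macros in
 * <sys/vnode.h> (an assumption; the macros are not defined in this
 * file).  As a worked example, S_IFDIR is 0040000, so
 * iftovt_tab[S_IFDIR >> 12] == iftovt_tab[4] == VDIR, and in the
 * other direction vttoif_tab[VDIR] == vttoif_tab[2] == S_IFDIR.
 */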

int doforce = 1;		/* 1 => permit forcible unmounting */
int prtactive = 0;		/* 1 => print out reclaim of active vnodes */
int suid_clear = 1;		/* 1 => clear SUID / SGID on owner change */

/*
 * Insq/Remq for the vnode usage lists.
 */
#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define	bufremvn(bp) {							\
	LIST_REMOVE(bp, b_vnbufs);					\
	(bp)->b_vnbufs.le_next = NOLIST;				\
}

struct freelst vnode_hold_list;   /* list of vnodes referencing buffers */
struct freelst vnode_free_list;   /* vnode free list */

struct mntlist mountlist;			/* mounted filesystem list */
struct simplelock mountlist_slock;
static struct simplelock mntid_slock;
struct simplelock mntvnode_slock;
struct simplelock vnode_free_list_slock;
struct simplelock spechash_slock;

void insmntque __P((struct vnode *, struct mount *));
int getdevvp __P((dev_t, struct vnode **, enum vtype));
int vunref __P((struct vnode *));

int vfs_hang_addrlist __P((struct mount *, struct netexport *,
				  struct export_args *));
int vfs_free_netcred __P((struct radix_node *, void *));
void vfs_free_addrlist __P((struct netexport *));

#ifdef DEBUG
void printlockedvnodes __P((void));
#endif

/*
 * Initialize the vnode management data structures.
 */
void
vntblinit()
{

	simple_lock_init(&mntvnode_slock);
	simple_lock_init(&mntid_slock);
	simple_lock_init(&spechash_slock);
	TAILQ_INIT(&vnode_hold_list);
	TAILQ_INIT(&vnode_free_list);
	simple_lock_init(&vnode_free_list_slock);
	CIRCLEQ_INIT(&mountlist);
	/*
	 * Initialize the filesystem syncer.
	 */
	vn_initialize_syncerd();
}

/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting. The interlock is not released on failure.
 */
int
vfs_busy(mp, flags, interlkp, p)
	struct mount *mp;
	int flags;
	struct simplelock *interlkp;
	struct proc *p;
{
	int lkflags;

	if (mp->mnt_flag & MNT_UNMOUNT) {
		if (flags & LK_NOWAIT)
			return (ENOENT);
		mp->mnt_flag |= MNT_MWAIT;
		if (interlkp)
			simple_unlock(interlkp);
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		sleep((caddr_t)mp, PVFS);
		if (interlkp)
			simple_lock(interlkp);
		return (ENOENT);
	}
	lkflags = LK_SHARED;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}
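
/*
 * A usage sketch: callers that walk the mount list typically bracket
 * each mount point with vfs_busy()/vfs_unbusy(), passing the mount
 * list lock as the interlock so it is dropped while sleeping (this
 * mirrors printlockedvnodes() and sysctl_vnode() later in this file):
 *
 *	simple_lock(&mountlist_slock);
 *	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
 *	     mp = nmp) {
 *		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
 *			nmp = CIRCLEQ_NEXT(mp, mnt_list);
 *			continue;
 *		}
 *		... examine mp ...
 *		simple_lock(&mountlist_slock);
 *		nmp = CIRCLEQ_NEXT(mp, mnt_list);
 *		vfs_unbusy(mp, p);
 *	}
 *	simple_unlock(&mountlist_slock);
 */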

/*
 * Free a busy file system
 */
void
vfs_unbusy(mp, p)
	struct mount *mp;
	struct proc *p;
{
	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
}

/*
 * Look up a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(fstypename, devname, mpp)
	char *fstypename;
	char *devname;
	struct mount **mpp;
{
	struct proc *p = curproc;	/* XXX */
	struct vfsconf *vfsp;
	struct mount *mp;

	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	bzero((char *)mp, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
	LIST_INIT(&mp->mnt_vnodelist);
	mp->mnt_vfc = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfc_refcount++;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Find an appropriate filesystem to use for the root. If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * and try each one's mountroot routine until one works or all have
 * been tried.
 */
int
vfs_mountroot()
{
	struct vfsconf *vfsp;
	extern int (*mountroot)(void);
	int error;

	if (mountroot != NULL)
		return ((*mountroot)());
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL)
			continue;
		if ((error = (*vfsp->vfc_mountroot)()) == 0)
			return (0);
		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	}
	return (ENODEV);
}

/*
 * Look up a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
	     mp = mp->mnt_list.cqe_next) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			simple_unlock(&mountlist_slock);
			return (mp);
		}
	}
	simple_unlock(&mountlist_slock);
	return ((struct mount *)0);
}

/*
 * Get a new unique fsid
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_short xxxfs_mntid;

	fsid_t tfsid;
	int mtype;

	simple_lock(&mntid_slock);
	mtype = mp->mnt_vfc->vfc_typenum;
	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
	mp->mnt_stat.f_fsid.val[1] = mtype;
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
	tfsid.val[1] = mtype;
	if (mountlist.cqh_first != (void *)&mountlist) {
		while (vfs_getvfs(&tfsid)) {
			tfsid.val[0]++;
			xxxfs_mntid++;
		}
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	simple_unlock(&mntid_slock);
}
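
/*
 * Concretely (derived from the code above): the first mount of a given
 * type ends up with f_fsid.val[0] == makedev(nblkdev + mtype, 1) and
 * f_fsid.val[1] == mtype, since xxxfs_mntid starts at 1; later mounts
 * keep bumping the minor number until vfs_getvfs() no longer finds a
 * collision.
 */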

/*
 * Make a 'unique' number from a mount type name.
 * Note that this is no longer used for ffs, which
 * now has an on-disk filesystem id.
 */
long
makefstype(type)
	char *type;
{
	long rv;

	for (rv = 0; *type; type++) {
		rv <<= 2;
		rv ^= *type;
	}
	return rv;
}
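
/*
 * Worked example: makefstype("ffs") folds each character into an
 * accumulator that is shifted left two bits per step, i.e.
 *	rv = ((('f' << 2) ^ 'f') << 2) ^ 's'
 * Distinct short names usually map to distinct values, but as the
 * comment above says, the result is only 'unique' in quotes.
 */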

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
	register struct vattr *vap;
{

	vap->va_type = VNON;
	/* XXX These next two used to be one line, but for a GCC bug. */
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
		vap->va_fsid = vap->va_fileid =
		vap->va_blocksize = vap->va_rdev =
		vap->va_atime.tv_sec = vap->va_atime.tv_nsec =
		vap->va_mtime.tv_sec = vap->va_mtime.tv_nsec =
		vap->va_ctime.tv_sec = vap->va_ctime.tv_nsec =
		vap->va_flags = vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern int (**dead_vnodeop_p) __P((void *));
long numvnodes;

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	int (**vops) __P((void *));
	struct vnode **vpp;
{
	struct proc *p = curproc;			/* XXX */
	struct freelst *listhd;
	static int toggle;
	struct vnode *vp;
#ifdef DIAGNOSTIC
	int s;
#endif

	/*
	 * We must choose whether to allocate a new vnode or recycle an
	 * existing one. The criterion for allocating a new one is that
	 * the total number of vnodes is less than the number desired or
	 * there are no vnodes on either free list. Generally we only
	 * want to recycle vnodes that have no buffers associated with
	 * them, so we look first on the vnode_free_list. If it is empty,
	 * we next consider vnodes with referencing buffers on the
	 * vnode_hold_list. The toggle ensures that half the time we
	 * will use a vnode from the vnode_hold_list, and half the time
	 * we will allocate a new one unless the list has grown to twice
	 * the desired size. We are reluctant to recycle vnodes from the
	 * vnode_hold_list because we will lose the identity of all its
	 * referencing buffers.
	 */
	toggle ^= 1;
	if (numvnodes > 2 * desiredvnodes)
		toggle = 0;

	simple_lock(&vnode_free_list_slock);
	if ((numvnodes < desiredvnodes) ||
	    ((TAILQ_FIRST(listhd = &vnode_free_list) == NULL) &&
	     ((TAILQ_FIRST(listhd = &vnode_hold_list) == NULL) || toggle))) {
		simple_unlock(&vnode_free_list_slock);
		vp = (struct vnode *)malloc((u_long)sizeof *vp,
		    M_VNODE, M_WAITOK);
		bzero((char *)vp, sizeof *vp);
		numvnodes++;
	} else {
		for (vp = TAILQ_FIRST(listhd); vp != NULLVP;
		     vp = TAILQ_NEXT(vp, v_freelist)) {
			if (simple_lock_try(&vp->v_interlock))
				break;
		}
		/*
		 * Unless this is a bad time of the month, at most
		 * the first NCPUS items on the free list are
		 * locked, so this is close enough to being empty.
		 */
		if (vp == NULLVP) {
			simple_unlock(&vnode_free_list_slock);
			tablefull("vnode");
			*vpp = 0;
			return (ENFILE);
		}
		if (vp->v_usecount) {
			vprint("free vnode", vp);
			panic("free vnode isn't");
		}
		TAILQ_REMOVE(listhd, vp, v_freelist);
		/* see the VGONEHACK comment in vhold() below */
		vp->v_flag |= VGONEHACK;
		simple_unlock(&vnode_free_list_slock);
		vp->v_lease = NULL;
		if (vp->v_type != VBAD)
			vgonel(vp, p);
		else
			simple_unlock(&vp->v_interlock);
#ifdef DIAGNOSTIC
		if (vp->v_data) {
			vprint("cleaned vnode", vp);
			panic("cleaned vnode isn't");
		}
		s = splbio();
		if (vp->v_numoutput)
			panic("Clean vnode has pending I/O's");
		splx(s);
#endif
		vp->v_flag = 0;
		vp->v_lastr = 0;
		vp->v_ralen = 0;
		vp->v_maxra = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
	}
	vp->v_type = VNON;
	cache_purge(vp);
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	return (0);
}

/*
 * Move a vnode from one mount queue to another.
 */
void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{
	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) != NULL)
		LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Create a vnode for a block device.
 * Used for root filesystem, argdev, and swap areas.
 * Also used for memory file system special devices.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{

	return (getdevvp(dev, vpp, VBLK));
}

/*
 * Create a vnode for a character device.
 * Used for kernfs and some console handling.
 */
int
cdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{

	return (getdevvp(dev, vpp, VCHR));
}

/*
 * Create a vnode for a device.
 * Used by bdevvp (block device) for root file system etc.,
 * and by cdevvp (character device) for console and kernfs.
 */
int
getdevvp(dev, vpp, type)
	dev_t dev;
	struct vnode **vpp;
	enum vtype type;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (0);
	}
	error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = type;
	if ((nvp = checkalias(vp, dev, NULL)) != 0) {
		vput(vp);
		vp = nvp;
	}
	*vpp = vp;
	return (0);
}

/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device). If such an alias exists, deallocate
 * the existing contents and return the aliased vnode. The
 * caller is responsible for filling it with its new contents.
 */
struct vnode *
checkalias(nvp, nvp_rdev, mp)
	register struct vnode *nvp;
	dev_t nvp_rdev;
	struct mount *mp;
{
	struct proc *p = curproc;
	register struct vnode *vp;
	struct vnode **vpp;

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		return (NULLVP);

	vpp = &speclisth[SPECHASH(nvp_rdev)];
loop:
	simple_lock(&spechash_slock);
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
			continue;
		simple_lock(&vp->v_interlock);
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&spechash_slock);
			vgonel(vp, p);
			goto loop;
		}
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
			simple_unlock(&spechash_slock);
			goto loop;
		}
		break;
	}
	if (vp == NULL || vp->v_tag != VT_NON || vp->v_type != VBLK) {
		MALLOC(nvp->v_specinfo, struct specinfo *,
			sizeof(struct specinfo), M_VNODE, M_WAITOK);
		nvp->v_rdev = nvp_rdev;
		nvp->v_hashchain = vpp;
		nvp->v_specnext = *vpp;
		nvp->v_specmountpoint = NULL;
		nvp->v_speclockf = NULL;
		simple_unlock(&spechash_slock);
		*vpp = nvp;
		if (vp != NULLVP) {
			nvp->v_flag |= VALIASED;
			vp->v_flag |= VALIASED;
			vput(vp);
		}
		return (NULLVP);
	}
	simple_unlock(&spechash_slock);
	VOP_UNLOCK(vp, 0, p);
	simple_lock(&vp->v_interlock);
	vclean(vp, 0, p);
	vp->v_op = nvp->v_op;
	vp->v_tag = nvp->v_tag;
	nvp->v_type = VNON;
	insmntque(vp, mp);
	return (vp);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. The vnode lock bit is set while
 * the vnode is being eliminated in vgone. The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, flags, p)
	struct vnode *vp;
	int flags;
	struct proc *p;
{
	int error;
	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure. Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */
	if ((flags & LK_INTERLOCK) == 0)
		simple_lock(&vp->v_interlock);
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vget", 0);
		return (ENOENT);
	}
	if (vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		if (vp->v_holdcnt > 0)
			TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
		else
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
	vp->v_usecount++;
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
			vunref(vp);
			simple_unlock(&vp->v_interlock);
		}
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

#ifdef DIAGNOSTIC
/*
 * Vnode reference.
 */
void
vref(vp)
	struct vnode *vp;
{
	simple_lock(&vp->v_interlock);
	if (vp->v_usecount <= 0)
		panic("vref used where vget required");
	vp->v_usecount++;
	simple_unlock(&vp->v_interlock);
}
#endif /* DIAGNOSTIC */

int
vunref(vp)
	struct vnode *vp;
{
#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vrele: null vp");
#endif
	simple_lock(&vp->v_interlock);
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		simple_unlock(&vp->v_interlock);
		return (vp->v_usecount);
	}
#ifdef DIAGNOSTIC
	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
		vprint("vrele: bad ref count", vp);
		panic("vrele: ref cnt");
	}
#endif
	/*
	 * insert at tail of LRU list
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);

	return (0);
}

/*
 * vput(), just unlock and vrele()
 */
void
vput(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vput: null vp");
#endif
	simple_lock(&vp->v_interlock);
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		simple_unlock(&vp->v_interlock);
		VOP_UNLOCK(vp, 0, p);
		return;
	}
#ifdef DIAGNOSTIC
	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
		vprint("vput: bad ref count", vp);
		panic("vput: ref cnt");
	}
#endif
	/*
	 * insert at tail of LRU list
	 */
	simple_lock(&vnode_free_list_slock);
	if (vp->v_holdcnt > 0)
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
	else
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
	simple_unlock(&vp->v_interlock);
	VOP_INACTIVE(vp, p);
}

/*
 * Vnode release - use for active VNODES.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;

	if (vunref(vp) == 0 &&
	    vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0)
		VOP_INACTIVE(vp, p);
}

#ifdef DIAGNOSTIC
/*
 * Page or buffer structure gets a reference.
 */
void
vhold(vp)
	register struct vnode *vp;
{

	/*
	 * If it is on the freelist and the hold count is currently
	 * zero, move it to the hold list.
	 *
	 * The VGONEHACK flag reflects a call from getnewvnode,
	 * which will remove the vnode from the free list, but
	 * will not increment the ref count until after it calls vgone.
	 * If the ref count were incremented first, vgone would
	 * (incorrectly) try to close the previous instance of the
	 * underlying object.
	 */
	simple_lock(&vp->v_interlock);
	if (!(vp->v_flag & VGONEHACK) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
	vp->v_holdcnt++;
	simple_unlock(&vp->v_interlock);
}

/*
 * Page or buffer structure frees a reference.
 */
void
holdrele(vp)
	register struct vnode *vp;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_holdcnt <= 0)
		panic("holdrele: holdcnt");
	vp->v_holdcnt--;
	/*
	 * If it is on the holdlist and the hold count drops to
	 * zero, move it to the free list.
	 *
	 * See above for VGONEHACK
	 */
	if (!(vp->v_flag & VGONEHACK) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
	simple_unlock(&vp->v_interlock);
}
#endif /* DIAGNOSTIC */

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones;
 * an error is returned if any are found (nb: this is a user error,
 * not a system error). If MNT_FORCE is specified, detach any active
 * vnodes that are found.
 */
#ifdef DEBUG
int busyprt = 0;	/* print out busy vnodes */
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif

int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;
	register struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
		if (vp->v_mount != mp)
			goto loop;
		nvp = vp->v_mntvnodes.le_next;
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;

		simple_lock(&vp->v_interlock);
		/*
		 * Skip over vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file
		 * vnodes open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * With v_usecount == 0, all we need to do is clear
		 * out the vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}
		/*
		 * If FORCECLOSE is set, forcibly close the vnode.
		 * For block or character devices, revert to an
		 * anonymous device. For all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *)0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DEBUG
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
 * The vnode interlock is held on entry.
 */
void
vclean(vp, flags, p)
	register struct vnode *vp;
	int flags;
	struct proc *p;
{
	int active;

	/*
	 * Check to see if the vnode is in use.
	 * If so we have to reference it before we clean it out
	 * so that its count cannot fall to zero and generate a
	 * race against ourselves to recycle it.
	 */
	if ((active = vp->v_usecount) != 0)
		vp->v_usecount++;

	/*
	 * Prevent the vnode from being recycled or
	 * brought into use while we clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock");
	vp->v_flag |= VXLOCK;

	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);

	/*
	 * Clean out any buffers associated with the vnode.
	 */
	if (flags & DOCLOSE)
		vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0, p);
	}

	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim");
	if (active) {
		if (vunref(vp) == 0 &&
		    vp->v_holdcnt > 0)
			panic("vclean: not clean");
		simple_unlock(&vp->v_interlock);
	}
	cache_purge(vp);
	if (vp->v_vnlock) {
		if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
			vprint("vclean: lock not drained", vp);
		FREE(vp->v_vnlock, M_VNODE);
		vp->v_vnlock = NULL;
	}

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	vp->v_flag &= ~VXLOCK;
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		wakeup((caddr_t)vp);
	}
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct simplelock *inter_lkp;
	struct proc *p;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		if (inter_lkp)
			simple_unlock(inter_lkp);
		vgonel(vp, p);
		return (1);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;

	simple_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 */
void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	register struct vnode *vq;
	struct vnode *vx;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vgone", 0);
		return;
	}
	/*
	 * Clean out the filesystem specific data.
	 */
	vclean(vp, DOCLOSE, p);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
		simple_lock(&spechash_slock);
		if (*vp->v_hashchain == vp) {
			*vp->v_hashchain = vp->v_specnext;
		} else {
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_specnext != vp)
					continue;
				vq->v_specnext = vp->v_specnext;
				break;
			}
			if (vq == NULL)
				panic("missing bdev");
		}
		if (vp->v_flag & VALIASED) {
			vx = NULL;
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type)
					continue;
				if (vx)
					break;
				vx = vq;
			}
			if (vx == NULL)
				panic("missing alias");
			if (vq == NULL)
				vx->v_flag &= ~VALIASED;
			vp->v_flag &= ~VALIASED;
		}
		simple_unlock(&spechash_slock);
		FREE(vp->v_specinfo, M_VNODE);
		vp->v_specinfo = NULL;
	}
	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list.
	 *
	 * See above about the VGONEHACK
	 */
	if (vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		if (vp->v_holdcnt > 0)
			panic("vgonel: not clean");
		if (!(vp->v_flag & VGONEHACK) &&
		    TAILQ_FIRST(&vnode_free_list) != vp) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		}
		simple_unlock(&vnode_free_list_slock);
	}
	vp->v_type = VBAD;
}

/*
 * Look up a vnode by device number.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	register struct vnode *vp;
	int rc = 0;

	simple_lock(&spechash_slock);
	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		*vpp = vp;
		rc = 1;
		break;
	}
	simple_unlock(&spechash_slock);
	return (rc);
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vp)
	struct vnode *vp;
{
	struct vnode *vq, *vnext;
	int count;

loop:
	if ((vp->v_flag & VALIASED) == 0)
		return (vp->v_usecount);
	simple_lock(&spechash_slock);
	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
		vnext = vq->v_specnext;
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vq->v_usecount == 0 && vq != vp) {
			simple_unlock(&spechash_slock);
			vgone(vq);
			goto loop;
		}
		count += vq->v_usecount;
	}
	simple_unlock(&spechash_slock);
	return (count);
}

/*
 * Print out a description of a vnode.
 */
static char *typename[] =
   { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };

void
vprint(label, vp)
	char *label;
	register struct vnode *vp;
{
	char buf[64];

	if (label != NULL)
		printf("%s: ", label);
	printf("type %s, usecount %d, writecount %d, refcount %ld,",
		typename[vp->v_type], vp->v_usecount, vp->v_writecount,
		vp->v_holdcnt);
	buf[0] = '\0';
	if (vp->v_flag & VROOT)
		strcat(buf, "|VROOT");
	if (vp->v_flag & VTEXT)
		strcat(buf, "|VTEXT");
	if (vp->v_flag & VSYSTEM)
		strcat(buf, "|VSYSTEM");
	if (vp->v_flag & VXLOCK)
		strcat(buf, "|VXLOCK");
	if (vp->v_flag & VXWANT)
		strcat(buf, "|VXWANT");
	if (vp->v_flag & VBWAIT)
		strcat(buf, "|VBWAIT");
	if (vp->v_flag & VALIASED)
		strcat(buf, "|VALIASED");
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DEBUG
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
void
printlockedvnodes()
{
	struct proc *p = curproc;
	register struct mount *mp, *nmp;
	register struct vnode *vp;

	printf("Locked vnodes\n");
	simple_lock(&mountlist_slock);
	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
	     mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = CIRCLEQ_NEXT(mp, mnt_list);
			continue;
		}
		for (vp = mp->mnt_vnodelist.lh_first;
		     vp != NULL;
		     vp = vp->v_mntvnodes.le_next) {
			if (VOP_ISLOCKED(vp))
				vprint((char *)0, vp);
		}
		simple_lock(&mountlist_slock);
		nmp = CIRCLEQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);
}
#endif

/*
 * Top level filesystem related information gathering.
 */
int
vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
	int *name;
	u_int namelen;
	void *oldp;
	size_t *oldlenp;
	void *newp;
	size_t newlen;
	struct proc *p;
{
	struct vfsconf *vfsp;

	/* all sysctl names at this level are at least name and field */
	if (namelen < 2)
		return (ENOTDIR);		/* overloaded */
	if (name[0] != VFS_GENERIC) {
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[0])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
		    oldp, oldlenp, newp, newlen, p));
	}
	switch (name[1]) {
	case VFS_MAXTYPENUM:
		return (sysctl_rdint(oldp, oldlenp, newp, maxvfsconf));
	case VFS_CONF:
		if (namelen < 3)
			return (ENOTDIR);	/* overloaded */
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[2])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return (sysctl_rdstruct(oldp, oldlenp, newp, vfsp,
		    sizeof(struct vfsconf)));
	}
	return (EOPNOTSUPP);
}
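
/*
 * Illustration (hypothetical request, derived from the dispatch
 * above): a sysctl call arriving with name = { VFS_GENERIC,
 * VFS_MAXTYPENUM } takes the VFS_GENERIC branch and returns
 * maxvfsconf via sysctl_rdint(); with name[0] set to a filesystem's
 * vfc_typenum instead, the request is forwarded to that filesystem's
 * own vfs_sysctl handler with name[0] stripped off.
 */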

int kinfo_vdebug = 1;
int kinfo_vgetfailed;
#define KINFO_VNODESLOP	10
/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
/* ARGSUSED */
int
sysctl_vnode(where, sizep, p)
	char *where;
	size_t *sizep;
	struct proc *p;
{
	register struct mount *mp, *nmp;
	struct vnode *vp, *nvp;
	register char *bp = where, *savebp;
	char *ewhere;
	int error;

#define VPTRSZ	sizeof (struct vnode *)
#define VNODESZ	sizeof (struct vnode)
	if (where == NULL) {
		*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
		return (0);
	}
	ewhere = where + *sizep;

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = mp->mnt_list.cqe_next;
			continue;
		}
		savebp = bp;
again:
		for (vp = mp->mnt_vnodelist.lh_first;
		     vp != NULL;
		     vp = nvp) {
			/*
			 * Check that the vp is still associated with
			 * this filesystem.  RACE: could have been
			 * recycled onto the same filesystem.
			 */
			if (vp->v_mount != mp) {
				simple_unlock(&mntvnode_slock);
				if (kinfo_vdebug)
					printf("kinfo: vp changed\n");
				bp = savebp;
				goto again;
			}
			nvp = vp->v_mntvnodes.le_next;
			if (bp + VPTRSZ + VNODESZ > ewhere) {
				simple_unlock(&mntvnode_slock);
				*sizep = bp - where;
				return (ENOMEM);
			}
			if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) ||
			   (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ)))
				return (error);
			bp += VPTRSZ + VNODESZ;
			simple_lock(&mntvnode_slock);
		}

		simple_unlock(&mntvnode_slock);
		simple_lock(&mountlist_slock);
		nmp = mp->mnt_list.cqe_next;
		vfs_unbusy(mp, p);
	}

	simple_unlock(&mountlist_slock);

	*sizep = bp - where;
	return (0);
}

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(vp)
	register struct vnode *vp;
{
	register struct vnode *vq;
	int error = 0;

	if (vp->v_specmountpoint != NULL)
		return (EBUSY);
	if (vp->v_flag & VALIASED) {
		simple_lock(&spechash_slock);
		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
			if (vq->v_rdev != vp->v_rdev ||
			    vq->v_type != vp->v_type)
				continue;
			if (vq->v_specmountpoint != NULL) {
				error = EBUSY;
				break;
			}
		}
		simple_unlock(&spechash_slock);
	}
	return (error);
}

/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by ufs_mount() to set up the lists of export addresses.
 */
int
vfs_hang_addrlist(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	register int i;
	struct radix_node *rn;
	struct sockaddr *saddr, *smask = 0;
	struct domain *dom;
	int error;

	if (argp->ex_addrlen == 0) {
		if (mp->mnt_flag & MNT_DEFEXPORTED)
			return (EPERM);
		np = &nep->ne_defexported;
		np->netc_exflags = argp->ex_flags;
		np->netc_anon = argp->ex_anon;
		np->netc_anon.cr_ref = 1;
		mp->mnt_flag |= MNT_DEFEXPORTED;
		return (0);
	}
	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	np = (struct netcred *)malloc(i, M_NETADDR, M_WAITOK);
	bzero((caddr_t)np, i);
	saddr = (struct sockaddr *)(np + 1);
	error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen);
	if (error)
		goto out;
	if (saddr->sa_len > argp->ex_addrlen)
		saddr->sa_len = argp->ex_addrlen;
	if (argp->ex_masklen) {
		smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
		error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen);
		if (error)
			goto out;
		if (smask->sa_len > argp->ex_masklen)
			smask->sa_len = argp->ex_masklen;
	}
	i = saddr->sa_family;
	if ((rnh = nep->ne_rtable[i]) == 0) {
		/*
		 * Seems silly to initialize every AF when most are not
		 * used; do so on demand here.
		 */
		for (dom = domains; dom; dom = dom->dom_next)
			if (dom->dom_family == i && dom->dom_rtattach) {
				dom->dom_rtattach((void **)&nep->ne_rtable[i],
					dom->dom_rtoffset);
				break;
			}
		if ((rnh = nep->ne_rtable[i]) == 0) {
			error = ENOBUFS;
			goto out;
		}
	}
	rn = (*rnh->rnh_addaddr)((caddr_t)saddr, (caddr_t)smask, rnh,
		np->netc_rnodes);
	if (rn == 0 || np != (struct netcred *)rn) { /* already exists */
		error = EPERM;
		goto out;
	}
	np->netc_exflags = argp->ex_flags;
	np->netc_anon = argp->ex_anon;
	np->netc_anon.cr_ref = 1;
	return (0);
out:
	free(np, M_NETADDR);
	return (error);
}

/* ARGSUSED */
int
vfs_free_netcred(rn, w)
	struct radix_node *rn;
	void *w;
{
	register struct radix_node_head *rnh = (struct radix_node_head *)w;

	(*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh);
	free((caddr_t)rn, M_NETADDR);
	return (0);
}

/*
 * Free the net address hash lists that are hanging off the mount points.
 */
void
vfs_free_addrlist(nep)
	struct netexport *nep;
{
	register int i;
	register struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if ((rnh = nep->ne_rtable[i]) != NULL) {
			(*rnh->rnh_walktree)(rnh, vfs_free_netcred, rnh);
			free((caddr_t)rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

int
vfs_export(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if ((error = vfs_hang_addrlist(mp, nep, argp)) != 0)
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}
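
/*
 * A hedged usage sketch (names are illustrative, not taken from this
 * file; compare the ufs_mount()-style callers mentioned above): a
 * filesystem's mount routine forwards the export_args from the
 * mount(2) update path, roughly
 *
 *	if (args.fspec == 0)
 *		return (vfs_export(mp, &ump->um_export, &args.export));
 *
 * where ump->um_export would be that filesystem's per-mount
 * struct netexport.
 */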

struct netcred *
vfs_export_lookup(mp, nep, nam)
	register struct mount *mp;
	struct netexport *nep;
	struct mbuf *nam;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Look up in the export list first.
		 */
		if (nam != NULL) {
			saddr = mtod(nam, struct sockaddr *);
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
					(*rnh->rnh_matchaddr)((caddr_t)saddr,
							      rnh);
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}

/*
 * Do the usual access checking.
 * file_mode, uid and gid are from the vnode in question,
 * while acc_mode and cred are from the VOP_ACCESS parameter list.
 */
int
vaccess(file_mode, uid, gid, acc_mode, cred)
	mode_t file_mode;
	uid_t uid;
	gid_t gid;
	mode_t acc_mode;
	struct ucred *cred;
{
	mode_t mask;

	/* User id 0 always gets access. */
	if (cred->cr_uid == 0)
		return 0;

	mask = 0;

	/* Otherwise, check the owner. */
	if (cred->cr_uid == uid) {
		if (acc_mode & VEXEC)
			mask |= S_IXUSR;
		if (acc_mode & VREAD)
			mask |= S_IRUSR;
		if (acc_mode & VWRITE)
			mask |= S_IWUSR;
		return (file_mode & mask) == mask ? 0 : EACCES;
	}

	/* Otherwise, check the groups. */
	if (cred->cr_gid == gid || groupmember(gid, cred)) {
		if (acc_mode & VEXEC)
			mask |= S_IXGRP;
		if (acc_mode & VREAD)
			mask |= S_IRGRP;
		if (acc_mode & VWRITE)
			mask |= S_IWGRP;
		return (file_mode & mask) == mask ? 0 : EACCES;
	}

	/* Otherwise, check everyone else. */
	if (acc_mode & VEXEC)
		mask |= S_IXOTH;
	if (acc_mode & VREAD)
		mask |= S_IROTH;
	if (acc_mode & VWRITE)
		mask |= S_IWOTH;
	return (file_mode & mask) == mask ? 0 : EACCES;
}
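
/*
 * Worked example (derived from the code above): with file_mode 0640,
 * a non-owner credential whose cr_gid matches gid and which requests
 * VREAD builds mask = S_IRGRP; (0640 & S_IRGRP) == S_IRGRP, so the
 * function returns 0.  Requesting VWRITE as well would add S_IWGRP
 * to the mask and the check would fail with EACCES.
 */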

/*
 * Unmount all file systems.
 * We traverse the list in reverse order under the assumption that doing so
 * will avoid needing to worry about dependencies.
 */
void
vfs_unmountall()
{
	register struct mount *mp, *nmp;
	int allerror, error;

	for (allerror = 0,
	     mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
		nmp = mp->mnt_list.cqe_prev;
		if ((error = dounmount(mp, MNT_FORCE, curproc)) != 0) {
			printf("unmount of %s failed with error %d\n",
			    mp->mnt_stat.f_mntonname, error);
			allerror = 1;
		}
	}
	if (allerror)
		printf("WARNING: some file systems would not unmount\n");
}

/*
 * Sync and unmount file systems before shutting down.
 */
void
vfs_shutdown()
{
	register struct buf *bp;
	int iter, nbusy;

	/* XXX Should suspend scheduling. */
	(void) spl0();

	printf("syncing disks... ");

	if (panicstr == 0) {
		/* Release inodes held by texts before update. */
		vnode_pager_umount(NULL);
#ifdef notdef
		vnshutdown();
#endif

		/* Sync before unmount, in case we hang on something. */
		sys_sync(&proc0, (void *)0, (register_t *)0);

		/* Unmount file systems. */
		vfs_unmountall();
	}

	/* Sync again after unmount, just in case. */
	sys_sync(&proc0, (void *)0, (register_t *)0);

	/* Wait for sync to finish. */
	for (iter = 0; iter < 20; iter++) {
		nbusy = 0;
		for (bp = &buf[nbuf]; --bp >= buf; )
			if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
				nbusy++;
		if (nbusy == 0)
			break;
		printf("%d ", nbusy);
		DELAY(40000 * iter);
	}
	if (nbusy)
		printf("giving up\n");
	else
		printf("done\n");
}

/*
 * POSIX file system related system variables.
 */
int
fs_posix_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
	int *name;
	u_int namelen;
	void *oldp;
	size_t *oldlenp;
	void *newp;
	size_t newlen;
	struct proc *p;
{
	/* all sysctl names at this level are terminal */
	if (namelen != 1)
		return (ENOTDIR);

	switch (name[0]) {
	case FS_POSIX_SETUID:
		if (newp && securelevel > 0)
			return (EPERM);
		return (sysctl_int(oldp, oldlenp, newp, newlen, &suid_clear));
	default:
		return (EOPNOTSUPP);
	}
	/* NOTREACHED */
}

/*
 * File system related system variables.
 */
int
fs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
	int *name;
	u_int namelen;
	void *oldp;
	size_t *oldlenp;
	void *newp;
	size_t newlen;
	struct proc *p;
{
	sysctlfn *fn;

	switch (name[0]) {
	case FS_POSIX:
		fn = fs_posix_sysctl;
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (*fn)(name + 1, namelen - 1, oldp, oldlenp, newp, newlen, p);
}

/*
 * Routines dealing with vnodes and buffers
 */

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp) != NULL) {
		if (--vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t)&vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;

	if ((flags & V_SAVE) && vp->v_dirtyblkhd.lh_first != NULL) {
		if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
			return (error);
		if (vp->v_dirtyblkhd.lh_first != NULL)
			panic("vinvalbuf: dirty bufs");
	}
	for (;;) {
		if ((blist = vp->v_cleanblkhd.lh_first) &&
		    (flags & V_SAVEMETA))
			while (blist && blist->b_lblkno < 0)
				blist = blist->b_vnbufs.le_next;
		if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
		    (flags & V_SAVEMETA))
			while (blist && blist->b_lblkno < 0)
				blist = blist->b_vnbufs.le_next;
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = bp->b_vnbufs.le_next;
			if (flags & V_SAVEMETA && bp->b_lblkno < 0)
				continue;
			s = splbio();
			if (bp->b_flags & B_BUSY) {
				bp->b_flags |= B_WANTED;
				error = tsleep((caddr_t)bp,
					slpflag | (PRIBIO + 1), "vinvalbuf",
					slptimeo);
				splx(s);
				if (error)
					return (error);
				break;
			}
			bp->b_flags |= B_BUSY | B_VFLUSH;
			splx(s);
			/*
			 * XXX Since there are no node locks for NFS, I believe
			 * there is a slight chance that a delayed write will
			 * occur while sleeping just above, so check for it.
			 */
			if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
				(void) VOP_BWRITE(bp);
				break;
			}
			bp->b_flags |= B_INVAL;
			brelse(bp);
		}
	}
	if (!(flags & V_SAVEMETA) &&
	    (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first))
		panic("vinvalbuf: flush failed");
	return (0);
}

void
vflushbuf(vp, sync)
	register struct vnode *vp;
	int sync;
{
	register struct buf *bp, *nbp;
	int s;

loop:
	s = splbio();
	for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) {
		nbp = bp->b_vnbufs.le_next;
		if ((bp->b_flags & B_BUSY))
			continue;
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("vflushbuf: not dirty");
		bp->b_flags |= B_BUSY | B_VFLUSH;
		splx(s);
		/*
		 * Wait for I/O associated with indirect blocks to complete,
		 * since there is no way to quickly wait for them below.
		 */
		if (bp->b_vp == vp || sync == 0)
			(void) bawrite(bp);
		else
			(void) bwrite(bp);
		goto loop;
	}
	if (sync == 0) {
		splx(s);
		return;
	}
	while (vp->v_numoutput) {
		vp->v_flag |= VBWAIT;
		tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "vflushbuf", 0);
	}
	splx(s);
	if (vp->v_dirtyblkhd.lh_first != NULL) {
		vprint("vflushbuf: dirty", vp);
		goto loop;
	}
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{

	if (bp->b_vp)
		panic("bgetvp: not free");
	VHOLD(vp);
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	bufinsvn(bp, &vp->v_cleanblkhd);
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;
	struct buf *wasdirty;

	if ((vp = bp->b_vp) == (struct vnode *) 0)
		panic("brelvp: NULL");
	/*
	 * Delete from old vnode list, if on one.
	 */
	wasdirty = vp->v_dirtyblkhd.lh_first;
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
	if (wasdirty && LIST_FIRST(&vp->v_dirtyblkhd) == NULL)
		LIST_REMOVE(vp, v_synclist);
	bp->b_vp = (struct vnode *) 0;
	HOLDRELE(vp);
}

/*
 * Reassign a buffer from one vnode to another. Used to assign buffers
 * to the appropriate clean or dirty list and to add newly dirty vnodes
 * to the appropriate filesystem syncer list.
 */
void
reassignbuf(bp, newvp)
	register struct buf *bp;
	register struct vnode *newvp;
{
	struct buflists *listheadp;
	struct buf *wasdirty;
	int delay;

	if (newvp == NULL) {
		printf("reassignbuf: NULL\n");
		return;
	}
	/*
	 * Delete from old vnode list, if on one.
	 */
	wasdirty = newvp->v_dirtyblkhd.lh_first;
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
	/*
	 * If dirty, put on list of dirty buffers;
	 * otherwise insert onto list of clean buffers.
	 */
	if ((bp->b_flags & B_DELWRI) == 0) {
		listheadp = &newvp->v_cleanblkhd;
		if (wasdirty && LIST_FIRST(&newvp->v_dirtyblkhd) == NULL)
			LIST_REMOVE(newvp, v_synclist);
	} else {
		listheadp = &newvp->v_dirtyblkhd;
		if (LIST_FIRST(listheadp) == NULL) {
			switch (newvp->v_type) {
			case VDIR:
				delay = syncdelay / 3;
				break;
			case VBLK:
				if (newvp->v_specmountpoint != NULL) {
					delay = syncdelay / 2;
					break;
				}
				/* fall through */
			default:
				delay = syncdelay;
			}
			vn_syncer_add_to_worklist(newvp, delay);
		}
	}
	bufinsvn(bp, listheadp);
}
2003