/*	$OpenBSD: vfs_subr.c,v 1.30 1998/12/28 19:35:35 art Exp $	*/
/*	$NetBSD: vfs_subr.c,v 1.53 1996/04/22 01:39:13 christos Exp $	*/

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * External virtual filesystem routines
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/time.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/syscallargs.h>

#include <vm/vm.h>
#include <sys/sysctl.h>

#include <miscfs/specfs/specdev.h>

enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int	vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

int doforce = 1;		/* 1 => permit forcible unmounting */
int prtactive = 0;		/* 1 => print out reclaim of active vnodes */
int suid_clear = 1;		/* 1 => clear SUID / SGID on owner change */

/*
 * Insq/Remq for the vnode usage lists.
 */
#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define	bufremvn(bp) {							\
	LIST_REMOVE(bp, b_vnbufs);					\
	(bp)->b_vnbufs.le_next = NOLIST;				\
}

struct freelst vnode_hold_list;   /* list of vnodes referencing buffers */
struct freelst vnode_free_list;   /* vnode free list */

struct mntlist mountlist;			/* mounted filesystem list */
struct simplelock mountlist_slock;
static struct simplelock mntid_slock;
struct simplelock mntvnode_slock;
struct simplelock vnode_free_list_slock;
struct simplelock spechash_slock;

void insmntque __P((struct vnode *, struct mount *));
int getdevvp __P((dev_t, struct vnode **, enum vtype));

int vfs_hang_addrlist __P((struct mount *, struct netexport *,
				  struct export_args *));
int vfs_free_netcred __P((struct radix_node *, void *));
void vfs_free_addrlist __P((struct netexport *));
static __inline__ void vputonfreelist __P((struct vnode *));

#ifdef DEBUG
void printlockedvnodes __P((void));
#endif

/*
 * Initialize the vnode management data structures.
 */
void
vntblinit()
{

	simple_lock_init(&mntvnode_slock);
	simple_lock_init(&mntid_slock);
	simple_lock_init(&spechash_slock);
	TAILQ_INIT(&vnode_hold_list);
	TAILQ_INIT(&vnode_free_list);
	simple_lock_init(&vnode_free_list_slock);
	CIRCLEQ_INIT(&mountlist);
	/*
	 * Initialize the filesystem syncer.
	 */
	vn_initialize_syncerd();
}

/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting. Interlock is not released on failure.
 */
int
vfs_busy(mp, flags, interlkp, p)
	struct mount *mp;
	int flags;
	struct simplelock *interlkp;
	struct proc *p;
{
	int lkflags;

	if (mp->mnt_flag & MNT_UNMOUNT) {
		if (flags & LK_NOWAIT)
			return (ENOENT);
		mp->mnt_flag |= MNT_MWAIT;
		if (interlkp)
			simple_unlock(interlkp);
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		sleep((caddr_t)mp, PVFS);
		if (interlkp)
			simple_lock(interlkp);
		return (ENOENT);
	}
	lkflags = LK_SHARED;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy file system.
 */
void
vfs_unbusy(mp, p)
	struct mount *mp;
	struct proc *p;
{
	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(fstypename, devname, mpp)
	char *fstypename;
	char *devname;
	struct mount **mpp;
{
	struct proc *p = curproc;	/* XXX */
	struct vfsconf *vfsp;
	struct mount *mp;

	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	bzero((char *)mp, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
	LIST_INIT(&mp->mnt_vnodelist);
	mp->mnt_vfc = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfc_refcount++;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Find an appropriate filesystem to use for the root. If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 */
int
vfs_mountroot()
{
	struct vfsconf *vfsp;
	extern int (*mountroot)(void);
	int error;

	if (mountroot != NULL)
		return ((*mountroot)());
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL)
			continue;
		if ((error = (*vfsp->vfc_mountroot)()) == 0)
			return (0);
		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	}
	return (ENODEV);
}

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
	     mp = mp->mnt_list.cqe_next) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			simple_unlock(&mountlist_slock);
			return (mp);
		}
	}
	simple_unlock(&mountlist_slock);
	return ((struct mount *)0);
}

/*
 * Get a new unique fsid.
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_short xxxfs_mntid;

	fsid_t tfsid;
	int mtype;

	simple_lock(&mntid_slock);
	mtype = mp->mnt_vfc->vfc_typenum;
	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
	mp->mnt_stat.f_fsid.val[1] = mtype;
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
	tfsid.val[1] = mtype;
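	/*
	 * If the candidate fsid is already in use by a mounted
	 * filesystem, keep bumping the minor number until a free
	 * one is found.
	 */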
	if (mountlist.cqh_first != (void *)&mountlist) {
		while (vfs_getvfs(&tfsid)) {
			tfsid.val[0]++;
			xxxfs_mntid++;
		}
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	simple_unlock(&mntid_slock);
}

/*
 * Make a 'unique' number from a mount type name.
 * Note that this is no longer used for ffs which
 * now has an on-disk filesystem id.
 */
long
makefstype(type)
	char *type;
{
	long rv;

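	/*
	 * Fold the name into a long by shifting the accumulator left
	 * two bits and XORing in each character; for example,
	 * makefstype("ffs") yields 0x78b.
	 */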
	for (rv = 0; *type; type++) {
		rv <<= 2;
		rv ^= *type;
	}
	return rv;
}

/*
 * Set vnode attributes to VNOVAL.
 */
void
vattr_null(vap)
	register struct vattr *vap;
{

	vap->va_type = VNON;
	/* XXX These next two used to be one line, but for a GCC bug. */
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
		vap->va_fsid = vap->va_fileid =
		vap->va_blocksize = vap->va_rdev =
		vap->va_atime.tv_sec = vap->va_atime.tv_nsec =
		vap->va_mtime.tv_sec = vap->va_mtime.tv_nsec =
		vap->va_ctime.tv_sec = vap->va_ctime.tv_nsec =
		vap->va_flags = vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern int (**dead_vnodeop_p) __P((void *));
long numvnodes;

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	int (**vops) __P((void *));
	struct vnode **vpp;
{
	struct proc *p = curproc;			/* XXX */
	struct freelst *listhd;
	static int toggle;
	struct vnode *vp;
#ifdef DIAGNOSTIC
	int s;
#endif

	/*
	 * We must choose whether to allocate a new vnode or recycle an
	 * existing one. The criterion for allocating a new one is that
	 * the total number of vnodes is less than the number desired or
	 * there are no vnodes on either free list. Generally we only
	 * want to recycle vnodes that have no buffers associated with
	 * them, so we look first on the vnode_free_list. If it is empty,
	 * we next consider vnodes with referencing buffers on the
	 * vnode_hold_list. The toggle ensures that half the time we
	 * will recycle a vnode from the vnode_hold_list, and half the
	 * time we will allocate a new one unless the list has grown to
	 * twice the desired size. We are reluctant to recycle vnodes
	 * from the vnode_hold_list because we will lose the identity
	 * of all their referencing buffers.
	 */
	toggle ^= 1;
	if (numvnodes > 2 * desiredvnodes)
		toggle = 0;

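	/*
	 * Note that the comparisons below have the side effect of
	 * leaving listhd pointing at the list we will recycle from:
	 * the vnode_free_list if it is non-empty, otherwise the
	 * vnode_hold_list.
	 */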
	simple_lock(&vnode_free_list_slock);
	if ((numvnodes < desiredvnodes) ||
	    ((TAILQ_FIRST(listhd = &vnode_free_list) == NULL) &&
	     ((TAILQ_FIRST(listhd = &vnode_hold_list) == NULL) || toggle))) {
		simple_unlock(&vnode_free_list_slock);
		vp = (struct vnode *)malloc((u_long)sizeof *vp,
		    M_VNODE, M_WAITOK);
		bzero((char *)vp, sizeof *vp);
		numvnodes++;
	} else {
		for (vp = TAILQ_FIRST(listhd); vp != NULLVP;
		     vp = TAILQ_NEXT(vp, v_freelist)) {
			if (simple_lock_try(&vp->v_interlock))
				break;
		}
		/*
		 * Unless this is a bad time of the month, at most
		 * the first NCPUS items on the free list are
		 * locked, so this is close enough to being empty.
		 */
		if (vp == NULLVP) {
			simple_unlock(&vnode_free_list_slock);
			tablefull("vnode");
			*vpp = 0;
			return (ENFILE);
		}
		if (vp->v_usecount) {
			vprint("free vnode", vp);
			panic("free vnode isn't");
		}

		TAILQ_REMOVE(listhd, vp, v_freelist);
		vp->v_flag &= ~VONFREELIST;

		simple_unlock(&vnode_free_list_slock);
		vp->v_lease = NULL;
		if (vp->v_type != VBAD)
			vgonel(vp, p);
		else
			simple_unlock(&vp->v_interlock);
#ifdef DIAGNOSTIC
		if (vp->v_data) {
			vprint("cleaned vnode", vp);
			panic("cleaned vnode isn't");
		}
		s = splbio();
		if (vp->v_numoutput)
			panic("Clean vnode has pending I/O's");
		splx(s);
#endif
		vp->v_flag = 0;
		vp->v_lastr = 0;
		vp->v_ralen = 0;
		vp->v_maxra = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
	}
	vp->v_type = VNON;
	cache_purge(vp);
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	return (0);
}

/*
 * Move a vnode from one mount queue to another.
 */
void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{
	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) != NULL)
		LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Create a vnode for a block device.
 * Used for root filesystem, argdev, and swap areas.
 * Also used for memory file system special devices.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{

	return (getdevvp(dev, vpp, VBLK));
}

/*
 * Create a vnode for a character device.
 * Used for kernfs and some console handling.
 */
int
cdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{

	return (getdevvp(dev, vpp, VCHR));
}

/*
 * Create a vnode for a device.
 * Used by bdevvp (block device) for root file system etc.,
 * and by cdevvp (character device) for console and kernfs.
 */
int
getdevvp(dev, vpp, type)
	dev_t dev;
	struct vnode **vpp;
	enum vtype type;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (0);
	}
	error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = type;
	if ((nvp = checkalias(vp, dev, NULL)) != 0) {
		vput(vp);
		vp = nvp;
	}
	*vpp = vp;
	return (0);
}

/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device). If such an alias exists, deallocate
 * the existing contents and return the aliased vnode. The
 * caller is responsible for filling it with its new contents.
 */
struct vnode *
checkalias(nvp, nvp_rdev, mp)
	register struct vnode *nvp;
	dev_t nvp_rdev;
	struct mount *mp;
{
	struct proc *p = curproc;
	register struct vnode *vp;
	struct vnode **vpp;

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		return (NULLVP);

	vpp = &speclisth[SPECHASH(nvp_rdev)];
loop:
	simple_lock(&spechash_slock);
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		simple_lock(&vp->v_interlock);
		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&spechash_slock);
			vgonel(vp, p);
			goto loop;
		}
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
			simple_unlock(&spechash_slock);
			goto loop;
		}
		break;
	}

	/*
	 * The common case is handled by the if statement below:
	 * we did not find an unused vnode created by bdevvp().
	 */
	if (vp == NULL || !(vp->v_tag == VT_NON && vp->v_type == VBLK)) {
		MALLOC(nvp->v_specinfo, struct specinfo *,
			sizeof(struct specinfo), M_VNODE, M_WAITOK);
		nvp->v_rdev = nvp_rdev;
		nvp->v_hashchain = vpp;
		nvp->v_specnext = *vpp;
		nvp->v_specmountpoint = NULL;
		nvp->v_speclockf = NULL;
		simple_unlock(&spechash_slock);
		*vpp = nvp;
		if (vp != NULLVP) {
			nvp->v_flag |= VALIASED;
			vp->v_flag |= VALIASED;
			vput(vp);
		}
		return (NULLVP);
	}

	/*
	 * This code is the uncommon case. It is only reached if we
	 * found an alias whose tag is VT_NON and whose type is VBLK,
	 * which means we found a block device that was created using
	 * bdevvp. An example of such a vnode is the root partition
	 * device vnode created in ffs_mountroot.
	 *
	 * The vnodes created by bdevvp should not be aliased (why?).
	 */

	simple_unlock(&spechash_slock);
	VOP_UNLOCK(vp, 0, p);
	simple_lock(&vp->v_interlock);
	vclean(vp, 0, p);
	vp->v_op = nvp->v_op;
	vp->v_tag = nvp->v_tag;
	nvp->v_type = VNON;
	insmntque(vp, mp);
	return (vp);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. The vnode lock bit is set if the
 * vnode is being eliminated in vgone. The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, flags, p)
	struct vnode *vp;
	int flags;
	struct proc *p;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure. Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */
	if ((flags & LK_INTERLOCK) == 0)
		simple_lock(&vp->v_interlock);
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vget", 0);
		return (ENOENT);
	}
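	/*
	 * The vnode is about to gain its first reference, so take
	 * it off whichever free list it is currently on.
	 */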
	if ((vp->v_flag & VONFREELIST) && (vp->v_usecount == 0)) {
		simple_lock(&vnode_free_list_slock);
		if (vp->v_holdcnt > 0)
			TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
		else
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
		vp->v_flag &= ~VONFREELIST;
	}
	vp->v_usecount++;
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
			vp->v_usecount--;
			if (vp->v_usecount == 0)
				vputonfreelist(vp);
			simple_unlock(&vp->v_interlock);
		}
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

#ifdef DIAGNOSTIC
/*
 * Vnode reference.
 */
void
vref(vp)
	struct vnode *vp;
{
	simple_lock(&vp->v_interlock);
	if (vp->v_usecount <= 0)
		panic("vref used where vget required");
	vp->v_usecount++;
	simple_unlock(&vp->v_interlock);
}
#endif /* DIAGNOSTIC */

static __inline__ void
vputonfreelist(vp)
	struct vnode *vp;
{
	struct freelst *lst;

	/*
	 * Insert at tail of LRU list.
	 */
#ifdef DIAGNOSTIC
	if (vp->v_usecount != 0) {
		panic("Use count is not zero!");
	}

	if (vp->v_flag & VONFREELIST) {
		vprint("vnode already on free list: ", vp);
		panic("vnode already on free list");
		return;
	}
#endif

	vp->v_flag |= VONFREELIST;

	simple_lock(&vnode_free_list_slock);

	if (vp->v_holdcnt > 0)
		lst = &vnode_hold_list;
	else
		lst = &vnode_free_list;

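	/*
	 * A VBAD vnode carries no useful identity, so insert it at
	 * the head of the list where it will be recycled first;
	 * everything else goes to the tail, in LRU order.
	 */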
	if (vp->v_type == VBAD)
		TAILQ_INSERT_HEAD(lst, vp, v_freelist);
	else
		TAILQ_INSERT_TAIL(lst, vp, v_freelist);

	simple_unlock(&vnode_free_list_slock);
}

/*
 * vput(), just unlock and vrele()
 */
void
vput(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vput: null vp");
#endif
	simple_lock(&vp->v_interlock);
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		simple_unlock(&vp->v_interlock);
		VOP_UNLOCK(vp, 0, p);
		return;
	}
#ifdef DIAGNOSTIC
	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
		vprint("vput: bad ref count", vp);
		panic("vput: ref cnt");
	}
#endif
	vputonfreelist(vp);

	VOP_INACTIVE(vp, p);

	simple_unlock(&vp->v_interlock);
}

/*
 * Vnode release - use for active VNODES.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vrele: null vp");
#endif
	simple_lock(&vp->v_interlock);
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		simple_unlock(&vp->v_interlock);
		return;
	}
#ifdef DIAGNOSTIC
	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
		vprint("vrele: bad ref count", vp);
		panic("vrele: ref cnt");
	}
#endif
	vputonfreelist(vp);

	if (vn_lock(vp, LK_EXCLUSIVE|LK_INTERLOCK, p) == 0)
		VOP_INACTIVE(vp, p);
}

#ifdef DIAGNOSTIC
/*
 * Page or buffer structure gets a reference.
 */
void
vhold(vp)
	register struct vnode *vp;
{

	/*
	 * If it is on the freelist and the hold count is currently
	 * zero, move it to the hold list.
	 */
	simple_lock(&vp->v_interlock);
	if ((vp->v_flag & VONFREELIST) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
	vp->v_holdcnt++;
	simple_unlock(&vp->v_interlock);
}

/*
 * Page or buffer structure frees a reference.
 */
void
holdrele(vp)
	register struct vnode *vp;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_holdcnt <= 0)
		panic("holdrele: holdcnt");
	vp->v_holdcnt--;
	/*
	 * If it is on the holdlist and the hold count drops to
	 * zero, move it to the free list.
	 */
	if ((vp->v_flag & VONFREELIST) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
	simple_unlock(&vp->v_interlock);
}
#endif /* DIAGNOSTIC */

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If MNT_FORCE is specified, detach any active vnodes
 * that are found.
 */
#ifdef DEBUG
int busyprt = 0;	/* print out busy vnodes */
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif

int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;
	register struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
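		/*
		 * While mntvnode_slock is dropped below, vp may be
		 * recycled onto another mount point; if so, restart
		 * the scan from the beginning.
		 */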
		if (vp->v_mount != mp)
			goto loop;
		nvp = vp->v_mntvnodes.le_next;
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;

		simple_lock(&vp->v_interlock);
		/*
		 * Skip over vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file
		 * vnodes open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * With v_usecount == 0, all we need to do is clear
		 * out the vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}
		/*
		 * If FORCECLOSE is set, forcibly close the vnode.
		 * For block or character devices, revert to an
		 * anonymous device. For all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *)0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DEBUG
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
 * The vnode interlock is held on entry.
 */
void
vclean(vp, flags, p)
	register struct vnode *vp;
	int flags;
	struct proc *p;
{
	int active;

	/*
	 * Check to see if the vnode is in use.
	 * If so we have to reference it before we clean it out
	 * so that its count cannot fall to zero and generate a
	 * race against ourselves to recycle it.
	 */
	if ((active = vp->v_usecount) != 0)
		vp->v_usecount++;

	/*
	 * Prevent the vnode from being recycled or
	 * brought into use while we clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock");
	vp->v_flag |= VXLOCK;

	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);

	/*
	 * Clean out any buffers associated with the vnode.
	 */
	if (flags & DOCLOSE)
		vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0, p);
	}

	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim");
	if (active) {
		simple_lock(&vp->v_interlock);
		vp->v_usecount--;
		if (vp->v_usecount == 0) {
			if (vp->v_holdcnt > 0)
				panic("vclean: not clean");
			vputonfreelist(vp);
		}
		simple_unlock(&vp->v_interlock);
	}
	cache_purge(vp);
	if (vp->v_vnlock) {
		if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
			vprint("vclean: lock not drained", vp);
		FREE(vp->v_vnlock, M_VNODE);
		vp->v_vnlock = NULL;
	}

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	vp->v_flag &= ~VXLOCK;
#ifdef DIAGNOSTIC
	vp->v_flag &= ~VLOCKSWORK;
#endif
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		wakeup((caddr_t)vp);
	}
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct simplelock *inter_lkp;
	struct proc *p;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		if (inter_lkp)
			simple_unlock(inter_lkp);
		vgonel(vp, p);
		return (1);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;

	simple_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 */
void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	register struct vnode *vq;
	struct vnode *vx;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vgone", 0);
		return;
	}
	/*
	 * Clean out the filesystem specific data.
	 */
	vclean(vp, DOCLOSE, p);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
		simple_lock(&spechash_slock);
		if (*vp->v_hashchain == vp) {
			*vp->v_hashchain = vp->v_specnext;
		} else {
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_specnext != vp)
					continue;
				vq->v_specnext = vp->v_specnext;
				break;
			}
			if (vq == NULL)
				panic("missing bdev");
		}
		if (vp->v_flag & VALIASED) {
			vx = NULL;
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type)
					continue;
				if (vx)
					break;
				vx = vq;
			}
			if (vx == NULL)
				panic("missing alias");
			if (vq == NULL)
				vx->v_flag &= ~VALIASED;
			vp->v_flag &= ~VALIASED;
		}
		simple_unlock(&spechash_slock);
		FREE(vp->v_specinfo, M_VNODE);
		vp->v_specinfo = NULL;
	}
	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list.
	 */
	vp->v_type = VBAD;

	if ((vp->v_flag & VONFREELIST) &&
	    vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		if (vp->v_holdcnt > 0)
			panic("vgonel: not clean");
		if (TAILQ_FIRST(&vnode_free_list) != vp) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		}
		simple_unlock(&vnode_free_list_slock);
	}
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	register struct vnode *vp;
	int rc = 0;

	simple_lock(&spechash_slock);
	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		*vpp = vp;
		rc = 1;
		break;
	}
	simple_unlock(&spechash_slock);
	return (rc);
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vp)
	struct vnode *vp;
{
	struct vnode *vq, *vnext;
	int count;

loop:
	if ((vp->v_flag & VALIASED) == 0)
		return (vp->v_usecount);
	simple_lock(&spechash_slock);
	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
		vnext = vq->v_specnext;
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vq->v_usecount == 0 && vq != vp) {
			simple_unlock(&spechash_slock);
			vgone(vq);
			goto loop;
		}
		count += vq->v_usecount;
	}
	simple_unlock(&spechash_slock);
	return (count);
}

/*
 * Print out a description of a vnode.
 */
static char *typename[] =
   { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };

void
vprint(label, vp)
	char *label;
	register struct vnode *vp;
{
	char buf[64];

	if (label != NULL)
		printf("%s: ", label);
	printf("type %s, usecount %d, writecount %d, holdcount %ld,",
		typename[vp->v_type], vp->v_usecount, vp->v_writecount,
		vp->v_holdcnt);
	buf[0] = '\0';
	if (vp->v_flag & VROOT)
		strcat(buf, "|VROOT");
	if (vp->v_flag & VTEXT)
		strcat(buf, "|VTEXT");
	if (vp->v_flag & VSYSTEM)
		strcat(buf, "|VSYSTEM");
	if (vp->v_flag & VXLOCK)
		strcat(buf, "|VXLOCK");
	if (vp->v_flag & VXWANT)
		strcat(buf, "|VXWANT");
	if (vp->v_flag & VBWAIT)
		strcat(buf, "|VBWAIT");
	if (vp->v_flag & VALIASED)
		strcat(buf, "|VALIASED");
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DEBUG
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
void
printlockedvnodes()
{
	struct proc *p = curproc;
	register struct mount *mp, *nmp;
	register struct vnode *vp;

	printf("Locked vnodes\n");
	simple_lock(&mountlist_slock);
	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
	     mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = CIRCLEQ_NEXT(mp, mnt_list);
			continue;
		}
		for (vp = mp->mnt_vnodelist.lh_first;
		     vp != NULL;
		     vp = vp->v_mntvnodes.le_next) {
			if (VOP_ISLOCKED(vp))
				vprint((char *)0, vp);
		}
		simple_lock(&mountlist_slock);
		nmp = CIRCLEQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);
}
#endif

/*
 * Top level filesystem related information gathering.
 */
int
vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
	int *name;
	u_int namelen;
	void *oldp;
	size_t *oldlenp;
	void *newp;
	size_t newlen;
	struct proc *p;
{
	struct vfsconf *vfsp;

	/* all sysctl names at this level are at least name and field */
	if (namelen < 2)
		return (ENOTDIR);		/* overloaded */
	if (name[0] != VFS_GENERIC) {
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[0])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
		    oldp, oldlenp, newp, newlen, p));
	}
	switch (name[1]) {
	case VFS_MAXTYPENUM:
		return (sysctl_rdint(oldp, oldlenp, newp, maxvfsconf));
	case VFS_CONF:
		if (namelen < 3)
			return (ENOTDIR);	/* overloaded */
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[2])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return (sysctl_rdstruct(oldp, oldlenp, newp, vfsp,
		    sizeof(struct vfsconf)));
	}
	return (EOPNOTSUPP);
}

int kinfo_vdebug = 1;
int kinfo_vgetfailed;
#define KINFO_VNODESLOP	10
/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
/* ARGSUSED */
int
sysctl_vnode(where, sizep, p)
	char *where;
	size_t *sizep;
	struct proc *p;
{
	register struct mount *mp, *nmp;
	struct vnode *vp, *nvp;
	register char *bp = where, *savebp;
	char *ewhere;
	int error;

#define VPTRSZ	sizeof (struct vnode *)
#define VNODESZ	sizeof (struct vnode)
	if (where == NULL) {
		*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
		return (0);
	}
	ewhere = where + *sizep;

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = mp->mnt_list.cqe_next;
			continue;
		}
		savebp = bp;
again:
		for (vp = mp->mnt_vnodelist.lh_first;
		     vp != NULL;
		     vp = nvp) {
			/*
			 * Check that the vp is still associated with
			 * this filesystem.  RACE: could have been
			 * recycled onto the same filesystem.
			 */
			if (vp->v_mount != mp) {
				simple_unlock(&mntvnode_slock);
				if (kinfo_vdebug)
					printf("kinfo: vp changed\n");
				bp = savebp;
				goto again;
			}
			nvp = vp->v_mntvnodes.le_next;
			if (bp + VPTRSZ + VNODESZ > ewhere) {
				simple_unlock(&mntvnode_slock);
				*sizep = bp - where;
				return (ENOMEM);
			}
			if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) ||
			   (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ)))
				return (error);
			bp += VPTRSZ + VNODESZ;
			simple_lock(&mntvnode_slock);
		}
		simple_unlock(&mntvnode_slock);
		simple_lock(&mountlist_slock);
		nmp = mp->mnt_list.cqe_next;
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);

	*sizep = bp - where;
	return (0);
}

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(vp)
	register struct vnode *vp;
{
	register struct vnode *vq;
	int error = 0;

	if (vp->v_specmountpoint != NULL)
		return (EBUSY);
	if (vp->v_flag & VALIASED) {
		simple_lock(&spechash_slock);
		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
			if (vq->v_rdev != vp->v_rdev ||
			    vq->v_type != vp->v_type)
				continue;
			if (vq->v_specmountpoint != NULL) {
				error = EBUSY;
				break;
			}
		}
		simple_unlock(&spechash_slock);
	}
	return (error);
}

/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by ufs_mount() to set up the lists of export addresses.
 */
int
vfs_hang_addrlist(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	register int i;
	struct radix_node *rn;
	struct sockaddr *saddr, *smask = 0;
	struct domain *dom;
	int error;

	if (argp->ex_addrlen == 0) {
		if (mp->mnt_flag & MNT_DEFEXPORTED)
			return (EPERM);
		np = &nep->ne_defexported;
		np->netc_exflags = argp->ex_flags;
		np->netc_anon = argp->ex_anon;
		np->netc_anon.cr_ref = 1;
		mp->mnt_flag |= MNT_DEFEXPORTED;
		return (0);
	}
	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	np = (struct netcred *)malloc(i, M_NETADDR, M_WAITOK);
	bzero((caddr_t)np, i);
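	/*
	 * The export address (and the mask, if any) are stored in the
	 * same allocation, immediately after the netcred structure.
	 */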
	saddr = (struct sockaddr *)(np + 1);
	error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen);
	if (error)
		goto out;
	if (saddr->sa_len > argp->ex_addrlen)
		saddr->sa_len = argp->ex_addrlen;
	if (argp->ex_masklen) {
		smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
		error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen);
		if (error)
			goto out;
		if (smask->sa_len > argp->ex_masklen)
			smask->sa_len = argp->ex_masklen;
	}
	i = saddr->sa_family;
	if ((rnh = nep->ne_rtable[i]) == 0) {
		/*
		 * Seems silly to initialize every AF when most are not
		 * used, do so on demand here.
		 */
		for (dom = domains; dom; dom = dom->dom_next)
			if (dom->dom_family == i && dom->dom_rtattach) {
				dom->dom_rtattach((void **)&nep->ne_rtable[i],
					dom->dom_rtoffset);
				break;
			}
		if ((rnh = nep->ne_rtable[i]) == 0) {
			error = ENOBUFS;
			goto out;
		}
	}
	rn = (*rnh->rnh_addaddr)((caddr_t)saddr, (caddr_t)smask, rnh,
		np->netc_rnodes);
	if (rn == 0 || np != (struct netcred *)rn) { /* already exists */
		error = EPERM;
		goto out;
	}
	np->netc_exflags = argp->ex_flags;
	np->netc_anon = argp->ex_anon;
	np->netc_anon.cr_ref = 1;
	return (0);
out:
	free(np, M_NETADDR);
	return (error);
}

/* ARGSUSED */
int
vfs_free_netcred(rn, w)
	struct radix_node *rn;
	void *w;
{
	register struct radix_node_head *rnh = (struct radix_node_head *)w;

	(*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh);
	free((caddr_t)rn, M_NETADDR);
	return (0);
}

/*
 * Free the net address hash lists that are hanging off the mount points.
 */
void
vfs_free_addrlist(nep)
	struct netexport *nep;
{
	register int i;
	register struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if ((rnh = nep->ne_rtable[i]) != NULL) {
			(*rnh->rnh_walktree)(rnh, vfs_free_netcred, rnh);
			free((caddr_t)rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

int
vfs_export(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if ((error = vfs_hang_addrlist(mp, nep, argp)) != 0)
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}

struct netcred *
vfs_export_lookup(mp, nep, nam)
	register struct mount *mp;
	struct netexport *nep;
	struct mbuf *nam;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
		 */
		if (nam != NULL) {
			saddr = mtod(nam, struct sockaddr *);
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
					(*rnh->rnh_matchaddr)((caddr_t)saddr,
							      rnh);
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}

/*
 * Do the usual access checking.
 * file_mode, uid and gid are from the vnode in question,
 * while acc_mode and cred are from the VOP_ACCESS parameter list.
 */
int
vaccess(file_mode, uid, gid, acc_mode, cred)
	mode_t file_mode;
	uid_t uid;
	gid_t gid;
	mode_t acc_mode;
	struct ucred *cred;
{
	mode_t mask;

	/* User id 0 always gets access. */
	if (cred->cr_uid == 0)
		return 0;

	mask = 0;

	/* Otherwise, check the owner. */
	if (cred->cr_uid == uid) {
		if (acc_mode & VEXEC)
			mask |= S_IXUSR;
		if (acc_mode & VREAD)
			mask |= S_IRUSR;
		if (acc_mode & VWRITE)
			mask |= S_IWUSR;
		return (file_mode & mask) == mask ? 0 : EACCES;
	}

	/* Otherwise, check the groups. */
	if (cred->cr_gid == gid || groupmember(gid, cred)) {
		if (acc_mode & VEXEC)
			mask |= S_IXGRP;
		if (acc_mode & VREAD)
			mask |= S_IRGRP;
		if (acc_mode & VWRITE)
			mask |= S_IWGRP;
		return (file_mode & mask) == mask ? 0 : EACCES;
	}

	/* Otherwise, check everyone else. */
	if (acc_mode & VEXEC)
		mask |= S_IXOTH;
	if (acc_mode & VREAD)
		mask |= S_IROTH;
	if (acc_mode & VWRITE)
		mask |= S_IWOTH;
	return (file_mode & mask) == mask ? 0 : EACCES;
}

/*
 * Unmount all file systems.
 * We traverse the list in reverse order under the assumption that doing so
 * will avoid needing to worry about dependencies.
 */
void
vfs_unmountall()
{
	register struct mount *mp, *nmp;
	int allerror, error, again = 1;

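	/*
	 * Try to unmount everything, in reverse mount order; if any
	 * filesystem fails to unmount, make one more pass over the
	 * list.
	 */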
retry:
	for (allerror = 0,
	     mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
		nmp = mp->mnt_list.cqe_prev;
		if ((error = dounmount(mp, MNT_FORCE, curproc)) != 0) {
			printf("unmount of %s failed with error %d\n",
			    mp->mnt_stat.f_mntonname, error);
			allerror = 1;
		}
	}

	if (allerror) {
		printf("WARNING: some file systems would not unmount\n");
		if (again) {
			printf("retrying\n");
			again = 0;
			goto retry;
		}
	}
}

/*
 * Sync and unmount file systems before shutting down.
 */
void
vfs_shutdown()
{
	register struct buf *bp;
	int iter, nbusy;

	/* XXX Should suspend scheduling. */
	(void) spl0();

	printf("syncing disks... ");

	if (panicstr == 0) {
		/* Release inodes held by texts before update. */
		vnode_pager_umount(NULL);
#ifdef notdef
		vnshutdown();
#endif

		/* Sync before unmount, in case we hang on something. */
		sys_sync(&proc0, (void *)0, (register_t *)0);

		/* Unmount file systems. */
		vfs_unmountall();
	}

	/* Sync again after unmount, just in case. */
	sys_sync(&proc0, (void *)0, (register_t *)0);

	/* Wait for sync to finish. */
	for (iter = 0; iter < 20; iter++) {
		nbusy = 0;
		for (bp = &buf[nbuf]; --bp >= buf; )
			if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
				nbusy++;
		if (nbusy == 0)
			break;
		printf("%d ", nbusy);
		DELAY(40000 * iter);
	}
	if (nbusy)
		printf("giving up\n");
	else
		printf("done\n");
}

/*
 * posix file system related system variables.
 */
int
fs_posix_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
	int *name;
	u_int namelen;
	void *oldp;
	size_t *oldlenp;
	void *newp;
	size_t newlen;
	struct proc *p;
{
	/* all sysctl names at this level are terminal */
	if (namelen != 1)
		return (ENOTDIR);

	switch (name[0]) {
	case FS_POSIX_SETUID:
		if (newp && securelevel > 0)
			return (EPERM);
		return (sysctl_int(oldp, oldlenp, newp, newlen, &suid_clear));
	default:
		return (EOPNOTSUPP);
	}
	/* NOTREACHED */
}

/*
 * file system related system variables.
 */
int
fs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
	int *name;
	u_int namelen;
	void *oldp;
	size_t *oldlenp;
	void *newp;
	size_t newlen;
	struct proc *p;
{
	sysctlfn *fn;

	switch (name[0]) {
	case FS_POSIX:
		fn = fs_posix_sysctl;
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (*fn)(name + 1, namelen - 1, oldp, oldlenp, newp, newlen, p);
}

/*
 * Routines dealing with vnodes and buffers
 */

/*
 * Update outstanding I/O count and do wakeup if requested.
 *
 * Manipulates v_numoutput. Must be called at splbio().
 */
void
vwakeup(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp) != NULL) {
		if (--vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t)&vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;

	if (flags & V_SAVE) {
		s = splbio();
		while (vp->v_numoutput) {
			vp->v_flag |= VBWAIT;
			sleep((caddr_t)&vp->v_numoutput, PRIBIO + 1);
		}
		if (vp->v_dirtyblkhd.lh_first != NULL) {
			splx(s);
			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
				return (error);
			s = splbio();
			if (vp->v_numoutput > 0 ||
			    vp->v_dirtyblkhd.lh_first != NULL)
				panic("vinvalbuf: dirty bufs");
		}
		splx(s);
	}
loop:
	s = splbio();
	for (;;) {
		if ((blist = vp->v_cleanblkhd.lh_first) &&
		    (flags & V_SAVEMETA))
			while (blist && blist->b_lblkno < 0)
				blist = blist->b_vnbufs.le_next;
		if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
		    (flags & V_SAVEMETA))
			while (blist && blist->b_lblkno < 0)
				blist = blist->b_vnbufs.le_next;
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = bp->b_vnbufs.le_next;
			if (flags & V_SAVEMETA && bp->b_lblkno < 0)
				continue;
			if (bp->b_flags & B_BUSY) {
				bp->b_flags |= B_WANTED;
				error = tsleep((caddr_t)bp,
					slpflag | (PRIBIO + 1), "vinvalbuf",
					slptimeo);
				if (error) {
					splx(s);
					return (error);
				}
				break;
			}
			bp->b_flags |= B_BUSY | B_VFLUSH;
			/*
			 * XXX Since there are no node locks for NFS, I believe
			 * there is a slight chance that a delayed write will
			 * occur while sleeping just above, so check for it.
			 */
			if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
				splx(s);
				(void) VOP_BWRITE(bp);
				goto loop;
			}
			bp->b_flags |= B_INVAL;
			brelse(bp);
		}
	}
	if (!(flags & V_SAVEMETA) &&
	    (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first))
		panic("vinvalbuf: flush failed");
	splx(s);
	return (0);
}

void
vflushbuf(vp, sync)
	register struct vnode *vp;
	int sync;
{
	register struct buf *bp, *nbp;
	int s;

loop:
	s = splbio();
	for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) {
		nbp = bp->b_vnbufs.le_next;
		if (bp->b_flags & B_BUSY)
			continue;
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("vflushbuf: not dirty");
		bp->b_flags |= B_BUSY | B_VFLUSH;
		splx(s);
		/*
		 * Wait for I/O associated with indirect blocks to complete,
		 * since there is no way to quickly wait for them below.
		 */
		if (bp->b_vp == vp || sync == 0)
			(void) bawrite(bp);
		else
			(void) bwrite(bp);
		goto loop;
	}
	if (sync == 0) {
		splx(s);
		return;
	}
	while (vp->v_numoutput) {
		vp->v_flag |= VBWAIT;
		tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "vflushbuf", 0);
	}
	splx(s);
	if (vp->v_dirtyblkhd.lh_first != NULL) {
		vprint("vflushbuf: dirty", vp);
		goto loop;
	}
}

/*
 * Associate a buffer with a vnode.
 *
 * Manipulates buffer vnode queues. Must be called at splbio().
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{

	if (bp->b_vp)
		panic("bgetvp: not free");
	VHOLD(vp);
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	bufinsvn(bp, &vp->v_cleanblkhd);
}

/*
 * Disassociate a buffer from a vnode.
 *
 * Manipulates vnode buffer queues. Must be called at splbio().
 */
void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;
	struct buf *wasdirty;

	if ((vp = bp->b_vp) == (struct vnode *) 0)
		panic("brelvp: NULL");
	/*
	 * Delete from old vnode list, if on one.
	 */
	wasdirty = vp->v_dirtyblkhd.lh_first;
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
	if (wasdirty && LIST_FIRST(&vp->v_dirtyblkhd) == NULL)
		LIST_REMOVE(vp, v_synclist);
	bp->b_vp = (struct vnode *) 0;
	HOLDRELE(vp);
}

/*
 * Reassign a buffer from one vnode to another. Used to assign buffers
 * to the appropriate clean or dirty list and to add newly dirty vnodes
 * to the appropriate filesystem syncer list.
 *
 * Manipulates vnode buffer queues. Must be called at splbio().
 */
void
reassignbuf(bp, newvp)
	register struct buf *bp;
	register struct vnode *newvp;
{
	struct buflists *listheadp;
	struct buf *wasdirty;
	int delay;

	if (newvp == NULL) {
		printf("reassignbuf: NULL\n");
		return;
	}
	/*
	 * Delete from old vnode list, if on one.
	 */
	wasdirty = newvp->v_dirtyblkhd.lh_first;
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
	/*
	 * If dirty, put on list of dirty buffers;
	 * otherwise insert onto list of clean buffers.
	 */
	if ((bp->b_flags & B_DELWRI) == 0) {
		listheadp = &newvp->v_cleanblkhd;
		if (wasdirty && LIST_FIRST(&newvp->v_dirtyblkhd) == NULL)
			LIST_REMOVE(newvp, v_synclist);
	} else {
		listheadp = &newvp->v_dirtyblkhd;
		if (LIST_FIRST(listheadp) == NULL) {
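			/*
			 * The vnode just acquired its first dirty buffer,
			 * so schedule it with the syncer; directories and
			 * mounted block devices get shorter delays so
			 * their data reaches disk sooner.
			 */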
			switch (newvp->v_type) {
			case VDIR:
				delay = syncdelay / 3;
				break;
			case VBLK:
				if (newvp->v_specmountpoint != NULL) {
					delay = syncdelay / 2;
					break;
				}
				/* fall through */
			default:
				delay = syncdelay;
			}
			vn_syncer_add_to_worklist(newvp, delay);
		}
	}
	bufinsvn(bp, listheadp);
}