/*	$OpenBSD: vfs_subr.c,v 1.24 1998/11/12 04:30:02 csapuntz Exp $	*/
/*	$NetBSD: vfs_subr.c,v 1.53 1996/04/22 01:39:13 christos Exp $	*/

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

/*
 * External virtual filesystem routines
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/time.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/syscallargs.h>

#include <vm/vm.h>
#include <sys/sysctl.h>

#include <miscfs/specfs/specdev.h>

enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int	vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

int doforce = 1;		/* 1 => permit forcible unmounting */
int prtactive = 0;		/* 1 => print out reclaim of active vnodes */
int suid_clear = 1;		/* 1 => clear SUID / SGID on owner change */

/*
 * Insq/Remq for the vnode usage lists.
 */
#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define	bufremvn(bp) {							\
	LIST_REMOVE(bp, b_vnbufs);					\
	(bp)->b_vnbufs.le_next = NOLIST;				\
}

struct freelst vnode_hold_list;   /* list of vnodes referencing buffers */
struct freelst vnode_free_list;   /* vnode free list */

struct mntlist mountlist;			/* mounted filesystem list */
struct simplelock mountlist_slock;
static struct simplelock mntid_slock;
struct simplelock mntvnode_slock;
struct simplelock vnode_free_list_slock;
struct simplelock spechash_slock;

void insmntque __P((struct vnode *, struct mount *));
int getdevvp __P((dev_t, struct vnode **, enum vtype));

int vfs_hang_addrlist __P((struct mount *, struct netexport *,
				  struct export_args *));
int vfs_free_netcred __P((struct radix_node *, void *));
void vfs_free_addrlist __P((struct netexport *));
static __inline__ void vputonfreelist __P((struct vnode *));

#ifdef DEBUG
void printlockedvnodes __P((void));
#endif

/*
 * Initialize the vnode management data structures.
 */
void
vntblinit()
{

	simple_lock_init(&mntvnode_slock);
	simple_lock_init(&mntid_slock);
	simple_lock_init(&spechash_slock);
	TAILQ_INIT(&vnode_hold_list);
	TAILQ_INIT(&vnode_free_list);
	simple_lock_init(&vnode_free_list_slock);
	CIRCLEQ_INIT(&mountlist);
	/*
	 * Initialize the filesystem syncer.
	 */
	vn_initialize_syncerd();
}

/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting. Interlock is not released on failure.
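 *
 * A typical caller pattern (a sketch; cf. printlockedvnodes() below):
 *
 *	if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p) == 0) {
 *		...operate on mp...
 *		vfs_unbusy(mp, p);
 *	}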
 */
int
vfs_busy(mp, flags, interlkp, p)
	struct mount *mp;
	int flags;
	struct simplelock *interlkp;
	struct proc *p;
{
	int lkflags;

	if (mp->mnt_flag & MNT_UNMOUNT) {
		if (flags & LK_NOWAIT)
			return (ENOENT);
		mp->mnt_flag |= MNT_MWAIT;
		if (interlkp)
			simple_unlock(interlkp);
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		sleep((caddr_t)mp, PVFS);
		if (interlkp)
			simple_lock(interlkp);
		return (ENOENT);
	}
	lkflags = LK_SHARED;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy file system
 */
void
vfs_unbusy(mp, p)
	struct mount *mp;
	struct proc *p;
{
	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(fstypename, devname, mpp)
	char *fstypename;
	char *devname;
	struct mount **mpp;
{
	struct proc *p = curproc;	/* XXX */
	struct vfsconf *vfsp;
	struct mount *mp;

	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	bzero((char *)mp, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
	LIST_INIT(&mp->mnt_vnodelist);
	mp->mnt_vfc = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfc_refcount++;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Find an appropriate filesystem to use for the root. If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines until one works or we have
 * tried them all.
 */
int
vfs_mountroot()
{
	struct vfsconf *vfsp;
	extern int (*mountroot)(void);
	int error;

	if (mountroot != NULL)
		return ((*mountroot)());
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL)
			continue;
		if ((error = (*vfsp->vfc_mountroot)()) == 0)
			return (0);
		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	}
	return (ENODEV);
}

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
	     mp = mp->mnt_list.cqe_next) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			simple_unlock(&mountlist_slock);
			return (mp);
		}
	}
	simple_unlock(&mountlist_slock);
	return ((struct mount *)0);
}

/*
 * Get a new unique fsid
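 *
 * The fsid pairs a pseudo device number, allocated just above the
 * real block device majors, with the filesystem type number.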
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_short xxxfs_mntid;

	fsid_t tfsid;
	int mtype;

	simple_lock(&mntid_slock);
	mtype = mp->mnt_vfc->vfc_typenum;
	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
	mp->mnt_stat.f_fsid.val[1] = mtype;
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
	tfsid.val[1] = mtype;
	if (mountlist.cqh_first != (void *)&mountlist) {
		while (vfs_getvfs(&tfsid)) {
			tfsid.val[0]++;
			xxxfs_mntid++;
		}
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	simple_unlock(&mntid_slock);
}

/*
 * Make a 'unique' number from a mount type name.
 * Note that this is no longer used for ffs which
 * now has an on-disk filesystem id.
 */
long
makefstype(type)
	char *type;
{
	long rv;

	for (rv = 0; *type; type++) {
		rv <<= 2;
		rv ^= *type;
	}
	return rv;
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
	register struct vattr *vap;
{

	vap->va_type = VNON;
	/* XXX These next two used to be one line, but for a GCC bug. */
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
		vap->va_fsid = vap->va_fileid =
		vap->va_blocksize = vap->va_rdev =
		vap->va_atime.tv_sec = vap->va_atime.tv_nsec =
		vap->va_mtime.tv_sec = vap->va_mtime.tv_nsec =
		vap->va_ctime.tv_sec = vap->va_ctime.tv_nsec =
		vap->va_flags = vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern int (**dead_vnodeop_p) __P((void *));
long numvnodes;

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	int (**vops) __P((void *));
	struct vnode **vpp;
{
	struct proc *p = curproc;			/* XXX */
	struct freelst *listhd;
	static int toggle;
	struct vnode *vp;
#ifdef DIAGNOSTIC
	int s;
#endif

	/*
	 * We must choose whether to allocate a new vnode or recycle an
	 * existing one. The criterion for allocating a new one is that
	 * the total number of vnodes is less than the number desired or
	 * there are no vnodes on either free list. Generally we only
	 * want to recycle vnodes that have no buffers associated with
	 * them, so we look first on the vnode_free_list. If it is empty,
	 * we next consider vnodes with referencing buffers on the
	 * vnode_hold_list. The toggle ensures that half the time we
	 * will use a buffer from the vnode_hold_list, and half the time
	 * we will allocate a new one unless the list has grown to twice
	 * the desired size. We are reluctant to recycle vnodes from the
	 * vnode_hold_list because we will lose the identity of all their
	 * referencing buffers.
	 */
	toggle ^= 1;
	if (numvnodes > 2 * desiredvnodes)
		toggle = 0;

	simple_lock(&vnode_free_list_slock);
	if ((numvnodes < desiredvnodes) ||
	    ((TAILQ_FIRST(listhd = &vnode_free_list) == NULL) &&
	     ((TAILQ_FIRST(listhd = &vnode_hold_list) == NULL) || toggle))) {
		simple_unlock(&vnode_free_list_slock);
		vp = (struct vnode *)malloc((u_long)sizeof *vp,
		    M_VNODE, M_WAITOK);
		bzero((char *)vp, sizeof *vp);
		numvnodes++;
	} else {
		for (vp = TAILQ_FIRST(listhd); vp != NULLVP;
		     vp = TAILQ_NEXT(vp, v_freelist)) {
			if (simple_lock_try(&vp->v_interlock))
				break;
		}
		/*
		 * Unless this is a bad time of the month, at most
		 * the first NCPUS items on the free list are
		 * locked, so this is close enough to being empty.
		 */
		if (vp == NULLVP) {
			simple_unlock(&vnode_free_list_slock);
			tablefull("vnode");
			*vpp = 0;
			return (ENFILE);
		}
		if (vp->v_usecount) {
			vprint("free vnode", vp);
			panic("free vnode isn't");
		}

		TAILQ_REMOVE(listhd, vp, v_freelist);
		vp->v_flag &= ~VONFREELIST;

		simple_unlock(&vnode_free_list_slock);
		vp->v_lease = NULL;
		if (vp->v_type != VBAD)
			vgonel(vp, p);
		else
			simple_unlock(&vp->v_interlock);
#ifdef DIAGNOSTIC
		if (vp->v_data) {
			vprint("cleaned vnode", vp);
			panic("cleaned vnode isn't");
		}
		s = splbio();
		if (vp->v_numoutput)
			panic("Clean vnode has pending I/O's");
		splx(s);
#endif
		vp->v_flag = 0;
		vp->v_lastr = 0;
		vp->v_ralen = 0;
		vp->v_maxra = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
	}
	vp->v_type = VNON;
	cache_purge(vp);
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	return (0);
}

/*
 * Move a vnode from one mount queue to another.
 */
void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{
	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) != NULL)
		LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Create a vnode for a block device.
 * Used for root filesystem, argdev, and swap areas.
 * Also used for memory file system special devices.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{

	return (getdevvp(dev, vpp, VBLK));
}

/*
 * Create a vnode for a character device.
 * Used for kernfs and some console handling.
 */
int
cdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{

	return (getdevvp(dev, vpp, VCHR));
}

/*
 * Create a vnode for a device.
 * Used by bdevvp (block device) for root file system etc.,
 * and by cdevvp (character device) for console and kernfs.
 */
int
getdevvp(dev, vpp, type)
	dev_t dev;
	struct vnode **vpp;
	enum vtype type;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (0);
	}
	error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = type;
	if ((nvp = checkalias(vp, dev, NULL)) != 0) {
		vput(vp);
		vp = nvp;
	}
	*vpp = vp;
	return (0);
}

/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device). If such an alias exists, deallocate
 * the existing contents and return the aliased vnode. The
 * caller is responsible for filling it with its new contents.
 */
struct vnode *
checkalias(nvp, nvp_rdev, mp)
	register struct vnode *nvp;
	dev_t nvp_rdev;
	struct mount *mp;
{
	struct proc *p = curproc;
	register struct vnode *vp;
	struct vnode **vpp;

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		return (NULLVP);

	vpp = &speclisth[SPECHASH(nvp_rdev)];
loop:
	simple_lock(&spechash_slock);
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		simple_lock(&vp->v_interlock);
		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&spechash_slock);
			vgonel(vp, p);
			goto loop;
		}
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
			simple_unlock(&spechash_slock);
			goto loop;
		}
		break;
	}

	/*
	 * Common case is actually in the if statement
	 */
	if (vp == NULL || !(vp->v_tag == VT_NON && vp->v_type == VBLK)) {
		MALLOC(nvp->v_specinfo, struct specinfo *,
			sizeof(struct specinfo), M_VNODE, M_WAITOK);
		nvp->v_rdev = nvp_rdev;
		nvp->v_hashchain = vpp;
		nvp->v_specnext = *vpp;
		nvp->v_specmountpoint = NULL;
		nvp->v_speclockf = NULL;
		simple_unlock(&spechash_slock);
		*vpp = nvp;
		if (vp != NULLVP) {
			nvp->v_flag |= VALIASED;
			vp->v_flag |= VALIASED;
			vput(vp);
		}
		return (NULLVP);
	}

	/*
	 * This code handles the uncommon case: we found an alias
	 * that is VT_NON with a vtype of VBLK. This means we found
	 * a block device that was created using bdevvp.
	 * An example of such a vnode is the root partition device vnode
	 * created in ffs_mountroot.
	 *
	 * The vnodes created by bdevvp should not be aliased (why?).
	 */

	simple_unlock(&spechash_slock);
	VOP_UNLOCK(vp, 0, p);
	simple_lock(&vp->v_interlock);
	vclean(vp, 0, p);
	vp->v_op = nvp->v_op;
	vp->v_tag = nvp->v_tag;
	nvp->v_type = VNON;
	insmntque(vp, mp);
	return (vp);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. The vnode lock bit is set while the
 * vnode is being eliminated in vgone. The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, flags, p)
	struct vnode *vp;
	int flags;
	struct proc *p;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure. Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */
	if ((flags & LK_INTERLOCK) == 0)
		simple_lock(&vp->v_interlock);
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vget", 0);
		return (ENOENT);
	}
	if ((vp->v_flag & VONFREELIST) && (vp->v_usecount == 0)) {
		simple_lock(&vnode_free_list_slock);
		if (vp->v_holdcnt > 0)
			TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
		else
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
		vp->v_flag &= ~VONFREELIST;
	}
	vp->v_usecount++;
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
			vp->v_usecount--;
			if (vp->v_usecount == 0)
				vputonfreelist(vp);
			simple_unlock(&vp->v_interlock);
		}
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

#ifdef DIAGNOSTIC
/*
 * Vnode reference.
 */
void
vref(vp)
	struct vnode *vp;
{
	simple_lock(&vp->v_interlock);
	if (vp->v_usecount <= 0)
		panic("vref used where vget required");
	vp->v_usecount++;
	simple_unlock(&vp->v_interlock);
}
#endif /* DIAGNOSTIC */

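/*
 * Put an unreferenced vnode back on the appropriate free list:
 * the hold list if buffers still reference it, the free list
 * otherwise. VBAD vnodes go to the head, all others to the tail.
 */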
static __inline__ void
vputonfreelist(vp)
	struct vnode *vp;
{
	struct freelst *lst;

	/*
	 * insert at tail of LRU list
	 */
#ifdef DIAGNOSTIC
	if (vp->v_usecount != 0) {
		panic("Use count is not zero!");
	}

	if (vp->v_flag & VONFREELIST) {
		vprint("vnode already on free list: ", vp);
		panic("vnode already on free list");
		return;
	}
#endif

	vp->v_flag |= VONFREELIST;

	simple_lock(&vnode_free_list_slock);

	if (vp->v_holdcnt > 0)
		lst = &vnode_hold_list;
	else
		lst = &vnode_free_list;

	if (vp->v_type == VBAD)
		TAILQ_INSERT_HEAD(lst, vp, v_freelist);
	else
		TAILQ_INSERT_TAIL(lst, vp, v_freelist);

	simple_unlock(&vnode_free_list_slock);
}

/*
 * vput(), just unlock and vrele()
 */
void
vput(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vput: null vp");
#endif
	simple_lock(&vp->v_interlock);
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		simple_unlock(&vp->v_interlock);
		VOP_UNLOCK(vp, 0, p);
		return;
	}
#ifdef DIAGNOSTIC
	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
		vprint("vput: bad ref count", vp);
		panic("vput: ref cnt");
	}
#endif
	vputonfreelist(vp);

	VOP_INACTIVE(vp, p);
	simple_unlock(&vp->v_interlock);
}

/*
 * Vnode release - use for active VNODES.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vrele: null vp");
#endif
	simple_lock(&vp->v_interlock);
	vp->v_usecount--;
	if (vp->v_usecount > 0) {
		simple_unlock(&vp->v_interlock);
		return;
	}
#ifdef DIAGNOSTIC
	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
		vprint("vrele: bad ref count", vp);
		panic("vrele: ref cnt");
	}
#endif
	vputonfreelist(vp);

	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0)
		VOP_INACTIVE(vp, p);

	simple_unlock(&vp->v_interlock);
}

#ifdef DIAGNOSTIC
/*
 * Page or buffer structure gets a reference.
 */
void
vhold(vp)
	register struct vnode *vp;
{

	/*
	 * If it is on the freelist and the hold count is currently
	 * zero, move it to the hold list.
	 */
	simple_lock(&vp->v_interlock);
	if ((vp->v_flag & VONFREELIST) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_hold_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
	vp->v_holdcnt++;
	simple_unlock(&vp->v_interlock);
}

/*
 * Page or buffer structure frees a reference.
 */
void
holdrele(vp)
	register struct vnode *vp;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_holdcnt <= 0)
		panic("holdrele: holdcnt");
	vp->v_holdcnt--;
	/*
	 * If it is on the holdlist and the hold count drops to
	 * zero, move it to the free list.
	 */
	if ((vp->v_flag & VONFREELIST) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_hold_list, vp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
	}
	simple_unlock(&vp->v_interlock);
}
#endif /* DIAGNOSTIC */

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If MNT_FORCE is specified, detach any active vnodes
 * that are found.
 */
#ifdef DEBUG
int busyprt = 0;	/* print out busy vnodes */
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif

int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;
	register struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
		if (vp->v_mount != mp)
			goto loop;
		nvp = vp->v_mntvnodes.le_next;
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;

		simple_lock(&vp->v_interlock);
		/*
		 * Skip over vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file
		 * vnodes open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * With v_usecount == 0, all we need to do is clear
		 * out the vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}
		/*
		 * If FORCECLOSE is set, forcibly close the vnode.
		 * For block or character devices, revert to an
		 * anonymous device. For all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *)0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DEBUG
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
 * The vnode interlock is held on entry.
 */
void
vclean(vp, flags, p)
	register struct vnode *vp;
	int flags;
	struct proc *p;
{
	int active;

	/*
	 * Check to see if the vnode is in use.
	 * If so we have to reference it before we clean it out
	 * so that its count cannot fall to zero and generate a
	 * race against ourselves to recycle it.
	 */
	if ((active = vp->v_usecount) != 0)
		vp->v_usecount++;

	/*
	 * Prevent the vnode from being recycled or
	 * brought into use while we clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock");
	vp->v_flag |= VXLOCK;

	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);

	/*
	 * Clean out any buffers associated with the vnode.
	 */
	if (flags & DOCLOSE)
		vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0, p);
	}

	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim");
	if (active) {
		vp->v_usecount--;
		if (vp->v_usecount == 0) {
			if (vp->v_holdcnt > 0)
				panic("vclean: not clean");
			vputonfreelist(vp);
		}

		simple_unlock(&vp->v_interlock);
	}
	cache_purge(vp);
	if (vp->v_vnlock) {
		if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
			vprint("vclean: lock not drained", vp);
		FREE(vp->v_vnlock, M_VNODE);
		vp->v_vnlock = NULL;
	}

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	vp->v_flag &= ~VXLOCK;
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		wakeup((caddr_t)vp);
	}
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct simplelock *inter_lkp;
	struct proc *p;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		if (inter_lkp)
			simple_unlock(inter_lkp);
		vgonel(vp, p);
		return (1);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;

	simple_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 */
void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	register struct vnode *vq;
	struct vnode *vx;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vgone", 0);
		return;
	}
	/*
	 * Clean out the filesystem specific data.
	 */
	vclean(vp, DOCLOSE, p);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
		simple_lock(&spechash_slock);
		if (*vp->v_hashchain == vp) {
			*vp->v_hashchain = vp->v_specnext;
		} else {
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_specnext != vp)
					continue;
				vq->v_specnext = vp->v_specnext;
				break;
			}
			if (vq == NULL)
				panic("missing bdev");
		}
		if (vp->v_flag & VALIASED) {
			vx = NULL;
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type)
					continue;
				if (vx)
					break;
				vx = vq;
			}
			if (vx == NULL)
				panic("missing alias");
			if (vq == NULL)
				vx->v_flag &= ~VALIASED;
			vp->v_flag &= ~VALIASED;
		}
		simple_unlock(&spechash_slock);
		FREE(vp->v_specinfo, M_VNODE);
		vp->v_specinfo = NULL;
	}
	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list.
	 */
	vp->v_type = VBAD;

	if ((vp->v_flag & VONFREELIST) &&
	    vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		if (vp->v_holdcnt > 0)
			panic("vgonel: not clean");
		if (TAILQ_FIRST(&vnode_free_list) != vp) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		}
		simple_unlock(&vnode_free_list_slock);
	}
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	register struct vnode *vp;
	int rc = 0;

	simple_lock(&spechash_slock);
	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		*vpp = vp;
		rc = 1;
		break;
	}
	simple_unlock(&spechash_slock);
	return (rc);
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vp)
	struct vnode *vp;
{
	struct vnode *vq, *vnext;
	int count;

loop:
	if ((vp->v_flag & VALIASED) == 0)
		return (vp->v_usecount);
	simple_lock(&spechash_slock);
	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
		vnext = vq->v_specnext;
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vq->v_usecount == 0 && vq != vp) {
			simple_unlock(&spechash_slock);
			vgone(vq);
			goto loop;
		}
		count += vq->v_usecount;
	}
	simple_unlock(&spechash_slock);
	return (count);
}

/*
 * Print out a description of a vnode.
 */
static char *typename[] =
   { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };

void
vprint(label, vp)
	char *label;
	register struct vnode *vp;
{
	char buf[64];

	if (label != NULL)
		printf("%s: ", label);
	printf("type %s, usecount %d, writecount %d, refcount %ld,",
		typename[vp->v_type], vp->v_usecount, vp->v_writecount,
		vp->v_holdcnt);
	buf[0] = '\0';
	if (vp->v_flag & VROOT)
		strcat(buf, "|VROOT");
	if (vp->v_flag & VTEXT)
		strcat(buf, "|VTEXT");
	if (vp->v_flag & VSYSTEM)
		strcat(buf, "|VSYSTEM");
	if (vp->v_flag & VXLOCK)
		strcat(buf, "|VXLOCK");
	if (vp->v_flag & VXWANT)
		strcat(buf, "|VXWANT");
	if (vp->v_flag & VBWAIT)
		strcat(buf, "|VBWAIT");
	if (vp->v_flag & VALIASED)
		strcat(buf, "|VALIASED");
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DEBUG
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
void
printlockedvnodes()
{
	struct proc *p = curproc;
	register struct mount *mp, *nmp;
	register struct vnode *vp;

	printf("Locked vnodes\n");
	simple_lock(&mountlist_slock);
	for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
	     mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = CIRCLEQ_NEXT(mp, mnt_list);
			continue;
		}
		for (vp = mp->mnt_vnodelist.lh_first;
		     vp != NULL;
		     vp = vp->v_mntvnodes.le_next) {
			if (VOP_ISLOCKED(vp))
				vprint((char *)0, vp);
		}
		simple_lock(&mountlist_slock);
		nmp = CIRCLEQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);
}
#endif

/*
 * Top level filesystem related information gathering.
 */
int
vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
	int *name;
	u_int namelen;
	void *oldp;
	size_t *oldlenp;
	void *newp;
	size_t newlen;
	struct proc *p;
{
	struct vfsconf *vfsp;

	/* all sysctl names at this level are at least name and field */
	if (namelen < 2)
		return (ENOTDIR);		/* overloaded */
	if (name[0] != VFS_GENERIC) {
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[0])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
		    oldp, oldlenp, newp, newlen, p));
	}
	switch (name[1]) {
	case VFS_MAXTYPENUM:
		return (sysctl_rdint(oldp, oldlenp, newp, maxvfsconf));
	case VFS_CONF:
		if (namelen < 3)
			return (ENOTDIR);	/* overloaded */
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[2])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return (sysctl_rdstruct(oldp, oldlenp, newp, vfsp,
		    sizeof(struct vfsconf)));
	}
	return (EOPNOTSUPP);
}

int kinfo_vdebug = 1;
int kinfo_vgetfailed;
#define KINFO_VNODESLOP	10
/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
/* ARGSUSED */
int
sysctl_vnode(where, sizep, p)
	char *where;
	size_t *sizep;
	struct proc *p;
{
	register struct mount *mp, *nmp;
	struct vnode *vp, *nvp;
	register char *bp = where, *savebp;
	char *ewhere;
	int error;

#define VPTRSZ	sizeof (struct vnode *)
#define VNODESZ	sizeof (struct vnode)
	if (where == NULL) {
		*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
		return (0);
	}
	ewhere = where + *sizep;

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = mp->mnt_list.cqe_next;
			continue;
		}
		savebp = bp;
again:
		for (vp = mp->mnt_vnodelist.lh_first;
		     vp != NULL;
		     vp = nvp) {
			/*
			 * Check that the vp is still associated with
			 * this filesystem.  RACE: could have been
			 * recycled onto the same filesystem.
			 */
			if (vp->v_mount != mp) {
				simple_unlock(&mntvnode_slock);
				if (kinfo_vdebug)
					printf("kinfo: vp changed\n");
				bp = savebp;
				goto again;
			}
			nvp = vp->v_mntvnodes.le_next;
			if (bp + VPTRSZ + VNODESZ > ewhere) {
				simple_unlock(&mntvnode_slock);
				*sizep = bp - where;
				return (ENOMEM);
			}
			if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) ||
			   (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ)))
				return (error);
			bp += VPTRSZ + VNODESZ;
			simple_lock(&mntvnode_slock);
		}

		simple_unlock(&mntvnode_slock);
		simple_lock(&mountlist_slock);
		nmp = mp->mnt_list.cqe_next;
		vfs_unbusy(mp, p);
	}

	simple_unlock(&mountlist_slock);

	*sizep = bp - where;
	return (0);
}

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(vp)
	register struct vnode *vp;
{
	register struct vnode *vq;
	int error = 0;

	if (vp->v_specmountpoint != NULL)
		return (EBUSY);
	if (vp->v_flag & VALIASED) {
		simple_lock(&spechash_slock);
		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
			if (vq->v_rdev != vp->v_rdev ||
			    vq->v_type != vp->v_type)
				continue;
			if (vq->v_specmountpoint != NULL) {
				error = EBUSY;
				break;
			}
		}
		simple_unlock(&spechash_slock);
	}
	return (error);
}

/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by ufs_mount() to set up the lists of export addresses.
 */
int
vfs_hang_addrlist(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	register int i;
	struct radix_node *rn;
	struct sockaddr *saddr, *smask = 0;
	struct domain *dom;
	int error;

	if (argp->ex_addrlen == 0) {
		if (mp->mnt_flag & MNT_DEFEXPORTED)
			return (EPERM);
		np = &nep->ne_defexported;
		np->netc_exflags = argp->ex_flags;
		np->netc_anon = argp->ex_anon;
		np->netc_anon.cr_ref = 1;
		mp->mnt_flag |= MNT_DEFEXPORTED;
		return (0);
	}
	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	np = (struct netcred *)malloc(i, M_NETADDR, M_WAITOK);
	bzero((caddr_t)np, i);
	saddr = (struct sockaddr *)(np + 1);
	error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen);
	if (error)
		goto out;
	if (saddr->sa_len > argp->ex_addrlen)
		saddr->sa_len = argp->ex_addrlen;
	if (argp->ex_masklen) {
		smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
		error = copyin(argp->ex_mask, (caddr_t)smask, argp->ex_masklen);
		if (error)
			goto out;
		if (smask->sa_len > argp->ex_masklen)
			smask->sa_len = argp->ex_masklen;
	}
	i = saddr->sa_family;
	if ((rnh = nep->ne_rtable[i]) == 0) {
		/*
		 * Seems silly to initialize every AF when most are not
		 * used, do so on demand here.
		 */
		for (dom = domains; dom; dom = dom->dom_next)
			if (dom->dom_family == i && dom->dom_rtattach) {
				dom->dom_rtattach((void **)&nep->ne_rtable[i],
					dom->dom_rtoffset);
				break;
			}
		if ((rnh = nep->ne_rtable[i]) == 0) {
			error = ENOBUFS;
			goto out;
		}
	}
	rn = (*rnh->rnh_addaddr)((caddr_t)saddr, (caddr_t)smask, rnh,
		np->netc_rnodes);
	if (rn == 0 || np != (struct netcred *)rn) { /* already exists */
		error = EPERM;
		goto out;
	}
	np->netc_exflags = argp->ex_flags;
	np->netc_anon = argp->ex_anon;
	np->netc_anon.cr_ref = 1;
	return (0);
out:
	free(np, M_NETADDR);
	return (error);
}

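/*
 * rnh_walktree() callback: remove one network address entry from the
 * export radix tree and free it.
 */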
/* ARGSUSED */
int
vfs_free_netcred(rn, w)
	struct radix_node *rn;
	void *w;
{
	register struct radix_node_head *rnh = (struct radix_node_head *)w;

	(*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh);
	free((caddr_t)rn, M_NETADDR);
	return (0);
}

/*
 * Free the net address hash lists that are hanging off the mount points.
 */
void
vfs_free_addrlist(nep)
	struct netexport *nep;
{
	register int i;
	register struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if ((rnh = nep->ne_rtable[i]) != NULL) {
			(*rnh->rnh_walktree)(rnh, vfs_free_netcred, rnh);
			free((caddr_t)rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

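/*
 * Update the export list for a mount point: delete and/or add
 * network address entries as directed by argp->ex_flags.
 */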
int
vfs_export(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if ((error = vfs_hang_addrlist(mp, nep, argp)) != 0)
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}

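/*
 * Look up the export credentials for an incoming network address,
 * falling back to the default export entry if one exists.
 */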
struct netcred *
vfs_export_lookup(mp, nep, nam)
	register struct mount *mp;
	struct netexport *nep;
	struct mbuf *nam;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
		 */
		if (nam != NULL) {
			saddr = mtod(nam, struct sockaddr *);
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
					(*rnh->rnh_matchaddr)((caddr_t)saddr,
							      rnh);
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}

/*
 * Do the usual access checking.
 * file_mode, uid and gid are from the vnode in question,
 * while acc_mode and cred are from the VOP_ACCESS parameter list.
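 *
 * For example, a VREAD|VWRITE request by the owning uid succeeds only
 * if both S_IRUSR and S_IWUSR are set in file_mode.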
 */
int
vaccess(file_mode, uid, gid, acc_mode, cred)
	mode_t file_mode;
	uid_t uid;
	gid_t gid;
	mode_t acc_mode;
	struct ucred *cred;
{
	mode_t mask;

	/* User id 0 always gets access. */
	if (cred->cr_uid == 0)
		return 0;

	mask = 0;

	/* Otherwise, check the owner. */
	if (cred->cr_uid == uid) {
		if (acc_mode & VEXEC)
			mask |= S_IXUSR;
		if (acc_mode & VREAD)
			mask |= S_IRUSR;
		if (acc_mode & VWRITE)
			mask |= S_IWUSR;
		return (file_mode & mask) == mask ? 0 : EACCES;
	}

	/* Otherwise, check the groups. */
	if (cred->cr_gid == gid || groupmember(gid, cred)) {
		if (acc_mode & VEXEC)
			mask |= S_IXGRP;
		if (acc_mode & VREAD)
			mask |= S_IRGRP;
		if (acc_mode & VWRITE)
			mask |= S_IWGRP;
		return (file_mode & mask) == mask ? 0 : EACCES;
	}

	/* Otherwise, check everyone else. */
	if (acc_mode & VEXEC)
		mask |= S_IXOTH;
	if (acc_mode & VREAD)
		mask |= S_IROTH;
	if (acc_mode & VWRITE)
		mask |= S_IWOTH;
	return (file_mode & mask) == mask ? 0 : EACCES;
}

/*
 * Unmount all file systems.
 * We traverse the list in reverse order under the assumption that doing so
 * will avoid needing to worry about dependencies.
 */
void
vfs_unmountall()
{
	register struct mount *mp, *nmp;
	int allerror, error;

	for (allerror = 0,
	     mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
		nmp = mp->mnt_list.cqe_prev;
		if ((error = dounmount(mp, MNT_FORCE, curproc)) != 0) {
			printf("unmount of %s failed with error %d\n",
			    mp->mnt_stat.f_mntonname, error);
			allerror = 1;
		}
	}
	if (allerror)
		printf("WARNING: some file systems would not unmount\n");
}

/*
 * Sync and unmount file systems before shutting down.
 */
void
vfs_shutdown()
{
	register struct buf *bp;
	int iter, nbusy;

	/* XXX Should suspend scheduling. */
	(void) spl0();

	printf("syncing disks... ");

	if (panicstr == 0) {
		/* Release inodes held by texts before update. */
		vnode_pager_umount(NULL);
#ifdef notdef
		vnshutdown();
#endif

		/* Sync before unmount, in case we hang on something. */
		sys_sync(&proc0, (void *)0, (register_t *)0);

		/* Unmount file systems. */
		vfs_unmountall();
	}

	/* Sync again after unmount, just in case. */
	sys_sync(&proc0, (void *)0, (register_t *)0);

	/* Wait for sync to finish. */
	for (iter = 0; iter < 20; iter++) {
		nbusy = 0;
		for (bp = &buf[nbuf]; --bp >= buf; )
			if ((bp->b_flags & (B_BUSY|B_INVAL)) == B_BUSY)
				nbusy++;
		if (nbusy == 0)
			break;
		printf("%d ", nbusy);
		DELAY(40000 * iter);
	}
	if (nbusy)
		printf("giving up\n");
	else
		printf("done\n");
}

/*
 * POSIX file system related system variables.
 */
int
fs_posix_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
	int *name;
	u_int namelen;
	void *oldp;
	size_t *oldlenp;
	void *newp;
	size_t newlen;
	struct proc *p;
{
	/* all sysctl names at this level are terminal */
	if (namelen != 1)
		return (ENOTDIR);

	switch (name[0]) {
	case FS_POSIX_SETUID:
		if (newp && securelevel > 0)
			return (EPERM);
		return (sysctl_int(oldp, oldlenp, newp, newlen, &suid_clear));
	default:
		return (EOPNOTSUPP);
	}
	/* NOTREACHED */
}

/*
 * File system related system variables.
 */
int
fs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
	int *name;
	u_int namelen;
	void *oldp;
	size_t *oldlenp;
	void *newp;
	size_t newlen;
	struct proc *p;
{
	sysctlfn *fn;

	switch (name[0]) {
	case FS_POSIX:
		fn = fs_posix_sysctl;
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (*fn)(name + 1, namelen - 1, oldp, oldlenp, newp, newlen, p);
}

/*
 * Routines dealing with vnodes and buffers
 */

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp) != NULL) {
		if (--vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t)&vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;

	if (flags & V_SAVE) {
		s = splbio();
		while (vp->v_numoutput) {
			vp->v_flag |= VBWAIT;
			sleep((caddr_t)&vp->v_numoutput, PRIBIO + 1);
		}
		if (vp->v_dirtyblkhd.lh_first != NULL) {
			splx(s);
			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
				return (error);
			s = splbio();
			if (vp->v_numoutput > 0 ||
			    vp->v_dirtyblkhd.lh_first != NULL)
				panic("vinvalbuf: dirty bufs");
		}
		splx(s);
	}
	for (;;) {
		if ((blist = vp->v_cleanblkhd.lh_first) &&
		    (flags & V_SAVEMETA))
			while (blist && blist->b_lblkno < 0)
				blist = blist->b_vnbufs.le_next;
		if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
		    (flags & V_SAVEMETA))
			while (blist && blist->b_lblkno < 0)
				blist = blist->b_vnbufs.le_next;
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = bp->b_vnbufs.le_next;
			if (flags & V_SAVEMETA && bp->b_lblkno < 0)
				continue;
			s = splbio();
			if (bp->b_flags & B_BUSY) {
				bp->b_flags |= B_WANTED;
				error = tsleep((caddr_t)bp,
					slpflag | (PRIBIO + 1), "vinvalbuf",
					slptimeo);
				splx(s);
				if (error)
					return (error);
				break;
			}
			bp->b_flags |= B_BUSY | B_VFLUSH;
			splx(s);
			/*
			 * XXX Since there are no node locks for NFS, I believe
			 * there is a slight chance that a delayed write will
			 * occur while sleeping just above, so check for it.
			 */
			if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
				(void) VOP_BWRITE(bp);
				break;
			}
			bp->b_flags |= B_INVAL;
			brelse(bp);
		}
	}
	if (!(flags & V_SAVEMETA) &&
	    (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first))
		panic("vinvalbuf: flush failed");
	return (0);
}

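/*
 * Write out all dirty buffers associated with a vnode; if sync is set,
 * wait for the I/O to complete before returning.
 */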
void
vflushbuf(vp, sync)
	register struct vnode *vp;
	int sync;
{
	register struct buf *bp, *nbp;
	int s;

loop:
	s = splbio();
	for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) {
		nbp = bp->b_vnbufs.le_next;
		if ((bp->b_flags & B_BUSY))
			continue;
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("vflushbuf: not dirty");
		bp->b_flags |= B_BUSY | B_VFLUSH;
		splx(s);
		/*
		 * Wait for I/O associated with indirect blocks to complete,
		 * since there is no way to quickly wait for them below.
		 */
		if (bp->b_vp == vp || sync == 0)
			(void) bawrite(bp);
		else
			(void) bwrite(bp);
		goto loop;
	}
	if (sync == 0) {
		splx(s);
		return;
	}
	while (vp->v_numoutput) {
		vp->v_flag |= VBWAIT;
		tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, "vflushbuf", 0);
	}
	splx(s);
	if (vp->v_dirtyblkhd.lh_first != NULL) {
		vprint("vflushbuf: dirty", vp);
		goto loop;
	}
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{

	if (bp->b_vp)
		panic("bgetvp: not free");
	VHOLD(vp);
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	bufinsvn(bp, &vp->v_cleanblkhd);
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;
	struct buf *wasdirty;

	if ((vp = bp->b_vp) == (struct vnode *) 0)
		panic("brelvp: NULL");
	/*
	 * Delete from old vnode list, if on one.
	 */
	wasdirty = vp->v_dirtyblkhd.lh_first;
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
	if (wasdirty && LIST_FIRST(&vp->v_dirtyblkhd) == NULL)
		LIST_REMOVE(vp, v_synclist);
	bp->b_vp = (struct vnode *) 0;
	HOLDRELE(vp);
}

/*
 * Reassign a buffer from one vnode to another. Used to assign buffers
 * to the appropriate clean or dirty list and to add newly dirty vnodes
 * to the appropriate filesystem syncer list.
 */
void
reassignbuf(bp, newvp)
	register struct buf *bp;
	register struct vnode *newvp;
{
	struct buflists *listheadp;
	struct buf *wasdirty;
	int delay;

	if (newvp == NULL) {
		printf("reassignbuf: NULL");
		return;
	}
	/*
	 * Delete from old vnode list, if on one.
	 */
	wasdirty = newvp->v_dirtyblkhd.lh_first;
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
	/*
	 * If dirty, put on list of dirty buffers;
	 * otherwise insert onto list of clean buffers.
	 */
	if ((bp->b_flags & B_DELWRI) == 0) {
		listheadp = &newvp->v_cleanblkhd;
		if (wasdirty && LIST_FIRST(&newvp->v_dirtyblkhd) == NULL)
			LIST_REMOVE(newvp, v_synclist);
	} else {
		listheadp = &newvp->v_dirtyblkhd;
		if (LIST_FIRST(listheadp) == NULL) {
			switch (newvp->v_type) {
			case VDIR:
				delay = syncdelay / 3;
				break;
			case VBLK:
				if (newvp->v_specmountpoint != NULL) {
					delay = syncdelay / 2;
					break;
				}
				/* fall through */
			default:
				delay = syncdelay;
			}
			vn_syncer_add_to_worklist(newvp, delay);
		}
	}
	bufinsvn(bp, listheadp);
}
2049