vfs_subr.c revision 65557
1/*
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
39 * $FreeBSD: head/sys/kern/vfs_subr.c 65557 2000-09-07 01:33:02Z jasone $
40 */
41
42/*
43 * External virtual filesystem routines
44 */
45#include "opt_ddb.h"
46#include "opt_ffs.h"
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/bio.h>
51#include <sys/buf.h>
52#include <sys/conf.h>
53#include <sys/dirent.h>
54#include <sys/domain.h>
55#include <sys/eventhandler.h>
56#include <sys/fcntl.h>
57#include <sys/kernel.h>
58#include <sys/kthread.h>
59#include <sys/ktr.h>
60#include <sys/malloc.h>
61#include <sys/mount.h>
62#include <sys/namei.h>
63#include <sys/proc.h>
64#include <sys/reboot.h>
65#include <sys/socket.h>
66#include <sys/stat.h>
67#include <sys/sysctl.h>
68#include <sys/vmmeter.h>
69#include <sys/vnode.h>
70
71#include <machine/limits.h>
72#include <machine/mutex.h>
73
74#include <vm/vm.h>
75#include <vm/vm_object.h>
76#include <vm/vm_extern.h>
77#include <vm/pmap.h>
78#include <vm/vm_map.h>
79#include <vm/vm_page.h>
80#include <vm/vm_pager.h>
81#include <vm/vnode_pager.h>
82#include <vm/vm_zone.h>
83
84static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
85
86static void	insmntque __P((struct vnode *vp, struct mount *mp));
87static void	vclean __P((struct vnode *vp, int flags, struct proc *p));
88static unsigned long	numvnodes;
89SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
90
91enum vtype iftovt_tab[16] = {
92	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
93	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
94};
95int vttoif_tab[9] = {
96	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
97	S_IFSOCK, S_IFIFO, S_IFMT,
98};
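/*
 * (Editorial aside, not in the original revision: these two tables back
 * the IFTOVT() and VTTOIF() conversion macros in <sys/vnode.h>, mapping
 * between stat-style S_IF* mode bits and the enum vtype values above.)
 */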
99
100static TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */
101
102static u_long wantfreevnodes = 25;
103SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
104static u_long freevnodes = 0;
105SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
106
107static int reassignbufcalls;
108SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
109static int reassignbufloops;
110SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
111static int reassignbufsortgood;
112SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
113static int reassignbufsortbad;
114SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
115static int reassignbufmethod = 1;
116SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
117
118#ifdef ENABLE_VFS_IOOPT
119int vfs_ioopt = 0;
120SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
121#endif
122
123struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* mounted fs */
124struct simplelock mountlist_slock;
125struct simplelock mntvnode_slock;
126int	nfs_mount_type = -1;
127#ifndef NULL_SIMPLELOCKS
128static struct simplelock mntid_slock;
129static struct simplelock vnode_free_list_slock;
130static struct simplelock spechash_slock;
131#endif
132struct nfs_public nfs_pub;	/* publicly exported FS */
133static vm_zone_t vnode_zone;
134int	prtactive = 0;		/* 1 => print out reclaim of active vnodes */
135
136/*
137 * The workitem queue.
138 */
139#define SYNCER_MAXDELAY		32
140static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
141time_t syncdelay = 30;		/* max time to delay syncing data */
142time_t filedelay = 30;		/* time to delay syncing files */
143SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
144time_t dirdelay = 29;		/* time to delay syncing directories */
145SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
146time_t metadelay = 28;		/* time to delay syncing metadata */
147SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
148static int rushjob;			/* number of slots to run ASAP */
149static int stat_rush_requests;	/* number of times I/O speeded up */
150SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
151
152static int syncer_delayno = 0;
153static long syncer_mask;
154LIST_HEAD(synclist, vnode);
155static struct synclist *syncer_workitem_pending;
156
157int desiredvnodes;
158SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
159    &desiredvnodes, 0, "Maximum number of vnodes");
160
161static void	vfs_free_addrlist __P((struct netexport *nep));
162static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
163static int	vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
164				       struct export_args *argp));
165
166/*
167 * Initialize the vnode management data structures.
168 */
169void
170vntblinit()
171{
172
173	desiredvnodes = maxproc + cnt.v_page_count / 4;
174	simple_lock_init(&mntvnode_slock);
175	simple_lock_init(&mntid_slock);
176	simple_lock_init(&spechash_slock);
177	TAILQ_INIT(&vnode_free_list);
178	simple_lock_init(&vnode_free_list_slock);
179	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
180	/*
181	 * Initialize the filesystem syncer.
182	 */
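	/*
	 * (Editorial aside, not in the original revision: hashinit() sizes
	 * the table to a power of two and hands the matching bucket mask
	 * back through its last argument, which is why syncer_maxdelay is
	 * recomputed from syncer_mask below.)
	 */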
183	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
184		&syncer_mask);
185	syncer_maxdelay = syncer_mask + 1;
186}
187
188/*
189 * Mark a mount point as busy. Used to synchronize access and to delay
190 * unmounting. Interlock is not released on failure.
191 */
192int
193vfs_busy(mp, flags, interlkp, p)
194	struct mount *mp;
195	int flags;
196	struct simplelock *interlkp;
197	struct proc *p;
198{
199	int lkflags;
200
201	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
202		if (flags & LK_NOWAIT)
203			return (ENOENT);
204		mp->mnt_kern_flag |= MNTK_MWAIT;
205		if (interlkp) {
206			simple_unlock(interlkp);
207		}
208		/*
209		 * Since all busy locks are shared except the exclusive
210		 * lock granted when unmounting, the only place that a
211		 * wakeup needs to be done is at the release of the
212		 * exclusive lock at the end of dounmount.
213		 */
214		tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
215		if (interlkp) {
216			simple_lock(interlkp);
217		}
218		return (ENOENT);
219	}
220	lkflags = LK_SHARED | LK_NOPAUSE;
221	if (interlkp)
222		lkflags |= LK_INTERLOCK;
223	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
224		panic("vfs_busy: unexpected lock failure");
225	return (0);
226}
227
228/*
229 * Free a busy filesystem.
230 */
231void
232vfs_unbusy(mp, p)
233	struct mount *mp;
234	struct proc *p;
235{
236
237	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
238}
239
240/*
241 * Lookup a filesystem type, and if found allocate and initialize
242 * a mount structure for it.
243 *
244 * Devname is usually updated by mount(8) after booting.
245 */
246int
247vfs_rootmountalloc(fstypename, devname, mpp)
248	char *fstypename;
249	char *devname;
250	struct mount **mpp;
251{
252	struct proc *p = curproc;	/* XXX */
253	struct vfsconf *vfsp;
254	struct mount *mp;
255
256	if (fstypename == NULL)
257		return (ENODEV);
258	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
259		if (!strcmp(vfsp->vfc_name, fstypename))
260			break;
261	if (vfsp == NULL)
262		return (ENODEV);
263	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
264	bzero((char *)mp, (u_long)sizeof(struct mount));
265	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
266	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
267	LIST_INIT(&mp->mnt_vnodelist);
268	mp->mnt_vfc = vfsp;
269	mp->mnt_op = vfsp->vfc_vfsops;
270	mp->mnt_flag = MNT_RDONLY;
271	mp->mnt_vnodecovered = NULLVP;
272	vfsp->vfc_refcount++;
273	mp->mnt_iosize_max = DFLTPHYS;
274	mp->mnt_stat.f_type = vfsp->vfc_typenum;
275	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
276	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
277	mp->mnt_stat.f_mntonname[0] = '/';
278	mp->mnt_stat.f_mntonname[1] = 0;
279	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
280	*mpp = mp;
281	return (0);
282}
283
284/*
285 * Find an appropriate filesystem to use for the root. If a filesystem
286 * has not been preselected, walk through the list of known filesystems
287 * trying those that have mountroot routines until one
288 * works or we have tried them all.
289 */
290#ifdef notdef	/* XXX JH */
291int
292lite2_vfs_mountroot()
293{
294	struct vfsconf *vfsp;
295	extern int (*lite2_mountroot) __P((void));
296	int error;
297
298	if (lite2_mountroot != NULL)
299		return ((*lite2_mountroot)());
300	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
301		if (vfsp->vfc_mountroot == NULL)
302			continue;
303		if ((error = (*vfsp->vfc_mountroot)()) == 0)
304			return (0);
305		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
306	}
307	return (ENODEV);
308}
309#endif
310
311/*
312 * Lookup a mount point by filesystem identifier.
313 */
314struct mount *
315vfs_getvfs(fsid)
316	fsid_t *fsid;
317{
318	register struct mount *mp;
319
320	simple_lock(&mountlist_slock);
321	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
322		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
323		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
324			simple_unlock(&mountlist_slock);
325			return (mp);
326		}
327	}
328	simple_unlock(&mountlist_slock);
329	return ((struct mount *) 0);
330}
331
332/*
333 * Get a new unique fsid.  Try to make its val[0] unique, since this value
334 * will be used to create fake device numbers for stat().  Also try (but
335 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
336 * support 16-bit device numbers.  We end up with unique val[0]'s for the
337 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
338 *
339 * Keep in mind that several mounts may be running in parallel.  Starting
340 * the search one past where the previous search terminated is both a
341 * micro-optimization and a defense against returning the same fsid to
342 * different mounts.
343 */
344void
345vfs_getnewfsid(mp)
346	struct mount *mp;
347{
348	static u_int16_t mntid_base;
349	fsid_t tfsid;
350	int mtype;
351
352	simple_lock(&mntid_slock);
353	mtype = mp->mnt_vfc->vfc_typenum;
354	tfsid.val[1] = mtype;
355	mtype = (mtype & 0xFF) << 24;
356	for (;;) {
357		tfsid.val[0] = makeudev(255,
358		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
359		mntid_base++;
360		if (vfs_getvfs(&tfsid) == NULL)
361			break;
362	}
363	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
364	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
365	simple_unlock(&mntid_slock);
366}
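
/*
 * (Editorial sketch, not in the original revision, derived from
 * vfs_getnewfsid() above; the final bit packing of val[0] is whatever
 * makeudev() produces.)  The minor number handed to makeudev() is
 * assembled as
 *
 *	((vfc_typenum & 0xff) << 24) |
 *	((mntid_base & 0xff00) << 8) | (mntid_base & 0xff)
 *
 * with a constant major of 255, so the filesystem type occupies the top
 * byte of the minor and the 16-bit mount id is split across the
 * remaining bytes.
 */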
367
368/*
369 * Knob to control the precision of file timestamps:
370 *
371 *   0 = seconds only; nanoseconds zeroed.
372 *   1 = seconds and nanoseconds, accurate within 1/HZ.
373 *   2 = seconds and nanoseconds, truncated to microseconds.
374 * >=3 = seconds and nanoseconds, maximum precision.
375 */
376enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
377
378static int timestamp_precision = TSP_SEC;
379SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
380    &timestamp_precision, 0, "");
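/*
 * (Editorial aside, not in the original revision: the knob above is
 * exported as the "vfs.timestamp_precision" sysctl; setting it to 2 at
 * run time, e.g. with sysctl(8), selects the truncated-to-microseconds
 * behavior described above.)
 */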
381
382/*
383 * Get a current timestamp.
384 */
385void
386vfs_timestamp(tsp)
387	struct timespec *tsp;
388{
389	struct timeval tv;
390
391	switch (timestamp_precision) {
392	case TSP_SEC:
393		tsp->tv_sec = time_second;
394		tsp->tv_nsec = 0;
395		break;
396	case TSP_HZ:
397		getnanotime(tsp);
398		break;
399	case TSP_USEC:
400		microtime(&tv);
401		TIMEVAL_TO_TIMESPEC(&tv, tsp);
402		break;
403	case TSP_NSEC:
404	default:
405		nanotime(tsp);
406		break;
407	}
408}
409
410/*
411 * Set vnode attributes to VNOVAL
412 */
413void
414vattr_null(vap)
415	register struct vattr *vap;
416{
417
418	vap->va_type = VNON;
419	vap->va_size = VNOVAL;
420	vap->va_bytes = VNOVAL;
421	vap->va_mode = VNOVAL;
422	vap->va_nlink = VNOVAL;
423	vap->va_uid = VNOVAL;
424	vap->va_gid = VNOVAL;
425	vap->va_fsid = VNOVAL;
426	vap->va_fileid = VNOVAL;
427	vap->va_blocksize = VNOVAL;
428	vap->va_rdev = VNOVAL;
429	vap->va_atime.tv_sec = VNOVAL;
430	vap->va_atime.tv_nsec = VNOVAL;
431	vap->va_mtime.tv_sec = VNOVAL;
432	vap->va_mtime.tv_nsec = VNOVAL;
433	vap->va_ctime.tv_sec = VNOVAL;
434	vap->va_ctime.tv_nsec = VNOVAL;
435	vap->va_flags = VNOVAL;
436	vap->va_gen = VNOVAL;
437	vap->va_vaflags = 0;
438}
439
440/*
441 * Routines having to do with the management of the vnode table.
442 */
443
444/*
445 * Return the next vnode from the free list.
446 */
447int
448getnewvnode(tag, mp, vops, vpp)
449	enum vtagtype tag;
450	struct mount *mp;
451	vop_t **vops;
452	struct vnode **vpp;
453{
454	int s, count;
455	struct proc *p = curproc;	/* XXX */
456	struct vnode *vp = NULL;
457	struct mount *vnmp;
458	vm_object_t object;
459
460	/*
461	 * We take the least recently used vnode from the freelist
462	 * if we can get it and it has no cached pages, and no
463	 * namecache entries are relative to it.
464	 * Otherwise we allocate a new vnode.
465	 */
466
467	s = splbio();
468	simple_lock(&vnode_free_list_slock);
469
470	if (wantfreevnodes && freevnodes < wantfreevnodes) {
471		vp = NULL;
472	} else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
473		/*
474		 * XXX: this is only here to be backwards compatible
475		 */
476		vp = NULL;
477	} else for (count = 0; count < freevnodes; count++) {
478		vp = TAILQ_FIRST(&vnode_free_list);
479		if (vp == NULL || vp->v_usecount)
480			panic("getnewvnode: free vnode isn't");
481		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
482		/*
483		 * Don't recycle if active in the namecache or
484		 * if it still has cached pages or we cannot get
485		 * its interlock.
486		 */
487		object = vp->v_object;
488		if (LIST_FIRST(&vp->v_cache_src) != NULL ||
489		    (object && (object->resident_page_count ||
490		     object->ref_count)) ||
491		    !simple_lock_try(&vp->v_interlock)) {
492			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
493			vp = NULL;
494			continue;
495		}
496		/*
497		 * Skip over it if its filesystem is being suspended.
498		 */
499		if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0)
500			break;
501		simple_unlock(&vp->v_interlock);
502		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
503		vp = NULL;
504	}
505	if (vp) {
506		vp->v_flag |= VDOOMED;
507		freevnodes--;
508		simple_unlock(&vnode_free_list_slock);
509		cache_purge(vp);
510		vp->v_lease = NULL;
511		if (vp->v_type != VBAD) {
512			vgonel(vp, p);
513		} else {
514			simple_unlock(&vp->v_interlock);
515		}
516		vn_finished_write(vnmp);
517
518#ifdef INVARIANTS
519		{
520			int s;
521
522			if (vp->v_data)
523				panic("cleaned vnode isn't");
524			s = splbio();
525			if (vp->v_numoutput)
526				panic("Clean vnode has pending I/O's");
527			splx(s);
528			if (vp->v_writecount != 0)
529				panic("Non-zero write count");
530		}
531#endif
532		vp->v_flag = 0;
533		vp->v_lastw = 0;
534		vp->v_lasta = 0;
535		vp->v_cstart = 0;
536		vp->v_clen = 0;
537		vp->v_socket = 0;
538	} else {
539		simple_unlock(&vnode_free_list_slock);
540		vp = (struct vnode *) zalloc(vnode_zone);
541		bzero((char *) vp, sizeof *vp);
542		simple_lock_init(&vp->v_interlock);
543		vp->v_dd = vp;
544		cache_purge(vp);
545		LIST_INIT(&vp->v_cache_src);
546		TAILQ_INIT(&vp->v_cache_dst);
547		numvnodes++;
548	}
549
550	TAILQ_INIT(&vp->v_cleanblkhd);
551	TAILQ_INIT(&vp->v_dirtyblkhd);
552	vp->v_type = VNON;
553	vp->v_tag = tag;
554	vp->v_op = vops;
555	insmntque(vp, mp);
556	*vpp = vp;
557	vp->v_usecount = 1;
558	vp->v_data = 0;
559	splx(s);
560
561	vfs_object_create(vp, p, p->p_ucred);
562	return (0);
563}
564
565/*
566 * Move a vnode from one mount queue to another.
567 */
568static void
569insmntque(vp, mp)
570	register struct vnode *vp;
571	register struct mount *mp;
572{
573
574	simple_lock(&mntvnode_slock);
575	/*
576	 * Delete from old mount point vnode list, if on one.
577	 */
578	if (vp->v_mount != NULL)
579		LIST_REMOVE(vp, v_mntvnodes);
580	/*
581	 * Insert into list of vnodes for the new mount point, if available.
582	 */
583	if ((vp->v_mount = mp) == NULL) {
584		simple_unlock(&mntvnode_slock);
585		return;
586	}
587	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
588	simple_unlock(&mntvnode_slock);
589}
590
591/*
592 * Update outstanding I/O count and do wakeup if requested.
593 */
594void
595vwakeup(bp)
596	register struct buf *bp;
597{
598	register struct vnode *vp;
599
600	bp->b_flags &= ~B_WRITEINPROG;
601	if ((vp = bp->b_vp)) {
602		vp->v_numoutput--;
603		if (vp->v_numoutput < 0)
604			panic("vwakeup: neg numoutput");
605		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
606			vp->v_flag &= ~VBWAIT;
607			wakeup((caddr_t) &vp->v_numoutput);
608		}
609	}
610}
611
612/*
613 * Flush out and invalidate all buffers associated with a vnode.
614 * Called with the underlying object locked.
615 */
616int
617vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
618	register struct vnode *vp;
619	int flags;
620	struct ucred *cred;
621	struct proc *p;
622	int slpflag, slptimeo;
623{
624	register struct buf *bp;
625	struct buf *nbp, *blist;
626	int s, error;
627	vm_object_t object;
628
629	if (flags & V_SAVE) {
630		s = splbio();
631		while (vp->v_numoutput) {
632			vp->v_flag |= VBWAIT;
633			error = tsleep((caddr_t)&vp->v_numoutput,
634			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
635			if (error) {
636				splx(s);
637				return (error);
638			}
639		}
640		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
641			splx(s);
642			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
643				return (error);
644			s = splbio();
645			if (vp->v_numoutput > 0 ||
646			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
647				panic("vinvalbuf: dirty bufs");
648		}
649		splx(s);
650  	}
651	s = splbio();
652	for (;;) {
653		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
654		if (!blist)
655			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
656		if (!blist)
657			break;
658
659		for (bp = blist; bp; bp = nbp) {
660			nbp = TAILQ_NEXT(bp, b_vnbufs);
661			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
662				error = BUF_TIMELOCK(bp,
663				    LK_EXCLUSIVE | LK_SLEEPFAIL,
664				    "vinvalbuf", slpflag, slptimeo);
665				if (error == ENOLCK)
666					break;
667				splx(s);
668				return (error);
669			}
670			/*
671			 * XXX Since there are no node locks for NFS, I
672			 * believe there is a slight chance that a delayed
673			 * write will occur while sleeping just above, so
674			 * check for it.  Note that vfs_bio_awrite expects
675			 * buffers to reside on a queue, while VOP_BWRITE and
676			 * brelse do not.
677			 */
678			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
679				(flags & V_SAVE)) {
680
681				if (bp->b_vp == vp) {
682					if (bp->b_flags & B_CLUSTEROK) {
683						BUF_UNLOCK(bp);
684						vfs_bio_awrite(bp);
685					} else {
686						bremfree(bp);
687						bp->b_flags |= B_ASYNC;
688						BUF_WRITE(bp);
689					}
690				} else {
691					bremfree(bp);
692					(void) BUF_WRITE(bp);
693				}
694				break;
695			}
696			bremfree(bp);
697			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
698			bp->b_flags &= ~B_ASYNC;
699			brelse(bp);
700		}
701	}
702
703	while (vp->v_numoutput > 0) {
704		vp->v_flag |= VBWAIT;
705		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
706	}
707
708	splx(s);
709
710	/*
711	 * Destroy the copy in the VM cache, too.
712	 */
713	simple_lock(&vp->v_interlock);
714	object = vp->v_object;
715	if (object != NULL) {
716		vm_object_page_remove(object, 0, 0,
717			(flags & V_SAVE) ? TRUE : FALSE);
718	}
719	simple_unlock(&vp->v_interlock);
720
721	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
722		panic("vinvalbuf: flush failed");
723	return (0);
724}
725
726/*
727 * Truncate a file's buffers and pages to a specified length.  This
728 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
729 * sync activity.
730 */
731int
732vtruncbuf(vp, cred, p, length, blksize)
733	register struct vnode *vp;
734	struct ucred *cred;
735	struct proc *p;
736	off_t length;
737	int blksize;
738{
739	register struct buf *bp;
740	struct buf *nbp;
741	int s, anyfreed;
742	int trunclbn;
743
744	/*
745	 * Round up to the *next* lbn.
746	 */
747	trunclbn = (length + blksize - 1) / blksize;
748
749	s = splbio();
750restart:
751	anyfreed = 1;
752	for (;anyfreed;) {
753		anyfreed = 0;
754		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
755			nbp = TAILQ_NEXT(bp, b_vnbufs);
756			if (bp->b_lblkno >= trunclbn) {
757				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
758					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
759					goto restart;
760				} else {
761					bremfree(bp);
762					bp->b_flags |= (B_INVAL | B_RELBUF);
763					bp->b_flags &= ~B_ASYNC;
764					brelse(bp);
765					anyfreed = 1;
766				}
767				if (nbp &&
768				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
769				    (nbp->b_vp != vp) ||
770				    (nbp->b_flags & B_DELWRI))) {
771					goto restart;
772				}
773			}
774		}
775
776		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
777			nbp = TAILQ_NEXT(bp, b_vnbufs);
778			if (bp->b_lblkno >= trunclbn) {
779				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
780					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
781					goto restart;
782				} else {
783					bremfree(bp);
784					bp->b_flags |= (B_INVAL | B_RELBUF);
785					bp->b_flags &= ~B_ASYNC;
786					brelse(bp);
787					anyfreed = 1;
788				}
789				if (nbp &&
790				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
791				    (nbp->b_vp != vp) ||
792				    (nbp->b_flags & B_DELWRI) == 0)) {
793					goto restart;
794				}
795			}
796		}
797	}
798
799	if (length > 0) {
800restartsync:
801		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
802			nbp = TAILQ_NEXT(bp, b_vnbufs);
803			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
804				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
805					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
806					goto restart;
807				} else {
808					bremfree(bp);
809					if (bp->b_vp == vp) {
810						bp->b_flags |= B_ASYNC;
811					} else {
812						bp->b_flags &= ~B_ASYNC;
813					}
814					BUF_WRITE(bp);
815				}
816				goto restartsync;
817			}
818
819		}
820	}
821
822	while (vp->v_numoutput > 0) {
823		vp->v_flag |= VBWAIT;
824		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
825	}
826
827	splx(s);
828
829	vnode_pager_setsize(vp, length);
830
831	return (0);
832}
833
834/*
835 * Associate a buffer with a vnode.
836 */
837void
838bgetvp(vp, bp)
839	register struct vnode *vp;
840	register struct buf *bp;
841{
842	int s;
843
844	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
845
846	vhold(vp);
847	bp->b_vp = vp;
848	bp->b_dev = vn_todev(vp);
849	/*
850	 * Insert onto list for new vnode.
851	 */
852	s = splbio();
853	bp->b_xflags |= BX_VNCLEAN;
854	bp->b_xflags &= ~BX_VNDIRTY;
855	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
856	splx(s);
857}
858
859/*
860 * Disassociate a buffer from a vnode.
861 */
862void
863brelvp(bp)
864	register struct buf *bp;
865{
866	struct vnode *vp;
867	struct buflists *listheadp;
868	int s;
869
870	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
871
872	/*
873	 * Delete from old vnode list, if on one.
874	 */
875	vp = bp->b_vp;
876	s = splbio();
877	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
878		if (bp->b_xflags & BX_VNDIRTY)
879			listheadp = &vp->v_dirtyblkhd;
880		else
881			listheadp = &vp->v_cleanblkhd;
882		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
883		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
884	}
885	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
886		vp->v_flag &= ~VONWORKLST;
887		LIST_REMOVE(vp, v_synclist);
888	}
889	splx(s);
890	bp->b_vp = (struct vnode *) 0;
891	vdrop(vp);
892}
893
894/*
895 * The workitem queue.
896 *
897 * It is useful to delay writes of file data and filesystem metadata
898 * for tens of seconds so that quickly created and deleted files need
899 * not waste disk bandwidth being created and removed. To realize this,
900 * we append vnodes to a "workitem" queue. When running with a soft
901 * updates implementation, most pending metadata dependencies should
902 * not wait for more than a few seconds. Thus, metadata on block devices
903 * is delayed only about half the time that file data is delayed.
904 * Similarly, directory updates are more critical, so they are delayed
905 * only about a third of the time that file data is delayed. Thus, there are
906 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
907 * one each second (driven off the filesystem syncer process). The
908 * syncer_delayno variable indicates the next queue that is to be processed.
909 * Items that need to be processed soon are placed in this queue:
910 *
911 *	syncer_workitem_pending[syncer_delayno]
912 *
913 * A delay of fifteen seconds is done by placing the request fifteen
914 * entries later in the queue:
915 *
916 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
917 *
918 */
919
920/*
921 * Add an item to the syncer work queue.
922 */
923static void
924vn_syncer_add_to_worklist(struct vnode *vp, int delay)
925{
926	int s, slot;
927
928	s = splbio();
929
930	if (vp->v_flag & VONWORKLST) {
931		LIST_REMOVE(vp, v_synclist);
932	}
933
934	if (delay > syncer_maxdelay - 2)
935		delay = syncer_maxdelay - 2;
936	slot = (syncer_delayno + delay) & syncer_mask;
937
938	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
939	vp->v_flag |= VONWORKLST;
940	splx(s);
941}
942
943struct  proc *updateproc;
944static void sched_sync __P((void));
945static struct kproc_desc up_kp = {
946	"syncer",
947	sched_sync,
948	&updateproc
949};
950SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
951
952/*
953 * System filesystem synchronizer daemon.
954 */
955void
956sched_sync(void)
957{
958	struct synclist *slp;
959	struct vnode *vp;
960	struct mount *mp;
961	long starttime;
962	int s;
963	struct proc *p = updateproc;
964
965	mtx_enter(&Giant, MTX_DEF);
966
967	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, p,
968	    SHUTDOWN_PRI_LAST);
969
970	for (;;) {
971		kproc_suspend_loop(p);
972
973		starttime = time_second;
974
975		/*
976		 * Push files whose dirty time has expired.  Be careful
977		 * of interrupt race on slp queue.
978		 */
979		s = splbio();
980		slp = &syncer_workitem_pending[syncer_delayno];
981		syncer_delayno += 1;
982		if (syncer_delayno == syncer_maxdelay)
983			syncer_delayno = 0;
984		splx(s);
985
986		while ((vp = LIST_FIRST(slp)) != NULL) {
987			if (VOP_ISLOCKED(vp, NULL) == 0 &&
988			    vn_start_write(vp, &mp, V_NOWAIT) == 0) {
989				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
990				(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
991				VOP_UNLOCK(vp, 0, p);
992				vn_finished_write(mp);
993			}
994			s = splbio();
995			if (LIST_FIRST(slp) == vp) {
996				/*
997				 * Note: v_tag VT_VFS vps can remain on the
998				 * worklist too with no dirty blocks, but
999				 * since sync_fsync() moves them to a different
1000				 * slot we are safe.
1001				 */
1002				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
1003				    !vn_isdisk(vp, NULL))
1004					panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
1005				/*
1006				 * Put us back on the worklist.  The worklist
1007				 * routine will remove us from our current
1008				 * position and then add us back in at a later
1009				 * position.
1010				 */
1011				vn_syncer_add_to_worklist(vp, syncdelay);
1012			}
1013			splx(s);
1014		}
1015
1016		/*
1017		 * Do soft update processing.
1018		 */
1019#ifdef SOFTUPDATES
1020		softdep_process_worklist(NULL);
1021#endif
1022
1023		/*
1024		 * The variable rushjob allows the kernel to speed up the
1025		 * processing of the filesystem syncer process. A rushjob
1026		 * value of N tells the filesystem syncer to process the next
1027		 * N seconds worth of work on its queue ASAP. Currently rushjob
1028		 * is used by the soft update code to speed up the filesystem
1029		 * syncer process when the incore state is getting so far
1030		 * ahead of the disk that the kernel memory pool is being
1031		 * threatened with exhaustion.
1032		 */
1033		if (rushjob > 0) {
1034			rushjob -= 1;
1035			continue;
1036		}
1037		/*
1038		 * If it has taken us less than a second to process the
1039		 * current work, then wait. Otherwise start right over
1040		 * again. We can still lose time if any single round
1041		 * takes more than two seconds, but it does not really
1042		 * matter as we are just trying to generally pace the
1043		 * filesystem activity.
1044		 */
1045		if (time_second == starttime)
1046			tsleep(&lbolt, PPAUSE, "syncer", 0);
1047	}
1048}
1049
1050/*
1051 * Request the syncer daemon to speed up its work.
1052 * We never push it to speed up more than half of its
1053 * normal turn time, otherwise it could take over the cpu.
1054 */
1055int
1056speedup_syncer()
1057{
1058	int s;
1059
1060	s = splhigh();
1061	if (updateproc->p_wchan == &lbolt)
1062		setrunnable(updateproc);
1063	splx(s);
1064	if (rushjob < syncdelay / 2) {
1065		rushjob += 1;
1066		stat_rush_requests += 1;
1067		return (1);
1068	}
1069	return(0);
1070}
1071
1072/*
1073 * Associate a p-buffer with a vnode.
1074 *
1075 * Also sets B_PAGING flag to indicate that vnode is not fully associated
1076 * with the buffer, i.e. the bp has not been linked into the vnode or
1077 * ref-counted.
1078 */
1079void
1080pbgetvp(vp, bp)
1081	register struct vnode *vp;
1082	register struct buf *bp;
1083{
1084
1085	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1086
1087	bp->b_vp = vp;
1088	bp->b_flags |= B_PAGING;
1089	bp->b_dev = vn_todev(vp);
1090}
1091
1092/*
1093 * Disassociate a p-buffer from a vnode.
1094 */
1095void
1096pbrelvp(bp)
1097	register struct buf *bp;
1098{
1099
1100	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1101
1102	/* XXX REMOVE ME */
1103	if (bp->b_vnbufs.tqe_next != NULL) {
1104		panic(
1105		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1106		    bp,
1107		    (int)bp->b_flags
1108		);
1109	}
1110	bp->b_vp = (struct vnode *) 0;
1111	bp->b_flags &= ~B_PAGING;
1112}
1113
1114void
1115pbreassignbuf(bp, newvp)
1116	struct buf *bp;
1117	struct vnode *newvp;
1118{
1119	if ((bp->b_flags & B_PAGING) == 0) {
1120		panic(
1121		    "pbreassignbuf() on non phys bp %p",
1122		    bp
1123		);
1124	}
1125	bp->b_vp = newvp;
1126}
1127
1128/*
1129 * Reassign a buffer from one vnode to another.
1130 * Used to assign file specific control information
1131 * (indirect blocks) to the vnode to which they belong.
1132 */
1133void
1134reassignbuf(bp, newvp)
1135	register struct buf *bp;
1136	register struct vnode *newvp;
1137{
1138	struct buflists *listheadp;
1139	int delay;
1140	int s;
1141
1142	if (newvp == NULL) {
1143		printf("reassignbuf: NULL");
1144		return;
1145	}
1146	++reassignbufcalls;
1147
1148	/*
1149	 * B_PAGING flagged buffers cannot be reassigned because their vp
1150	 * is not fully linked in.
1151	 */
1152	if (bp->b_flags & B_PAGING)
1153		panic("cannot reassign paging buffer");
1154
1155	s = splbio();
1156	/*
1157	 * Delete from old vnode list, if on one.
1158	 */
1159	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
1160		if (bp->b_xflags & BX_VNDIRTY)
1161			listheadp = &bp->b_vp->v_dirtyblkhd;
1162		else
1163			listheadp = &bp->b_vp->v_cleanblkhd;
1164		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
1165		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1166		if (bp->b_vp != newvp) {
1167			vdrop(bp->b_vp);
1168			bp->b_vp = NULL;	/* for clarification */
1169		}
1170	}
1171	/*
1172	 * If dirty, put on list of dirty buffers; otherwise insert onto list
1173	 * of clean buffers.
1174	 */
1175	if (bp->b_flags & B_DELWRI) {
1176		struct buf *tbp;
1177
1178		listheadp = &newvp->v_dirtyblkhd;
1179		if ((newvp->v_flag & VONWORKLST) == 0) {
1180			switch (newvp->v_type) {
1181			case VDIR:
1182				delay = dirdelay;
1183				break;
1184			case VCHR:
1185			case VBLK:
1186				if (newvp->v_specmountpoint != NULL) {
1187					delay = metadelay;
1188					break;
1189				}
1190				/* fall through */
1191			default:
1192				delay = filedelay;
1193			}
1194			vn_syncer_add_to_worklist(newvp, delay);
1195		}
1196		bp->b_xflags |= BX_VNDIRTY;
1197		tbp = TAILQ_FIRST(listheadp);
1198		if (tbp == NULL ||
1199		    bp->b_lblkno == 0 ||
1200		    (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
1201		    (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
1202			TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
1203			++reassignbufsortgood;
1204		} else if (bp->b_lblkno < 0) {
1205			TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
1206			++reassignbufsortgood;
1207		} else if (reassignbufmethod == 1) {
1208			/*
1209			 * New sorting algorithm, only handle sequential case,
1210			 * otherwise append to end (but before metadata)
1211			 */
1212			if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
1213			    (tbp->b_xflags & BX_VNDIRTY)) {
1214				/*
1215				 * Found the best place to insert the buffer
1216				 */
1217				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1218				++reassignbufsortgood;
1219			} else {
1220				/*
1221				 * Missed, append to end, but before meta-data.
1222				 * We know that the head buffer in the list is
1223				 * not meta-data due to prior conditionals.
1224				 *
1225				 * Indirect effects:  NFS second stage write
1226				 * tends to wind up here, giving maximum
1227				 * distance between the unstable write and the
1228				 * commit rpc.
1229				 */
1230				tbp = TAILQ_LAST(listheadp, buflists);
1231				while (tbp && tbp->b_lblkno < 0)
1232					tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
1233				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1234				++reassignbufsortbad;
1235			}
1236		} else {
1237			/*
1238			 * Old sorting algorithm, scan queue and insert
1239			 */
1240			struct buf *ttbp;
1241			while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
1242			    (ttbp->b_lblkno < bp->b_lblkno)) {
1243				++reassignbufloops;
1244				tbp = ttbp;
1245			}
1246			TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1247		}
1248	} else {
1249		bp->b_xflags |= BX_VNCLEAN;
1250		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
1251		if ((newvp->v_flag & VONWORKLST) &&
1252		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1253			newvp->v_flag &= ~VONWORKLST;
1254			LIST_REMOVE(newvp, v_synclist);
1255		}
1256	}
1257	if (bp->b_vp != newvp) {
1258		bp->b_vp = newvp;
1259		vhold(bp->b_vp);
1260	}
1261	splx(s);
1262}
1263
1264/*
1265 * Create a vnode for a block device.
1266 * Used for mounting the root file system.
1267 * XXX: This has now changed to a VCHR due to the block/char merging.
1268 */
1269int
1270bdevvp(dev, vpp)
1271	dev_t dev;
1272	struct vnode **vpp;
1273{
1274	register struct vnode *vp;
1275	struct vnode *nvp;
1276	int error;
1277
1278	if (dev == NODEV) {
1279		*vpp = NULLVP;
1280		return (ENXIO);
1281	}
1282	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
1283	if (error) {
1284		*vpp = NULLVP;
1285		return (error);
1286	}
1287	vp = nvp;
1288	vp->v_type = VCHR;
1289	addalias(vp, dev);
1290	*vpp = vp;
1291	return (0);
1292}
1293
1294/*
1295 * Add vnode to the alias list hung off the dev_t.
1296 *
1297 * The reason for this gunk is that multiple vnodes can reference
1298 * the same physical device, so checking vp->v_usecount to see
1299 * how many users there are is inadequate; the v_usecount for
1300 * the vnodes need to be accumulated.  vcount() does that.
1301 */
1302struct vnode *
1303addaliasu(nvp, nvp_rdev)
1304	struct vnode *nvp;
1305	udev_t nvp_rdev;
1306{
1307	struct vnode *ovp;
1308	vop_t **ops;
1309	dev_t dev;
1310
1311	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1312		panic("addaliasu on non-special vnode");
1313	dev = udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0);
1314	/*
1315	 * Check to see if we have a bdevvp vnode with no associated
1316	 * filesystem. If so, we want to associate the filesystem of
1317 * the newly created vnode with the bdevvp vnode and
1318 * discard the new vnode rather than leaving the
1319	 * bdevvp vnode lying around with no associated filesystem.
1320	 */
1321	if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
1322		addalias(nvp, dev);
1323		return (nvp);
1324	}
1325	/*
1326	 * Discard unneeded vnode, but save its node specific data.
1327	 * Note that if there is a lock, it is carried over in the
1328	 * node specific data to the replacement vnode.
1329	 */
1330	vref(ovp);
1331	ovp->v_data = nvp->v_data;
1332	ovp->v_tag = nvp->v_tag;
1333	nvp->v_data = NULL;
1334	ops = nvp->v_op;
1335	nvp->v_op = ovp->v_op;
1336	ovp->v_op = ops;
1337	insmntque(ovp, nvp->v_mount);
1338	vrele(nvp);
1339	vgone(nvp);
1340	return (ovp);
1341}
1342
1343void
1344addalias(nvp, dev)
1345	struct vnode *nvp;
1346	dev_t dev;
1347{
1348
1349	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1350		panic("addalias on non-special vnode");
1351
1352	nvp->v_rdev = dev;
1353	simple_lock(&spechash_slock);
1354	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
1355	simple_unlock(&spechash_slock);
1356}
1357
1358/*
1359 * Grab a particular vnode from the free list, increment its
1360 * reference count and lock it. The vnode lock bit is set if the
1361 * vnode is being eliminated in vgone. The process is awakened
1362 * when the transition is completed, and an error returned to
1363 * indicate that the vnode is no longer usable (possibly having
1364 * been changed to a new file system type).
1365 */
1366int
1367vget(vp, flags, p)
1368	register struct vnode *vp;
1369	int flags;
1370	struct proc *p;
1371{
1372	int error;
1373
1374	/*
1375	 * If the vnode is in the process of being cleaned out for
1376	 * another use, we wait for the cleaning to finish and then
1377	 * return failure. Cleaning is determined by checking that
1378	 * the VXLOCK flag is set.
1379	 */
1380	if ((flags & LK_INTERLOCK) == 0) {
1381		simple_lock(&vp->v_interlock);
1382	}
1383	if (vp->v_flag & VXLOCK) {
1384		vp->v_flag |= VXWANT;
1385		simple_unlock(&vp->v_interlock);
1386		tsleep((caddr_t)vp, PINOD, "vget", 0);
1387		return (ENOENT);
1388	}
1389
1390	vp->v_usecount++;
1391
1392	if (VSHOULDBUSY(vp))
1393		vbusy(vp);
1394	if (flags & LK_TYPE_MASK) {
1395		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
1396			/*
1397			 * must expand vrele here because we do not want
1398			 * to call VOP_INACTIVE if the reference count
1399			 * drops back to zero since it was never really
1400			 * active. We must remove it from the free list
1401			 * before sleeping so that multiple processes do
1402			 * not try to recycle it.
1403			 */
1404			simple_lock(&vp->v_interlock);
1405			vp->v_usecount--;
1406			if (VSHOULDFREE(vp))
1407				vfree(vp);
1408			simple_unlock(&vp->v_interlock);
1409		}
1410		return (error);
1411	}
1412	simple_unlock(&vp->v_interlock);
1413	return (0);
1414}
1415
1416void
1417vref(struct vnode *vp)
1418{
1419	simple_lock(&vp->v_interlock);
1420	vp->v_usecount++;
1421	simple_unlock(&vp->v_interlock);
1422}
1423
1424/*
1425 * Vnode put/release.
1426 * If count drops to zero, call inactive routine and return to freelist.
1427 */
1428void
1429vrele(vp)
1430	struct vnode *vp;
1431{
1432	struct proc *p = curproc;	/* XXX */
1433
1434	KASSERT(vp != NULL, ("vrele: null vp"));
1435	KASSERT(vp->v_writecount < vp->v_usecount, ("vrele: missed vn_close"));
1436
1437	simple_lock(&vp->v_interlock);
1438
1439	if (vp->v_usecount > 1) {
1440
1441		vp->v_usecount--;
1442		simple_unlock(&vp->v_interlock);
1443
1444		return;
1445	}
1446
1447	if (vp->v_usecount == 1) {
1448
1449		vp->v_usecount--;
1450		if (VSHOULDFREE(vp))
1451			vfree(vp);
1452	/*
1453	 * If we are doing a vput, the node is already locked, and we must
1454	 * call VOP_INACTIVE with the node locked.  So, in the case of
1455	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1456	 */
1457		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
1458			VOP_INACTIVE(vp, p);
1459		}
1460
1461	} else {
1462#ifdef DIAGNOSTIC
1463		vprint("vrele: negative ref count", vp);
1464		simple_unlock(&vp->v_interlock);
1465#endif
1466		panic("vrele: negative ref cnt");
1467	}
1468}
1469
1470void
1471vput(vp)
1472	struct vnode *vp;
1473{
1474	struct proc *p = curproc;	/* XXX */
1475
1476	KASSERT(vp != NULL, ("vput: null vp"));
1477	KASSERT(vp->v_writecount < vp->v_usecount, ("vput: missed vn_close"));
1478
1479	simple_lock(&vp->v_interlock);
1480
1481	if (vp->v_usecount > 1) {
1482
1483		vp->v_usecount--;
1484		VOP_UNLOCK(vp, LK_INTERLOCK, p);
1485		return;
1486
1487	}
1488
1489	if (vp->v_usecount == 1) {
1490
1491		vp->v_usecount--;
1492		if (VSHOULDFREE(vp))
1493			vfree(vp);
1494	/*
1495	 * If we are doing a vput, the node is already locked, and we must
1496	 * call VOP_INACTIVE with the node locked.  So, in the case of
1497	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1498	 */
1499		simple_unlock(&vp->v_interlock);
1500		VOP_INACTIVE(vp, p);
1501
1502	} else {
1503#ifdef DIAGNOSTIC
1504		vprint("vput: negative ref count", vp);
1505#endif
1506		panic("vput: negative ref cnt");
1507	}
1508}
1509
1510/*
1511 * Somebody doesn't want the vnode recycled.
1512 */
1513void
1514vhold(vp)
1515	register struct vnode *vp;
1516{
1517	int s;
1518
1519  	s = splbio();
1520	vp->v_holdcnt++;
1521	if (VSHOULDBUSY(vp))
1522		vbusy(vp);
1523	splx(s);
1524}
1525
1526/*
1527 * One less who cares about this vnode.
1528 */
1529void
1530vdrop(vp)
1531	register struct vnode *vp;
1532{
1533	int s;
1534
1535	s = splbio();
1536	if (vp->v_holdcnt <= 0)
1537		panic("vdrop: holdcnt");
1538	vp->v_holdcnt--;
1539	if (VSHOULDFREE(vp))
1540		vfree(vp);
1541	splx(s);
1542}
1543
1544/*
1545 * Remove any vnodes in the vnode table belonging to mount point mp.
1546 *
1547 * If FORCECLOSE is not specified, there should not be any active vnodes;
1548 * return an error if any are found (nb: this is a user error, not a
1549 * system error). If FORCECLOSE is specified, detach any active vnodes
1550 * that are found.
1551 */
1552#ifdef DIAGNOSTIC
1553static int busyprt = 0;		/* print out busy vnodes */
1554SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
1555#endif
1556
1557int
1558vflush(mp, skipvp, flags)
1559	struct mount *mp;
1560	struct vnode *skipvp;
1561	int flags;
1562{
1563	struct proc *p = curproc;	/* XXX */
1564	struct vnode *vp, *nvp;
1565	int busy = 0;
1566
1567	simple_lock(&mntvnode_slock);
1568loop:
1569	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
1570		/*
1571		 * Make sure this vnode wasn't reclaimed in getnewvnode().
1572		 * Start over if it has (it won't be on the list anymore).
1573		 */
1574		if (vp->v_mount != mp)
1575			goto loop;
1576		nvp = LIST_NEXT(vp, v_mntvnodes);
1577		/*
1578		 * Skip over a selected vnode.
1579		 */
1580		if (vp == skipvp)
1581			continue;
1582
1583		simple_lock(&vp->v_interlock);
1584		/*
1585		 * Skip over any vnodes marked VSYSTEM.
1586		 */
1587		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
1588			simple_unlock(&vp->v_interlock);
1589			continue;
1590		}
1591		/*
1592		 * If WRITECLOSE is set, only flush out regular file vnodes
1593		 * open for writing.
1594		 */
1595		if ((flags & WRITECLOSE) &&
1596		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
1597			simple_unlock(&vp->v_interlock);
1598			continue;
1599		}
1600
1601		/*
1602		 * With v_usecount == 0, all we need to do is clear out the
1603		 * vnode data structures and we are done.
1604		 */
1605		if (vp->v_usecount == 0) {
1606			simple_unlock(&mntvnode_slock);
1607			vgonel(vp, p);
1608			simple_lock(&mntvnode_slock);
1609			continue;
1610		}
1611
1612		/*
1613		 * If FORCECLOSE is set, forcibly close the vnode. For block
1614		 * or character devices, revert to an anonymous device. For
1615		 * all other files, just kill them.
1616		 */
1617		if (flags & FORCECLOSE) {
1618			simple_unlock(&mntvnode_slock);
1619			if (vp->v_type != VBLK && vp->v_type != VCHR) {
1620				vgonel(vp, p);
1621			} else {
1622				vclean(vp, 0, p);
1623				vp->v_op = spec_vnodeop_p;
1624				insmntque(vp, (struct mount *) 0);
1625			}
1626			simple_lock(&mntvnode_slock);
1627			continue;
1628		}
1629#ifdef DIAGNOSTIC
1630		if (busyprt)
1631			vprint("vflush: busy vnode", vp);
1632#endif
1633		simple_unlock(&vp->v_interlock);
1634		busy++;
1635	}
1636	simple_unlock(&mntvnode_slock);
1637	if (busy)
1638		return (EBUSY);
1639	return (0);
1640}
1641
1642/*
1643 * Disassociate the underlying file system from a vnode.
1644 */
1645static void
1646vclean(vp, flags, p)
1647	struct vnode *vp;
1648	int flags;
1649	struct proc *p;
1650{
1651	int active;
1652	vm_object_t obj;
1653
1654	/*
1655	 * Check to see if the vnode is in use. If so we have to reference it
1656	 * before we clean it out so that its count cannot fall to zero and
1657	 * generate a race against ourselves to recycle it.
1658	 */
1659	if ((active = vp->v_usecount))
1660		vp->v_usecount++;
1661
1662	/*
1663	 * Prevent the vnode from being recycled or brought into use while we
1664	 * clean it out.
1665	 */
1666	if (vp->v_flag & VXLOCK)
1667		panic("vclean: deadlock");
1668	vp->v_flag |= VXLOCK;
1669	/*
1670	 * Even if the count is zero, the VOP_INACTIVE routine may still
1671	 * have the object locked while it cleans it out. The VOP_LOCK
1672	 * ensures that the VOP_INACTIVE routine is done with its work.
1673	 * For active vnodes, it ensures that no other activity can
1674	 * occur while the underlying object is being cleaned out.
1675	 */
1676	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
1677
1678	/*
1679	 * Clean out any buffers associated with the vnode.
1680	 * If the flush fails, just toss the buffers.
1681	 */
1682	if (flags & DOCLOSE) {
1683		if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL)
1684			(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
1685		if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0)
1686			vinvalbuf(vp, 0, NOCRED, p, 0, 0);
1687	}
1688
1689	if ((obj = vp->v_object) != NULL) {
1690		if (obj->ref_count == 0) {
1691			/*
1692			 * vclean() may be called twice. The first time
1693			 * removes the primary reference to the object,
1694			 * the second time goes one further and is a
1695			 * special-case to terminate the object.
1696			 */
1697			vm_object_terminate(obj);
1698		} else {
1699			/*
1700			 * Woe to the process that tries to page now :-).
1701			 */
1702			vm_pager_deallocate(obj);
1703		}
1704	}
1705
1706	/*
1707	 * If purging an active vnode, it must be closed and
1708	 * deactivated before being reclaimed. Note that the
1709	 * VOP_INACTIVE will unlock the vnode.
1710	 */
1711	if (active) {
1712		if (flags & DOCLOSE)
1713			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
1714		VOP_INACTIVE(vp, p);
1715	} else {
1716		/*
1717		 * Any other processes trying to obtain this lock must first
1718		 * wait for VXLOCK to clear, then call the new lock operation.
1719		 */
1720		VOP_UNLOCK(vp, 0, p);
1721	}
1722	/*
1723	 * Reclaim the vnode.
1724	 */
1725	if (VOP_RECLAIM(vp, p))
1726		panic("vclean: cannot reclaim");
1727
1728	if (active) {
1729		/*
1730		 * Inline copy of vrele() since VOP_INACTIVE
1731		 * has already been called.
1732		 */
1733		simple_lock(&vp->v_interlock);
1734		if (--vp->v_usecount <= 0) {
1735#ifdef DIAGNOSTIC
1736			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1737				vprint("vclean: bad ref count", vp);
1738				panic("vclean: ref cnt");
1739			}
1740#endif
1741			vfree(vp);
1742		}
1743		simple_unlock(&vp->v_interlock);
1744	}
1745
1746	cache_purge(vp);
1747	if (vp->v_vnlock) {
1748		FREE(vp->v_vnlock, M_VNODE);
1749		vp->v_vnlock = NULL;
1750	}
1751
1752	if (VSHOULDFREE(vp))
1753		vfree(vp);
1754
1755	/*
1756	 * Done with purge, notify sleepers of the grim news.
1757	 */
1758	vp->v_op = dead_vnodeop_p;
1759	vn_pollgone(vp);
1760	vp->v_tag = VT_NON;
1761	vp->v_flag &= ~VXLOCK;
1762	if (vp->v_flag & VXWANT) {
1763		vp->v_flag &= ~VXWANT;
1764		wakeup((caddr_t) vp);
1765	}
1766}
1767
1768/*
1769 * Eliminate all activity associated with the requested vnode
1770 * and with all vnodes aliased to the requested vnode.
1771 */
1772int
1773vop_revoke(ap)
1774	struct vop_revoke_args /* {
1775		struct vnode *a_vp;
1776		int a_flags;
1777	} */ *ap;
1778{
1779	struct vnode *vp, *vq;
1780	dev_t dev;
1781
1782	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
1783
1784	vp = ap->a_vp;
1785	/*
1786	 * If a vgone (or vclean) is already in progress,
1787	 * wait until it is done and return.
1788	 */
1789	if (vp->v_flag & VXLOCK) {
1790		vp->v_flag |= VXWANT;
1791		simple_unlock(&vp->v_interlock);
1792		tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
1793		return (0);
1794	}
1795	dev = vp->v_rdev;
1796	for (;;) {
1797		simple_lock(&spechash_slock);
1798		vq = SLIST_FIRST(&dev->si_hlist);
1799		simple_unlock(&spechash_slock);
1800		if (!vq)
1801			break;
1802		vgone(vq);
1803	}
1804	return (0);
1805}
1806
1807/*
1808 * Recycle an unused vnode to the front of the free list.
1809 * Release the passed interlock if the vnode will be recycled.
1810 */
1811int
1812vrecycle(vp, inter_lkp, p)
1813	struct vnode *vp;
1814	struct simplelock *inter_lkp;
1815	struct proc *p;
1816{
1817
1818	simple_lock(&vp->v_interlock);
1819	if (vp->v_usecount == 0) {
1820		if (inter_lkp) {
1821			simple_unlock(inter_lkp);
1822		}
1823		vgonel(vp, p);
1824		return (1);
1825	}
1826	simple_unlock(&vp->v_interlock);
1827	return (0);
1828}
1829
1830/*
1831 * Eliminate all activity associated with a vnode
1832 * in preparation for reuse.
1833 */
1834void
1835vgone(vp)
1836	register struct vnode *vp;
1837{
1838	struct proc *p = curproc;	/* XXX */
1839
1840	simple_lock(&vp->v_interlock);
1841	vgonel(vp, p);
1842}
1843
1844/*
1845 * vgone, with the vp interlock held.
1846 */
1847void
1848vgonel(vp, p)
1849	struct vnode *vp;
1850	struct proc *p;
1851{
1852	int s;
1853
1854	/*
1855	 * If a vgone (or vclean) is already in progress,
1856	 * wait until it is done and return.
1857	 */
1858	if (vp->v_flag & VXLOCK) {
1859		vp->v_flag |= VXWANT;
1860		simple_unlock(&vp->v_interlock);
1861		tsleep((caddr_t)vp, PINOD, "vgone", 0);
1862		return;
1863	}
1864
1865	/*
1866	 * Clean out the filesystem specific data.
1867	 */
1868	vclean(vp, DOCLOSE, p);
1869	simple_lock(&vp->v_interlock);
1870
1871	/*
1872	 * Delete from old mount point vnode list, if on one.
1873	 */
1874	if (vp->v_mount != NULL)
1875		insmntque(vp, (struct mount *)0);
1876	/*
1877	 * If special device, remove it from special device alias list
1878	 * if it is on one.
1879	 */
1880	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) {
1881		simple_lock(&spechash_slock);
1882		SLIST_REMOVE(&vp->v_hashchain, vp, vnode, v_specnext);
1883		freedev(vp->v_rdev);
1884		simple_unlock(&spechash_slock);
1885		vp->v_rdev = NULL;
1886	}
1887
1888	/*
1889	 * If it is on the freelist and not already at the head,
1890	 * move it to the head of the list. The test of the
1891	 * VDOOMED flag and the reference count of zero is because
1892	 * it will be removed from the free list by getnewvnode,
1893	 * but will not have its reference count incremented until
1894	 * after calling vgone. If the reference count were
1895	 * incremented first, vgone would (incorrectly) try to
1896	 * close the previous instance of the underlying object.
1897	 */
1898	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
1899		s = splbio();
1900		simple_lock(&vnode_free_list_slock);
1901		if (vp->v_flag & VFREE)
1902			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1903		else
1904			freevnodes++;
1905		vp->v_flag |= VFREE;
1906		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1907		simple_unlock(&vnode_free_list_slock);
1908		splx(s);
1909	}
1910
1911	vp->v_type = VBAD;
1912	simple_unlock(&vp->v_interlock);
1913}
1914
1915/*
1916 * Lookup a vnode by device number.
1917 */
1918int
1919vfinddev(dev, type, vpp)
1920	dev_t dev;
1921	enum vtype type;
1922	struct vnode **vpp;
1923{
1924	struct vnode *vp;
1925
1926	simple_lock(&spechash_slock);
1927	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
1928		if (type == vp->v_type) {
1929			*vpp = vp;
1930			simple_unlock(&spechash_slock);
1931			return (1);
1932		}
1933	}
1934	simple_unlock(&spechash_slock);
1935	return (0);
1936}
1937
1938/*
1939 * Calculate the total number of references to a special device.
1940 */
1941int
1942vcount(vp)
1943	struct vnode *vp;
1944{
1945	struct vnode *vq;
1946	int count;
1947
1948	count = 0;
1949	simple_lock(&spechash_slock);
1950	SLIST_FOREACH(vq, &vp->v_hashchain, v_specnext)
1951		count += vq->v_usecount;
1952	simple_unlock(&spechash_slock);
1953	return (count);
1954}
1955
1956/*
1957 * Same as above, but using the dev_t as argument
1958 */
1959
1960int
1961count_dev(dev)
1962	dev_t dev;
1963{
1964	struct vnode *vp;
1965
1966	vp = SLIST_FIRST(&dev->si_hlist);
1967	if (vp == NULL)
1968		return (0);
1969	return(vcount(vp));
1970}
1971
1972/*
1973 * Print out a description of a vnode.
1974 */
1975static char *typename[] =
1976{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
1977
1978void
1979vprint(label, vp)
1980	char *label;
1981	struct vnode *vp;
1982{
1983	char buf[96];
1984
1985	if (label != NULL)
1986		printf("%s: %p: ", label, (void *)vp);
1987	else
1988		printf("%p: ", (void *)vp);
1989	printf("type %s, usecount %d, writecount %d, refcount %d,",
1990	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
1991	    vp->v_holdcnt);
1992	buf[0] = '\0';
1993	if (vp->v_flag & VROOT)
1994		strcat(buf, "|VROOT");
1995	if (vp->v_flag & VTEXT)
1996		strcat(buf, "|VTEXT");
1997	if (vp->v_flag & VSYSTEM)
1998		strcat(buf, "|VSYSTEM");
1999	if (vp->v_flag & VXLOCK)
2000		strcat(buf, "|VXLOCK");
2001	if (vp->v_flag & VXWANT)
2002		strcat(buf, "|VXWANT");
2003	if (vp->v_flag & VBWAIT)
2004		strcat(buf, "|VBWAIT");
2005	if (vp->v_flag & VDOOMED)
2006		strcat(buf, "|VDOOMED");
2007	if (vp->v_flag & VFREE)
2008		strcat(buf, "|VFREE");
2009	if (vp->v_flag & VOBJBUF)
2010		strcat(buf, "|VOBJBUF");
2011	if (buf[0] != '\0')
2012		printf(" flags (%s)", &buf[1]);
2013	if (vp->v_data == NULL) {
2014		printf("\n");
2015	} else {
2016		printf("\n\t");
2017		VOP_PRINT(vp);
2018	}
2019}
2020
2021#ifdef DDB
2022#include <ddb/ddb.h>
2023/*
2024 * List all of the locked vnodes in the system.
2025 * Called when debugging the kernel.
2026 */
2027DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
2028{
2029	struct proc *p = curproc;	/* XXX */
2030	struct mount *mp, *nmp;
2031	struct vnode *vp;
2032
2033	printf("Locked vnodes\n");
2034	simple_lock(&mountlist_slock);
2035	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2036		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
2037			nmp = TAILQ_NEXT(mp, mnt_list);
2038			continue;
2039		}
2040		LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
2041			if (VOP_ISLOCKED(vp, NULL))
2042				vprint((char *)0, vp);
2043		}
2044		simple_lock(&mountlist_slock);
2045		nmp = TAILQ_NEXT(mp, mnt_list);
2046		vfs_unbusy(mp, p);
2047	}
2048	simple_unlock(&mountlist_slock);
2049}
2050#endif
2051
2052/*
2053 * Top level filesystem related information gathering.
2054 */
2055static int	sysctl_ovfs_conf __P((SYSCTL_HANDLER_ARGS));
2056
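/*
 * Handler for the vfs.generic sysctl node.  Note that the name is
 * backed up by one component (the XXX lines below) so that name[1]
 * selects between the VFS_MAXTYPENUM and VFS_CONF requests handled
 * in the switch at the bottom.
 */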
2057static int
2058vfs_sysctl(SYSCTL_HANDLER_ARGS)
2059{
2060	int *name = (int *)arg1 - 1;	/* XXX */
2061	u_int namelen = arg2 + 1;	/* XXX */
2062	struct vfsconf *vfsp;
2063
2064#if 1 || defined(COMPAT_PRELITE2)
2065	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2066	if (namelen == 1)
2067		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2068#endif
2069
2070#ifdef notyet
2071	/* all sysctl names at this level are at least name and field */
2072	if (namelen < 2)
2073		return (ENOTDIR);		/* overloaded */
2074	if (name[0] != VFS_GENERIC) {
2075		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2076			if (vfsp->vfc_typenum == name[0])
2077				break;
2078		if (vfsp == NULL)
2079			return (EOPNOTSUPP);
2080		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
2081		    oldp, oldlenp, newp, newlen, p));
2082	}
2083#endif
2084	switch (name[1]) {
2085	case VFS_MAXTYPENUM:
2086		if (namelen != 2)
2087			return (ENOTDIR);
2088		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2089	case VFS_CONF:
2090		if (namelen != 3)
2091			return (ENOTDIR);	/* overloaded */
2092		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2093			if (vfsp->vfc_typenum == name[2])
2094				break;
2095		if (vfsp == NULL)
2096			return (EOPNOTSUPP);
2097		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
2098	}
2099	return (EOPNOTSUPP);
2100}
2101
2102SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
2103	"Generic filesystem");
2104
2105#if 1 || defined(COMPAT_PRELITE2)
2106
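/*
 * Old-style (pre-Lite2) dump of the configured filesystem types,
 * emitted as one struct ovfsconf per entry.
 */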
2107static int
2108sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
2109{
2110	int error;
2111	struct vfsconf *vfsp;
2112	struct ovfsconf ovfs;
2113
2114	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
2115		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
2116		strcpy(ovfs.vfc_name, vfsp->vfc_name);
2117		ovfs.vfc_index = vfsp->vfc_typenum;
2118		ovfs.vfc_refcount = vfsp->vfc_refcount;
2119		ovfs.vfc_flags = vfsp->vfc_flags;
2120		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2121		if (error)
2122			return error;
2123	}
2124	return 0;
2125}
2126
2127#endif /* 1 || COMPAT_PRELITE2 */
2128
2129#if 0
2130#define KINFO_VNODESLOP	10
2131/*
2132 * Dump vnode list (via sysctl).
2133 * Copyout address of vnode followed by vnode.
2134 */
2135/* ARGSUSED */
2136static int
2137sysctl_vnode(SYSCTL_HANDLER_ARGS)
2138{
2139	struct proc *p = curproc;	/* XXX */
2140	struct mount *mp, *nmp;
2141	struct vnode *nvp, *vp;
2142	int error;
2143
2144#define VPTRSZ	sizeof (struct vnode *)
2145#define VNODESZ	sizeof (struct vnode)
2146
2147	req->lock = 0;
2148	if (!req->oldptr) /* Make an estimate */
2149		return (SYSCTL_OUT(req, 0,
2150			(numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
2151
2152	simple_lock(&mountlist_slock);
2153	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2154		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
2155			nmp = TAILQ_NEXT(mp, mnt_list);
2156			continue;
2157		}
2158again:
2159		simple_lock(&mntvnode_slock);
2160		for (vp = LIST_FIRST(&mp->mnt_vnodelist);
2161		     vp != NULL;
2162		     vp = nvp) {
2163			/*
2164			 * Check that the vp is still associated with
2165			 * this filesystem.  RACE: could have been
2166			 * recycled onto the same filesystem.
2167			 */
2168			if (vp->v_mount != mp) {
2169				simple_unlock(&mntvnode_slock);
2170				goto again;
2171			}
2172			nvp = LIST_NEXT(vp, v_mntvnodes);
2173			simple_unlock(&mntvnode_slock);
2174			if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
2175			    (error = SYSCTL_OUT(req, vp, VNODESZ)))
2176				return (error);
2177			simple_lock(&mntvnode_slock);
2178		}
2179		simple_unlock(&mntvnode_slock);
2180		simple_lock(&mountlist_slock);
2181		nmp = TAILQ_NEXT(mp, mnt_list);
2182		vfs_unbusy(mp, p);
2183	}
2184	simple_unlock(&mountlist_slock);
2185
2186	return (0);
2187}
2188#endif
2189
2190/*
2191 * XXX
2192 * Exporting the vnode list on large systems causes them to crash.
2193 * Exporting the vnode list on medium systems causes sysctl to coredump.
2194 */
2195#if 0
2196SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
2197	0, 0, sysctl_vnode, "S,vnode", "");
2198#endif
2199
2200/*
2201 * Check to see if a filesystem is mounted on a block device.
2202 */
2203int
2204vfs_mountedon(vp)
2205	struct vnode *vp;
2206{
2207
2208	if (vp->v_specmountpoint != NULL)
2209		return (EBUSY);
2210	return (0);
2211}
2212
2213/*
2214 * Unmount all filesystems. The list is traversed in reverse order
2215 * of mounting to avoid dependencies.
2216 */
2217void
2218vfs_unmountall()
2219{
2220	struct mount *mp;
2221	struct proc *p;
2222	int error;
2223
2224	if (curproc != NULL)
2225		p = curproc;
2226	else
2227		p = initproc;	/* XXX XXX should this be proc0? */
2228	/*
2229	 * Since this only runs when rebooting, it is not interlocked.
2230	 */
2231	while (!TAILQ_EMPTY(&mountlist)) {
2232		mp = TAILQ_LAST(&mountlist, mntlist);
2233		error = dounmount(mp, MNT_FORCE, p);
2234		if (error) {
2235			TAILQ_REMOVE(&mountlist, mp, mnt_list);
2236			printf("unmount of %s failed (",
2237			    mp->mnt_stat.f_mntonname);
2238			if (error == EBUSY)
2239				printf("BUSY)\n");
2240			else
2241				printf("%d)\n", error);
2242		} else {
2243			/* The unmount has removed mp from the mountlist */
2244		}
2245	}
2246}
2247
2248/*
2249 * Build hash lists of net addresses and hang them off the mount point.
2250 * Called by ufs_mount() to set up the lists of export addresses.
2251 */
2252static int
2253vfs_hang_addrlist(mp, nep, argp)
2254	struct mount *mp;
2255	struct netexport *nep;
2256	struct export_args *argp;
2257{
2258	register struct netcred *np;
2259	register struct radix_node_head *rnh;
2260	register int i;
2261	struct radix_node *rn;
2262	struct sockaddr *saddr, *smask = 0;
2263	struct domain *dom;
2264	int error;
2265
2266	if (argp->ex_addrlen == 0) {
2267		if (mp->mnt_flag & MNT_DEFEXPORTED)
2268			return (EPERM);
2269		np = &nep->ne_defexported;
2270		np->netc_exflags = argp->ex_flags;
2271		np->netc_anon = argp->ex_anon;
2272		np->netc_anon.cr_ref = 1;
2273		mp->mnt_flag |= MNT_DEFEXPORTED;
2274		return (0);
2275	}
2276	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
2277	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
2278	bzero((caddr_t) np, i);
2279	saddr = (struct sockaddr *) (np + 1);
2280	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
2281		goto out;
2282	if (saddr->sa_len > argp->ex_addrlen)
2283		saddr->sa_len = argp->ex_addrlen;
2284	if (argp->ex_masklen) {
2285		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
2286		error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
2287		if (error)
2288			goto out;
2289		if (smask->sa_len > argp->ex_masklen)
2290			smask->sa_len = argp->ex_masklen;
2291	}
2292	i = saddr->sa_family;
2293	if ((rnh = nep->ne_rtable[i]) == 0) {
2294		/*
2295		 * It seems silly to initialize every AF when most are not used,
2296		 * so do so on demand here.
2297		 */
2298		for (dom = domains; dom; dom = dom->dom_next)
2299			if (dom->dom_family == i && dom->dom_rtattach) {
2300				dom->dom_rtattach((void **) &nep->ne_rtable[i],
2301				    dom->dom_rtoffset);
2302				break;
2303			}
2304		if ((rnh = nep->ne_rtable[i]) == 0) {
2305			error = ENOBUFS;
2306			goto out;
2307		}
2308	}
2309	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
2310	    np->netc_rnodes);
2311	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
2312		error = EPERM;
2313		goto out;
2314	}
2315	np->netc_exflags = argp->ex_flags;
2316	np->netc_anon = argp->ex_anon;
2317	np->netc_anon.cr_ref = 1;
2318	return (0);
2319out:
2320	free(np, M_NETADDR);
2321	return (error);
2322}
2323
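/*
 * Walker callback for vfs_free_addrlist(): delete the radix tree
 * entry and free the netcred structure that vfs_hang_addrlist()
 * allocated for it.
 */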
2324/* ARGSUSED */
2325static int
2326vfs_free_netcred(rn, w)
2327	struct radix_node *rn;
2328	void *w;
2329{
2330	register struct radix_node_head *rnh = (struct radix_node_head *) w;
2331
2332	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
2333	free((caddr_t) rn, M_NETADDR);
2334	return (0);
2335}
2336
2337/*
2338 * Free the net address hash lists that are hanging off the mount points.
2339 */
2340static void
2341vfs_free_addrlist(nep)
2342	struct netexport *nep;
2343{
2344	register int i;
2345	register struct radix_node_head *rnh;
2346
2347	for (i = 0; i <= AF_MAX; i++)
2348		if ((rnh = nep->ne_rtable[i])) {
2349			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
2350			    (caddr_t) rnh);
2351			free((caddr_t) rnh, M_RTABLE);
2352			nep->ne_rtable[i] = 0;
2353		}
2354}
2355
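/*
 * Process an export_args request for a mount point, typically on
 * behalf of a filesystem's mount routine.  MNT_DELEXPORT tears down
 * the existing export information (including a WebNFS public export),
 * while MNT_EXPORTED (optionally with MNT_EXPUBLIC) installs new
 * address lists via vfs_hang_addrlist() and vfs_setpublicfs().
 */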
2356int
2357vfs_export(mp, nep, argp)
2358	struct mount *mp;
2359	struct netexport *nep;
2360	struct export_args *argp;
2361{
2362	int error;
2363
2364	if (argp->ex_flags & MNT_DELEXPORT) {
2365		if (mp->mnt_flag & MNT_EXPUBLIC) {
2366			vfs_setpublicfs(NULL, NULL, NULL);
2367			mp->mnt_flag &= ~MNT_EXPUBLIC;
2368		}
2369		vfs_free_addrlist(nep);
2370		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
2371	}
2372	if (argp->ex_flags & MNT_EXPORTED) {
2373		if (argp->ex_flags & MNT_EXPUBLIC) {
2374			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
2375				return (error);
2376			mp->mnt_flag |= MNT_EXPUBLIC;
2377		}
2378		if ((error = vfs_hang_addrlist(mp, nep, argp)))
2379			return (error);
2380		mp->mnt_flag |= MNT_EXPORTED;
2381	}
2382	return (0);
2383}
2384
2385
2386/*
2387 * Set the publicly exported filesystem (WebNFS). Currently, only
2388 * one public filesystem is possible in the spec (RFC 2054 and 2055).
2389 */
2390int
2391vfs_setpublicfs(mp, nep, argp)
2392	struct mount *mp;
2393	struct netexport *nep;
2394	struct export_args *argp;
2395{
2396	int error;
2397	struct vnode *rvp;
2398	char *cp;
2399
2400	/*
2401	 * mp == NULL -> invalidate the current info, the FS is
2402	 * no longer exported. May be called from either vfs_export
2403	 * or unmount, so check if it hasn't already been done.
2404	 */
2405	if (mp == NULL) {
2406		if (nfs_pub.np_valid) {
2407			nfs_pub.np_valid = 0;
2408			if (nfs_pub.np_index != NULL) {
2409				FREE(nfs_pub.np_index, M_TEMP);
2410				nfs_pub.np_index = NULL;
2411			}
2412		}
2413		return (0);
2414	}
2415
2416	/*
2417	 * Only one allowed at a time.
2418	 */
2419	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
2420		return (EBUSY);
2421
2422	/*
2423	 * Get real filehandle for root of exported FS.
2424	 */
2425	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
2426	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
2427
2428	if ((error = VFS_ROOT(mp, &rvp)))
2429		return (error);
2430
2431	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
2432		return (error);
2433
2434	vput(rvp);
2435
2436	/*
2437	 * If an indexfile was specified, pull it in.
2438	 */
2439	if (argp->ex_indexfile != NULL) {
2440		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
2441		    M_WAITOK);
2442		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
2443		    MAXNAMLEN, (size_t *)0);
2444		if (!error) {
2445			/*
2446			 * Check for illegal filenames.
2447			 */
2448			for (cp = nfs_pub.np_index; *cp; cp++) {
2449				if (*cp == '/') {
2450					error = EINVAL;
2451					break;
2452				}
2453			}
2454		}
2455		if (error) {
2456			FREE(nfs_pub.np_index, M_TEMP);
2457			return (error);
2458		}
2459	}
2460
2461	nfs_pub.np_mount = mp;
2462	nfs_pub.np_valid = 1;
2463	return (0);
2464}
2465
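/*
 * Look up the export credentials that apply to a request from the
 * given network address on an exported mount point.  If no
 * address-specific entry matches, fall back to the default export
 * entry, if any.
 */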
2466struct netcred *
2467vfs_export_lookup(mp, nep, nam)
2468	register struct mount *mp;
2469	struct netexport *nep;
2470	struct sockaddr *nam;
2471{
2472	register struct netcred *np;
2473	register struct radix_node_head *rnh;
2474	struct sockaddr *saddr;
2475
2476	np = NULL;
2477	if (mp->mnt_flag & MNT_EXPORTED) {
2478		/*
2479		 * Lookup in the export list first.
2480		 */
2481		if (nam != NULL) {
2482			saddr = nam;
2483			rnh = nep->ne_rtable[saddr->sa_family];
2484			if (rnh != NULL) {
2485				np = (struct netcred *)
2486					(*rnh->rnh_matchaddr)((caddr_t)saddr,
2487							      rnh);
2488				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
2489					np = NULL;
2490			}
2491		}
2492		/*
2493		 * If no address match, use the default if it exists.
2494		 */
2495		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
2496			np = &nep->ne_defexported;
2497	}
2498	return (np);
2499}
2500
2501/*
2502 * Perform msync on all vnodes under a mount point.
2503 * The mount point must be locked.
2504 */
2505void
2506vfs_msync(struct mount *mp, int flags) {
2507	struct vnode *vp, *nvp;
2508	struct vm_object *obj;
2509	int anyio, tries;
2510
2511	tries = 5;
2512loop:
2513	anyio = 0;
2514	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) {
2515
2516		nvp = LIST_NEXT(vp, v_mntvnodes);
2517
2518		if (vp->v_mount != mp) {
2519			goto loop;
2520		}
2521
2522		if (vp->v_flag & VXLOCK)	/* XXX: what if MNT_WAIT? */
2523			continue;
2524
2525		if (flags != MNT_WAIT) {
2526			obj = vp->v_object;
2527			if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
2528				continue;
2529			if (VOP_ISLOCKED(vp, NULL))
2530				continue;
2531		}
2532
2533		simple_lock(&vp->v_interlock);
2534		if (vp->v_object &&
2535		   (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
2536			if (!vget(vp,
2537				LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
2538				if (vp->v_object) {
2539					vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
2540					anyio = 1;
2541				}
2542				vput(vp);
2543			}
2544		} else {
2545			simple_unlock(&vp->v_interlock);
2546		}
2547	}
2548	if (anyio && (--tries > 0))
2549		goto loop;
2550}
2551
2552/*
2553 * Create the VM object needed for VMIO and mmap support.  This
2554 * is done for all VREG files in the system.  Some filesystems can
2555 * take advantage of the additional metadata buffering capability of
2556 * the VMIO code by making the device node VMIO mode as well.
2557 *
2558 * vp must be locked when vfs_object_create is called.
2559 */
2560int
2561vfs_object_create(vp, p, cred)
2562	struct vnode *vp;
2563	struct proc *p;
2564	struct ucred *cred;
2565{
2566	struct vattr vat;
2567	vm_object_t object;
2568	int error = 0;
2569
2570	if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE)
2571		return 0;
2572
2573retry:
2574	if ((object = vp->v_object) == NULL) {
2575		if (vp->v_type == VREG || vp->v_type == VDIR) {
2576			if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
2577				goto retn;
2578			object = vnode_pager_alloc(vp, vat.va_size, 0, 0);
2579		} else if (devsw(vp->v_rdev) != NULL) {
2580			/*
2581			 * This simply allocates the biggest object possible
2582			 * for a disk vnode.  This should be fixed, but doesn't
2583			 * cause any problems (yet).
2584			 */
2585			object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0);
2586		} else {
2587			goto retn;
2588		}
2589		/*
2590		 * Dereference the reference we just created.  This assumes
2591		 * that the object is associated with the vp.
2592		 */
2593		object->ref_count--;
2594		vp->v_usecount--;
2595	} else {
2596		if (object->flags & OBJ_DEAD) {
2597			VOP_UNLOCK(vp, 0, p);
2598			tsleep(object, PVM, "vodead", 0);
2599			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
2600			goto retry;
2601		}
2602	}
2603
2604	KASSERT(vp->v_object != NULL, ("vfs_object_create: NULL object"));
2605	vp->v_flag |= VOBJBUF;
2606
2607retn:
2608	return error;
2609}
2610
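/*
 * Place a vnode on the free list.  Vnodes marked VAGE are inserted at
 * the head so that they are recycled first; all others go to the tail.
 */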
2611void
2612vfree(vp)
2613	struct vnode *vp;
2614{
2615	int s;
2616
2617	s = splbio();
2618	simple_lock(&vnode_free_list_slock);
2619	KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free"));
2620	if (vp->v_flag & VAGE) {
2621		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2622	} else {
2623		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
2624	}
2625	freevnodes++;
2626	simple_unlock(&vnode_free_list_slock);
2627	vp->v_flag &= ~VAGE;
2628	vp->v_flag |= VFREE;
2629	splx(s);
2630}
2631
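/*
 * Remove a vnode from the free list when it comes back into use,
 * clearing its VFREE and VAGE flags.
 */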
2632void
2633vbusy(vp)
2634	struct vnode *vp;
2635{
2636	int s;
2637
2638	s = splbio();
2639	simple_lock(&vnode_free_list_slock);
2640	KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free"));
2641	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2642	freevnodes--;
2643	simple_unlock(&vnode_free_list_slock);
2644	vp->v_flag &= ~(VFREE|VAGE);
2645	splx(s);
2646}
2647
2648/*
2649 * Record a process's interest in events which might happen to
2650 * a vnode.  Because poll uses the historic select-style interface
2651 * internally, this routine serves as both the ``check for any
2652 * pending events'' and the ``record my interest in future events''
2653 * functions.  (These are done together, while the lock is held,
2654 * to avoid race conditions.)
2655 */
2656int
2657vn_pollrecord(vp, p, events)
2658	struct vnode *vp;
2659	struct proc *p;
2660	short events;
2661{
2662	simple_lock(&vp->v_pollinfo.vpi_lock);
2663	if (vp->v_pollinfo.vpi_revents & events) {
2664		/*
2665		 * This leaves events we are not interested
2666		 * in available for the other process which
2667		 * presumably had requested them
2668		 * (otherwise they would never have been
2669		 * recorded).
2670		 */
2671		events &= vp->v_pollinfo.vpi_revents;
2672		vp->v_pollinfo.vpi_revents &= ~events;
2673
2674		simple_unlock(&vp->v_pollinfo.vpi_lock);
2675		return events;
2676	}
2677	vp->v_pollinfo.vpi_events |= events;
2678	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
2679	simple_unlock(&vp->v_pollinfo.vpi_lock);
2680	return 0;
2681}
2682
2683/*
2684 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
2685 * it is possible for us to miss an event due to race conditions, but
2686 * that condition is expected to be rare, so for the moment it is the
2687 * preferred interface.
2688 */
2689void
2690vn_pollevent(vp, events)
2691	struct vnode *vp;
2692	short events;
2693{
2694	simple_lock(&vp->v_pollinfo.vpi_lock);
2695	if (vp->v_pollinfo.vpi_events & events) {
2696		/*
2697		 * We clear vpi_events so that we don't
2698		 * call selwakeup() twice if two events are
2699		 * posted before the polling process(es) is
2700		 * awakened.  This also ensures that we take at
2701		 * most one selwakeup() if the polling process
2702		 * is no longer interested.  However, it does
2703		 * mean that only one event can be noticed at
2704		 * a time.  (Perhaps we should only clear those
2705		 * event bits which we note?) XXX
2706		 */
2707		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
2708		vp->v_pollinfo.vpi_revents |= events;
2709		selwakeup(&vp->v_pollinfo.vpi_selinfo);
2710	}
2711	simple_unlock(&vp->v_pollinfo.vpi_lock);
2712}
2713
2714/*
2715 * Wake up anyone polling on vp because it is being revoked.
2716 * This depends on dead_poll() returning POLLHUP for correct
2717 * behavior.
2718 */
2719void
2720vn_pollgone(vp)
2721	struct vnode *vp;
2722{
2723	simple_lock(&vp->v_pollinfo.vpi_lock);
2724	if (vp->v_pollinfo.vpi_events) {
2725		vp->v_pollinfo.vpi_events = 0;
2726		selwakeup(&vp->v_pollinfo.vpi_selinfo);
2727	}
2728	simple_unlock(&vp->v_pollinfo.vpi_lock);
2729}
2730
2731
2732
2733/*
2734 * Routine to create and manage a filesystem syncer vnode.
2735 */
2736#define sync_close ((int (*) __P((struct  vop_close_args *)))nullop)
2737static int	sync_fsync __P((struct  vop_fsync_args *));
2738static int	sync_inactive __P((struct  vop_inactive_args *));
2739static int	sync_reclaim  __P((struct  vop_reclaim_args *));
2740#define sync_lock ((int (*) __P((struct  vop_lock_args *)))vop_nolock)
2741#define sync_unlock ((int (*) __P((struct  vop_unlock_args *)))vop_nounlock)
2742static int	sync_print __P((struct vop_print_args *));
2743#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)
2744
2745static vop_t **sync_vnodeop_p;
2746static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
2747	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
2748	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
2749	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
2750	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
2751	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
2752	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
2753	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
2754	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
2755	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
2756	{ NULL, NULL }
2757};
2758static struct vnodeopv_desc sync_vnodeop_opv_desc =
2759	{ &sync_vnodeop_p, sync_vnodeop_entries };
2760
2761VNODEOP_SET(sync_vnodeop_opv_desc);
2762
2763/*
2764 * Create a new filesystem syncer vnode for the specified mount point.
2765 */
2766int
2767vfs_allocate_syncvnode(mp)
2768	struct mount *mp;
2769{
2770	struct vnode *vp;
2771	static long start, incr, next;
2772	int error;
2773
2774	/* Allocate a new vnode */
2775	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
2776		mp->mnt_syncer = NULL;
2777		return (error);
2778	}
2779	vp->v_type = VNON;
2780	/*
2781	 * Place the vnode onto the syncer worklist. We attempt to
2782	 * scatter them about on the list so that they will go off
2783	 * at evenly distributed times even if all the filesystems
2784	 * are mounted at once.
2785	 */
2786	next += incr;
2787	if (next == 0 || next > syncer_maxdelay) {
2788		start /= 2;
2789		incr /= 2;
2790		if (start == 0) {
2791			start = syncer_maxdelay / 2;
2792			incr = syncer_maxdelay;
2793		}
2794		next = start;
2795	}
2796	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
2797	mp->mnt_syncer = vp;
2798	return (0);
2799}
2800
2801/*
2802 * Do a lazy sync of the filesystem.
2803 */
2804static int
2805sync_fsync(ap)
2806	struct vop_fsync_args /* {
2807		struct vnode *a_vp;
2808		struct ucred *a_cred;
2809		int a_waitfor;
2810		struct proc *a_p;
2811	} */ *ap;
2812{
2813	struct vnode *syncvp = ap->a_vp;
2814	struct mount *mp = syncvp->v_mount;
2815	struct proc *p = ap->a_p;
2816	int asyncflag;
2817
2818	/*
2819	 * We only need to do something if this is a lazy evaluation.
2820	 */
2821	if (ap->a_waitfor != MNT_LAZY)
2822		return (0);
2823
2824	/*
2825	 * Move ourselves to the back of the sync list.
2826	 */
2827	vn_syncer_add_to_worklist(syncvp, syncdelay);
2828
2829	/*
2830	 * Walk the list of vnodes pushing all that are dirty and
2831	 * not already on the sync list.
2832	 */
2833	simple_lock(&mountlist_slock);
2834	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
2835		simple_unlock(&mountlist_slock);
2836		return (0);
2837	}
2838	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
2839		vfs_unbusy(mp, p);
2840		simple_unlock(&mountlist_slock);
2841		return (0);
2842	}
2843	asyncflag = mp->mnt_flag & MNT_ASYNC;
2844	mp->mnt_flag &= ~MNT_ASYNC;
2845	vfs_msync(mp, MNT_NOWAIT);
2846	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
2847	if (asyncflag)
2848		mp->mnt_flag |= MNT_ASYNC;
2849	vn_finished_write(mp);
2850	vfs_unbusy(mp, p);
2851	return (0);
2852}
2853
2854/*
2855 * The syncer vnode is no longer referenced.
2856 */
2857static int
2858sync_inactive(ap)
2859	struct vop_inactive_args /* {
2860		struct vnode *a_vp;
2861		struct proc *a_p;
2862	} */ *ap;
2863{
2864
2865	vgone(ap->a_vp);
2866	return (0);
2867}
2868
2869/*
2870 * The syncer vnode is no longer needed and is being decommissioned.
2871 *
2872 * Modifications to the worklist must be protected at splbio().
2873 */
2874static int
2875sync_reclaim(ap)
2876	struct vop_reclaim_args /* {
2877		struct vnode *a_vp;
2878	} */ *ap;
2879{
2880	struct vnode *vp = ap->a_vp;
2881	int s;
2882
2883	s = splbio();
2884	vp->v_mount->mnt_syncer = NULL;
2885	if (vp->v_flag & VONWORKLST) {
2886		LIST_REMOVE(vp, v_synclist);
2887		vp->v_flag &= ~VONWORKLST;
2888	}
2889	splx(s);
2890
2891	return (0);
2892}
2893
2894/*
2895 * Print out a syncer vnode.
2896 */
2897static int
2898sync_print(ap)
2899	struct vop_print_args /* {
2900		struct vnode *a_vp;
2901	} */ *ap;
2902{
2903	struct vnode *vp = ap->a_vp;
2904
2905	printf("syncer vnode");
2906	if (vp->v_vnlock != NULL)
2907		lockmgr_printinfo(vp->v_vnlock);
2908	printf("\n");
2909	return (0);
2910}
2911
2912/*
2913 * Extract the dev_t from a VBLK or VCHR vnode.
2914 */
2915dev_t
2916vn_todev(vp)
2917	struct vnode *vp;
2918{
2919	if (vp->v_type != VBLK && vp->v_type != VCHR)
2920		return (NODEV);
2921	return (vp->v_rdev);
2922}
2923
2924/*
2925 * Check whether a vnode represents a disk device.
2926 */
2927int
2928vn_isdisk(vp, errp)
2929	struct vnode *vp;
2930	int *errp;
2931{
2932	struct cdevsw *cdevsw;
2933
2934	if (vp->v_type != VBLK && vp->v_type != VCHR) {
2935		if (errp != NULL)
2936			*errp = ENOTBLK;
2937		return (0);
2938	}
2939	if (vp->v_rdev == NULL) {
2940		if (errp != NULL)
2941			*errp = ENXIO;
2942		return (0);
2943	}
2944	cdevsw = devsw(vp->v_rdev);
2945	if (cdevsw == NULL) {
2946		if (errp != NULL)
2947			*errp = ENXIO;
2948		return (0);
2949	}
2950	if (!(cdevsw->d_flags & D_DISK)) {
2951		if (errp != NULL)
2952			*errp = ENOTBLK;
2953		return (0);
2954	}
2955	if (errp != NULL)
2956		*errp = 0;
2957	return (1);
2958}
2959
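/*
 * Release the resources held in a nameidata structure after a namei()
 * call: the pathname buffer plus any locks and references on the
 * parent vnode, the leaf vnode, and the saved start directory.  The
 * NDF_NO_* flags let callers skip individual pieces of the cleanup.
 */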
2960void
2961NDFREE(ndp, flags)
2962     struct nameidata *ndp;
2963     const uint flags;
2964{
2965	if (!(flags & NDF_NO_FREE_PNBUF) &&
2966	    (ndp->ni_cnd.cn_flags & HASBUF)) {
2967		zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
2968		ndp->ni_cnd.cn_flags &= ~HASBUF;
2969	}
2970	if (!(flags & NDF_NO_DVP_UNLOCK) &&
2971	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
2972	    ndp->ni_dvp != ndp->ni_vp)
2973		VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_proc);
2974	if (!(flags & NDF_NO_DVP_RELE) &&
2975	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
2976		vrele(ndp->ni_dvp);
2977		ndp->ni_dvp = NULL;
2978	}
2979	if (!(flags & NDF_NO_VP_UNLOCK) &&
2980	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
2981		VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_proc);
2982	if (!(flags & NDF_NO_VP_RELE) &&
2983	    ndp->ni_vp) {
2984		vrele(ndp->ni_vp);
2985		ndp->ni_vp = NULL;
2986	}
2987	if (!(flags & NDF_NO_STARTDIR_RELE) &&
2988	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
2989		vrele(ndp->ni_startdir);
2990		ndp->ni_startdir = NULL;
2991	}
2992}
2993
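/*
 * Common discretionary access check for filesystems: evaluate the
 * owner, group, and other bits of file_mode against the requested
 * acc_mode (VREAD, VWRITE, VEXEC), then fall back to superuser
 * privilege and, when CAPABILITIES is configured, to the individual
 * DAC capabilities.  *privused, if non-NULL, is set when privilege
 * rather than the mode bits granted the access.
 *
 * A filesystem's access routine might use this roughly as follows
 * (an illustrative sketch only; where the mode, uid, and gid come
 * from varies by filesystem):
 *
 *	return (vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
 *	    ap->a_mode, ap->a_cred, NULL));
 */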
2994int
2995vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
2996	enum vtype type;
2997	mode_t file_mode;
2998	uid_t file_uid;
2999	gid_t file_gid;
3000	mode_t acc_mode;
3001	struct ucred *cred;
3002	int *privused;
3003{
3004	mode_t dac_granted;
3005#ifdef CAPABILITIES
3006	mode_t cap_granted;
3007#endif
3008
3009	/*
3010	 * Look for a normal, non-privileged way to access the file/directory
3011	 * as requested.  If it exists, go with that.
3012	 */
3013
3014	if (privused != NULL)
3015		*privused = 0;
3016
3017	dac_granted = 0;
3018
3019	/* Check the owner. */
3020	if (cred->cr_uid == file_uid) {
3021		if (file_mode & S_IXUSR)
3022			dac_granted |= VEXEC;
3023		if (file_mode & S_IRUSR)
3024			dac_granted |= VREAD;
3025		if (file_mode & S_IWUSR)
3026			dac_granted |= VWRITE;
3027
3028		if ((acc_mode & dac_granted) == acc_mode)
3029			return (0);
3030
3031		goto privcheck;
3032	}
3033
3034	/* Otherwise, check the groups (first match) */
3035	if (groupmember(file_gid, cred)) {
3036		if (file_mode & S_IXGRP)
3037			dac_granted |= VEXEC;
3038		if (file_mode & S_IRGRP)
3039			dac_granted |= VREAD;
3040		if (file_mode & S_IWGRP)
3041			dac_granted |= VWRITE;
3042
3043		if ((acc_mode & dac_granted) == acc_mode)
3044			return (0);
3045
3046		goto privcheck;
3047	}
3048
3049	/* Otherwise, check everyone else. */
3050	if (file_mode & S_IXOTH)
3051		dac_granted |= VEXEC;
3052	if (file_mode & S_IROTH)
3053		dac_granted |= VREAD;
3054	if (file_mode & S_IWOTH)
3055		dac_granted |= VWRITE;
3056	if ((acc_mode & dac_granted) == acc_mode)
3057		return (0);
3058
3059privcheck:
3060	if (!suser_xxx(cred, NULL, PRISON_ROOT)) {
3061		/* XXX audit: privilege used */
3062		if (privused != NULL)
3063			*privused = 1;
3064		return (0);
3065	}
3066
3067#ifdef CAPABILITIES
3068	/*
3069	 * Build a capability mask to determine if the set of capabilities
3070	 * satisfies the requirements when combined with the granted mask
3071	 * from above.
3072	 * For each capability, if the capability is required, bitwise
3073	 * or the request type onto the cap_granted mask.
3074	 */
3075	cap_granted = 0;
3076	if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3077	    !cap_check_xxx(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT))
3078		cap_granted |= VEXEC;
3079
3080	if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
3081	    !cap_check_xxx(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3082		cap_granted |= VREAD;
3083
3084	if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3085	    !cap_check_xxx(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT))
3086		cap_granted |= VWRITE;
3087
3088	if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
3089		/* XXX audit: privilege used */
3090		if (privused != NULL)
3091			*privused = 1;
3092		return (0);
3093	}
3094#endif
3095
3096	return (EACCES);
3097}
3098