vfs_subr.c revision 88465
1/*
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
39 * $FreeBSD: head/sys/kern/vfs_subr.c 88465 2001-12-25 01:23:25Z dillon $
40 */
41
42/*
43 * External virtual filesystem routines
44 */
45#include "opt_ddb.h"
46#include "opt_ffs.h"
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/bio.h>
51#include <sys/buf.h>
52#include <sys/conf.h>
53#include <sys/eventhandler.h>
54#include <sys/fcntl.h>
55#include <sys/kernel.h>
56#include <sys/kthread.h>
57#include <sys/malloc.h>
58#include <sys/mount.h>
59#include <sys/namei.h>
60#include <sys/stat.h>
61#include <sys/sysctl.h>
62#include <sys/syslog.h>
63#include <sys/vmmeter.h>
64#include <sys/vnode.h>
65
66#include <vm/vm.h>
67#include <vm/vm_object.h>
68#include <vm/vm_extern.h>
69#include <vm/pmap.h>
70#include <vm/vm_map.h>
71#include <vm/vm_page.h>
72#include <vm/vm_zone.h>
73
74static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
75
76static void	addalias __P((struct vnode *vp, dev_t nvp_rdev));
77static void	insmntque __P((struct vnode *vp, struct mount *mp));
78static void	vclean __P((struct vnode *vp, int flags, struct thread *td));
79
80/*
81 * Number of vnodes in existence.  Increased whenever getnewvnode()
82 * allocates a new vnode, never decreased.
83 */
84static unsigned long	numvnodes;
85SYSCTL_LONG(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
86
87/*
88 * Conversion tables for conversion from vnode types to inode formats
89 * and back.
90 */
91enum vtype iftovt_tab[16] = {
92	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
93	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
94};
95int vttoif_tab[9] = {
96	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
97	S_IFSOCK, S_IFIFO, S_IFMT,
98};
99
100/*
101 * List of vnodes that are ready for recycling.
102 */
103static TAILQ_HEAD(freelst, vnode) vnode_free_list;
104
105/*
106 * Minimum number of free vnodes.  If there are fewer free vnodes than this,
107 * getnewvnode() will return a newly allocated vnode.
108 */
109static u_long wantfreevnodes = 25;
110SYSCTL_LONG(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
111/* Number of vnodes in the free list. */
112static u_long freevnodes;
113SYSCTL_LONG(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
114
115#if 0
116/* Number of vnode allocations. */
117static u_long vnodeallocs;
118SYSCTL_LONG(_debug, OID_AUTO, vnodeallocs, CTLFLAG_RD, &vnodeallocs, 0, "");
119/* Period of vnode recycling from the namecache, measured in vnode allocations. */
120static u_long vnoderecycleperiod = 1000;
121SYSCTL_LONG(_debug, OID_AUTO, vnoderecycleperiod, CTLFLAG_RW, &vnoderecycleperiod, 0, "");
122/* Minimum number of total vnodes required to invoke vnode recycle from namecache. */
123static u_long vnoderecyclemintotalvn = 2000;
124SYSCTL_LONG(_debug, OID_AUTO, vnoderecyclemintotalvn, CTLFLAG_RW, &vnoderecyclemintotalvn, 0, "");
125/* Minimum number of free vnodes required to invoke vnode recycle from namecache. */
126static u_long vnoderecycleminfreevn = 2000;
127SYSCTL_LONG(_debug, OID_AUTO, vnoderecycleminfreevn, CTLFLAG_RW, &vnoderecycleminfreevn, 0, "");
128/* Number of vnodes attempted to recycle at a time. */
129static u_long vnoderecyclenumber = 3000;
130SYSCTL_LONG(_debug, OID_AUTO, vnoderecyclenumber, CTLFLAG_RW, &vnoderecyclenumber, 0, "");
131#endif
132
133/*
134 * Various variables used for debugging the new implementation of
135 * reassignbuf().
136 * XXX these are probably of (very) limited utility now.
137 */
138static int reassignbufcalls;
139SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
140static int reassignbufloops;
141SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
142static int reassignbufsortgood;
143SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
144static int reassignbufsortbad;
145SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
146/* Set to 0 for old insertion-sort based reassignbuf, 1 for modern method. */
147static int reassignbufmethod = 1;
148SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
149static int nameileafonly;
150SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");
151
152#ifdef ENABLE_VFS_IOOPT
153/* See NOTES for a description of this setting. */
154int vfs_ioopt;
155SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
156#endif
157
158/* List of mounted filesystems. */
159struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
160
161/* For any iteration/modification of mountlist */
162struct mtx mountlist_mtx;
163
164/* For any iteration/modification of mnt_vnodelist */
165struct mtx mntvnode_mtx;
166
167/*
168 * Cache for the mount type id assigned to NFS.  This is used for
169 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
170 */
171int	nfs_mount_type = -1;
172
173/* To keep more than one thread at a time from running vfs_getnewfsid */
174static struct mtx mntid_mtx;
175
176/* For any iteration/modification of vnode_free_list */
177static struct mtx vnode_free_list_mtx;
178
179/*
180 * For any iteration/modification of dev->si_hlist (linked through
181 * v_specnext)
182 */
183static struct mtx spechash_mtx;
184
185/* Publicly exported FS */
186struct nfs_public nfs_pub;
187
188/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
189static vm_zone_t vnode_zone;
190
191/* Set to 1 to print out reclaim of active vnodes */
192int	prtactive;
193
194/*
195 * The workitem queue.
196 *
197 * It is useful to delay writes of file data and filesystem metadata
198 * for tens of seconds so that quickly created and deleted files need
199 * not waste disk bandwidth being created and removed. To realize this,
200 * we append vnodes to a "workitem" queue. When running with a soft
201 * updates implementation, most pending metadata dependencies should
202 * not wait for more than a few seconds. Thus, metadata for mounted
203 * block devices is delayed only about half the time that file data is
204 * delayed. Similarly, directory updates are more critical, so they are
205 * delayed only about a third of that time. Thus, there are
206 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
207 * one each second (driven off the filesystem syncer process). The
208 * syncer_delayno variable indicates the next queue that is to be processed.
209 * Items that need to be processed soon are placed in this queue:
210 *
211 *	syncer_workitem_pending[syncer_delayno]
212 *
213 * A delay of fifteen seconds is done by placing the request fifteen
214 * entries later in the queue:
215 *
216 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
217 *
218 */
219static int syncer_delayno;
220static long syncer_mask;
221LIST_HEAD(synclist, vnode);
222static struct synclist *syncer_workitem_pending;
223
224#define SYNCER_MAXDELAY		32
225static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
226static int syncdelay = 30;		/* max time to delay syncing data */
227static int filedelay = 30;		/* time to delay syncing files */
228SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
229static int dirdelay = 29;		/* time to delay syncing directories */
230SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
231static int metadelay = 28;		/* time to delay syncing metadata */
232SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
233static int rushjob;		/* number of slots to run ASAP */
234static int stat_rush_requests;	/* number of times I/O speeded up */
235SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
236
237/*
238 * Number of vnodes we want to exist at any one time.  This is mostly used
239 * to size hash tables in vnode-related code.  It is normally not used in
240 * getnewvnode(), as wantfreevnodes is normally nonzero.
241 *
242 * XXX desiredvnodes is historical cruft and should not exist.
243 */
244int desiredvnodes;
245SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
246    &desiredvnodes, 0, "Maximum number of vnodes");
247static int minvnodes;
248SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
249    &minvnodes, 0, "Minimum number of vnodes");
250static int vnlru_nowhere;
251SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0,
252    "Number of times the vnlru process ran without success");
253
254/*
255 * Initialize the vnode management data structures.
256 */
257static void
258vntblinit(void *dummy __unused)
259{
260
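	/*
	 * Scale the default vnode limit with the process limit and with
	 * physical memory (one vnode per four pages); getnewvnode() will
	 * not start recycling until at least a quarter of that many
	 * vnodes exist.
	 */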
261	desiredvnodes = maxproc + cnt.v_page_count / 4;
262	minvnodes = desiredvnodes / 4;
263	mtx_init(&mountlist_mtx, "mountlist", MTX_DEF);
264	mtx_init(&mntvnode_mtx, "mntvnode", MTX_DEF);
265	mtx_init(&mntid_mtx, "mntid", MTX_DEF);
266	mtx_init(&spechash_mtx, "spechash", MTX_DEF);
267	TAILQ_INIT(&vnode_free_list);
268	mtx_init(&vnode_free_list_mtx, "vnode_free_list", MTX_DEF);
269	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
270	/*
271	 * Initialize the filesystem syncer.
272	 */
273	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
274		&syncer_mask);
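	/*
	 * hashinit() sizes the table to a power of two and stores
	 * size - 1 in syncer_mask, so the number of queues actually
	 * allocated is syncer_mask + 1.
	 */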
275	syncer_maxdelay = syncer_mask + 1;
276}
277SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)
278
279
280/*
281 * Mark a mount point as busy. Used to synchronize access and to delay
282 * unmounting. Interlock is not released on failure.
283 */
284int
285vfs_busy(mp, flags, interlkp, td)
286	struct mount *mp;
287	int flags;
288	struct mtx *interlkp;
289	struct thread *td;
290{
291	int lkflags;
292
293	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
294		if (flags & LK_NOWAIT)
295			return (ENOENT);
296		mp->mnt_kern_flag |= MNTK_MWAIT;
297		/*
298		 * Since all busy locks are shared except the exclusive
299		 * lock granted when unmounting, the only place that a
300		 * wakeup needs to be done is at the release of the
301		 * exclusive lock at the end of dounmount.
302		 */
303		msleep((caddr_t)mp, interlkp, PVFS, "vfs_busy", 0);
304		return (ENOENT);
305	}
306	lkflags = LK_SHARED | LK_NOPAUSE;
307	if (interlkp)
308		lkflags |= LK_INTERLOCK;
309	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td))
310		panic("vfs_busy: unexpected lock failure");
311	return (0);
312}
313
314/*
315 * Free a busy filesystem.
316 */
317void
318vfs_unbusy(mp, td)
319	struct mount *mp;
320	struct thread *td;
321{
322
323	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
324}
325
326/*
327 * Lookup a filesystem type, and if found allocate and initialize
328 * a mount structure for it.
329 *
330 * Devname is usually updated by mount(8) after booting.
331 */
332int
333vfs_rootmountalloc(fstypename, devname, mpp)
334	char *fstypename;
335	char *devname;
336	struct mount **mpp;
337{
338	struct thread *td = curthread;	/* XXX */
339	struct vfsconf *vfsp;
340	struct mount *mp;
341
342	if (fstypename == NULL)
343		return (ENODEV);
344	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
345		if (!strcmp(vfsp->vfc_name, fstypename))
346			break;
347	if (vfsp == NULL)
348		return (ENODEV);
349	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO);
350	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
351	(void)vfs_busy(mp, LK_NOWAIT, 0, td);
352	TAILQ_INIT(&mp->mnt_nvnodelist);
353	TAILQ_INIT(&mp->mnt_reservedvnlist);
354	mp->mnt_vfc = vfsp;
355	mp->mnt_op = vfsp->vfc_vfsops;
356	mp->mnt_flag = MNT_RDONLY;
357	mp->mnt_vnodecovered = NULLVP;
358	vfsp->vfc_refcount++;
359	mp->mnt_iosize_max = DFLTPHYS;
360	mp->mnt_stat.f_type = vfsp->vfc_typenum;
361	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
362	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
363	mp->mnt_stat.f_mntonname[0] = '/';
364	mp->mnt_stat.f_mntonname[1] = 0;
365	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
366	*mpp = mp;
367	return (0);
368}
369
370/*
371 * Find an appropriate filesystem to use for the root. If a filesystem
372 * has not been preselected, walk through the list of known filesystems
373 * trying those that have mountroot routines, and try them until one
374 * works or we have tried them all.
375 */
376#ifdef notdef	/* XXX JH */
377int
378lite2_vfs_mountroot()
379{
380	struct vfsconf *vfsp;
381	extern int (*lite2_mountroot) __P((void));
382	int error;
383
384	if (lite2_mountroot != NULL)
385		return ((*lite2_mountroot)());
386	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
387		if (vfsp->vfc_mountroot == NULL)
388			continue;
389		if ((error = (*vfsp->vfc_mountroot)()) == 0)
390			return (0);
391		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
392	}
393	return (ENODEV);
394}
395#endif
396
397/*
398 * Lookup a mount point by filesystem identifier.
399 */
400struct mount *
401vfs_getvfs(fsid)
402	fsid_t *fsid;
403{
404	register struct mount *mp;
405
406	mtx_lock(&mountlist_mtx);
407	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
408		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
409		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
410			mtx_unlock(&mountlist_mtx);
411			return (mp);
412		}
413	}
414	mtx_unlock(&mountlist_mtx);
415	return ((struct mount *) 0);
416}
417
418/*
419 * Get a new unique fsid.  Try to make its val[0] unique, since this value
420 * will be used to create fake device numbers for stat().  Also try (but
421 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
422 * support 16-bit device numbers.  We end up with unique val[0]'s for the
423 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
424 *
425 * Keep in mind that several mounts may be running in parallel.  Starting
426 * the search one past where the previous search terminated is both a
427 * micro-optimization and a defense against returning the same fsid to
428 * different mounts.
429 */
430void
431vfs_getnewfsid(mp)
432	struct mount *mp;
433{
434	static u_int16_t mntid_base;
435	fsid_t tfsid;
436	int mtype;
437
438	mtx_lock(&mntid_mtx);
439	mtype = mp->mnt_vfc->vfc_typenum;
440	tfsid.val[1] = mtype;
441	mtype = (mtype & 0xFF) << 24;
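	/*
	 * Keep generating candidate fsids until one is found that no
	 * currently mounted filesystem is using.
	 */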
442	for (;;) {
443		tfsid.val[0] = makeudev(255,
444		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
445		mntid_base++;
446		if (vfs_getvfs(&tfsid) == NULL)
447			break;
448	}
449	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
450	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
451	mtx_unlock(&mntid_mtx);
452}
453
454/*
455 * Knob to control the precision of file timestamps:
456 *
457 *   0 = seconds only; nanoseconds zeroed.
458 *   1 = seconds and nanoseconds, accurate within 1/HZ.
459 *   2 = seconds and nanoseconds, truncated to microseconds.
460 * >=3 = seconds and nanoseconds, maximum precision.
461 */
462enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
463
464static int timestamp_precision = TSP_SEC;
465SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
466    &timestamp_precision, 0, "");
467
468/*
469 * Get a current timestamp.
470 */
471void
472vfs_timestamp(tsp)
473	struct timespec *tsp;
474{
475	struct timeval tv;
476
477	switch (timestamp_precision) {
478	case TSP_SEC:
479		tsp->tv_sec = time_second;
480		tsp->tv_nsec = 0;
481		break;
482	case TSP_HZ:
483		getnanotime(tsp);
484		break;
485	case TSP_USEC:
486		microtime(&tv);
487		TIMEVAL_TO_TIMESPEC(&tv, tsp);
488		break;
489	case TSP_NSEC:
490	default:
491		nanotime(tsp);
492		break;
493	}
494}
495
496/*
497 * Set vnode attributes to VNOVAL
498 */
499void
500vattr_null(vap)
501	register struct vattr *vap;
502{
503
504	vap->va_type = VNON;
505	vap->va_size = VNOVAL;
506	vap->va_bytes = VNOVAL;
507	vap->va_mode = VNOVAL;
508	vap->va_nlink = VNOVAL;
509	vap->va_uid = VNOVAL;
510	vap->va_gid = VNOVAL;
511	vap->va_fsid = VNOVAL;
512	vap->va_fileid = VNOVAL;
513	vap->va_blocksize = VNOVAL;
514	vap->va_rdev = VNOVAL;
515	vap->va_atime.tv_sec = VNOVAL;
516	vap->va_atime.tv_nsec = VNOVAL;
517	vap->va_mtime.tv_sec = VNOVAL;
518	vap->va_mtime.tv_nsec = VNOVAL;
519	vap->va_ctime.tv_sec = VNOVAL;
520	vap->va_ctime.tv_nsec = VNOVAL;
521	vap->va_flags = VNOVAL;
522	vap->va_gen = VNOVAL;
523	vap->va_vaflags = 0;
524}
525
526/*
527 * This routine is called when we have too many vnodes.  It attempts
528 * to free <count> vnodes and will potentially free vnodes that still
529 * have VM backing store (VM backing store is typically the cause
530 * of a vnode blowout so we want to do this).  Therefore, this operation
531 * is not considered cheap.
532 *
533 * A number of conditions may prevent a vnode from being reclaimed.
534 * The buffer cache may have references on the vnode, a directory
535 * vnode may still have references due to the namei cache representing
536 * underlying files, or the vnode may be in active use.  It is not
537 * desirable to reuse such vnodes.  These conditions may cause the
538 * number of vnodes to reach some minimum value regardless of what
539 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
540 */
541static int
542vlrureclaim(struct mount *mp, int count)
543{
544	struct vnode *vp;
545	int done;
546
547	done = 0;
548	mtx_lock(&mntvnode_mtx);
549	while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
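		/*
		 * Rotate the vnode to the tail of the per-mount list so
		 * that successive iterations make forward progress rather
		 * than revisiting the same head entry.
		 */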
550		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
551		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
552
553		if (vp->v_type != VNON &&
554		    vp->v_type != VBAD &&
555		    VMIGHTFREE(vp) &&           /* critical path opt */
556		    mtx_trylock(&vp->v_interlock)
557		) {
558			mtx_unlock(&mntvnode_mtx);
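			/*
			 * The VMIGHTFREE test in the condition above was an
			 * unlocked optimization; recheck it now that the
			 * mount list lock has been dropped and the vnode
			 * interlock is held.
			 */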
559			if (VMIGHTFREE(vp)) {
560				vgonel(vp, curthread);
561				done++;
562			} else {
563				mtx_unlock(&vp->v_interlock);
564			}
565			mtx_lock(&mntvnode_mtx);
566		}
567		--count;
568	}
569	mtx_unlock(&mntvnode_mtx);
570	return done;
571}
572
573/*
574 * Attempt to recycle vnodes in a context that is always safe to block.
575 * Calling vlrureclaim() from the bowels of file system code has some
576 * interesting deadlock problems.
577 */
578static struct proc *vnlruproc;
579static int vnlruproc_sig;
580
581static void
582vnlru_proc(void)
583{
584	struct mount *mp, *nmp;
585	int s;
586	int done;
587	struct proc *p = vnlruproc;
588	struct thread *td = &p->p_thread;	/* XXXKSE */
589
590	mtx_lock(&Giant);
591
592	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
593	    SHUTDOWN_PRI_FIRST);
594
595	s = splbio();
596	for (;;) {
597		kthread_suspend_check(p);
598		if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
599			vnlruproc_sig = 0;
600			tsleep(vnlruproc, PVFS, "vlruwt", 0);
601			continue;
602		}
603		done = 0;
604		mtx_lock(&mountlist_mtx);
605		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
606			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
607				nmp = TAILQ_NEXT(mp, mnt_list);
608				continue;
609			}
610			done += vlrureclaim(mp, 10);
611			mtx_lock(&mountlist_mtx);
612			nmp = TAILQ_NEXT(mp, mnt_list);
613			vfs_unbusy(mp, td);
614		}
615		mtx_unlock(&mountlist_mtx);
616		if (done == 0) {
617#if 0
618			/* These messages are temporary debugging aids */
619			if (vnlru_nowhere < 5)
620				printf("vnlru process getting nowhere..\n");
621			else if (vnlru_nowhere == 5)
622				printf("vnlru process messages stopped.\n");
623#endif
624			vnlru_nowhere++;
625			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
626		}
627	}
628	splx(s);
629}
630
631static struct kproc_desc vnlru_kp = {
632	"vnlru",
633	vnlru_proc,
634	&vnlruproc
635};
636SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
637
638
639/*
640 * Routines having to do with the management of the vnode table.
641 */
642
643/*
644 * Return the next vnode from the free list.
645 */
646int
647getnewvnode(tag, mp, vops, vpp)
648	enum vtagtype tag;
649	struct mount *mp;
650	vop_t **vops;
651	struct vnode **vpp;
652{
653	int s;
654	struct thread *td = curthread;	/* XXX */
655	struct vnode *vp = NULL;
656	struct mount *vnmp;
657	vm_object_t object;
658
659	s = splbio();
660	/*
661	 * Try to reuse vnodes if we hit the max.  This situation only
662	 * occurs on certain large-memory (2G+) systems.  We cannot
663	 * attempt to directly reclaim vnodes due to nasty recursion
664	 * problems.
665	 */
666	if (vnlruproc_sig == 0 && numvnodes - freevnodes > desiredvnodes) {
667		vnlruproc_sig = 1;      /* avoid unnecessary wakeups */
668		wakeup(vnlruproc);
669	}
670
671	/*
672	 * Attempt to reuse a vnode already on the free list, allocating
673	 * a new vnode if we can't find one or if we have not yet reached
674	 * the minimum needed for good LRU performance.
675	 */
676
677	mtx_lock(&vnode_free_list_mtx);
678
679	if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
680		int count;
681
682		for (count = 0; count < freevnodes; count++) {
683			vp = TAILQ_FIRST(&vnode_free_list);
684			if (vp == NULL || vp->v_usecount)
685				panic("getnewvnode: free vnode isn't");
686			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
687
688			/*
689			 * Don't recycle if we still have cached pages or if
690			 * we cannot get the interlock.
691			 */
692			if ((VOP_GETVOBJECT(vp, &object) == 0 &&
693			     (object->resident_page_count ||
694			      object->ref_count)) ||
695			     !mtx_trylock(&vp->v_interlock)) {
696				TAILQ_INSERT_TAIL(&vnode_free_list, vp,
697						    v_freelist);
698				vp = NULL;
699				continue;
700			}
701			if (LIST_FIRST(&vp->v_cache_src)) {
702				/*
703				 * note: nameileafonly sysctl is temporary,
704				 * for debugging only, and will eventually be
705				 * removed.
706				 */
707				if (nameileafonly > 0) {
708					/*
709					 * Do not reuse namei-cached directory
710					 * vnodes that have cached
711					 * subdirectories.
712					 */
713					if (cache_leaf_test(vp) < 0) {
714						mtx_unlock(&vp->v_interlock);
715						TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
716						vp = NULL;
717						continue;
718					}
719				} else if (nameileafonly < 0 ||
720					    vmiodirenable == 0) {
721					/*
722					 * Do not reuse namei-cached directory
723					 * vnodes if nameileafonly is -1 or
724					 * if VMIO backing for directories is
725					 * turned off (otherwise we reuse them
726					 * too quickly).
727					 */
728					mtx_unlock(&vp->v_interlock);
729					TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
730					vp = NULL;
731					continue;
732				}
733			}
734			/*
735			 * Skip over it if its filesystem is being suspended.
736			 */
737			if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0)
738				break;
739			mtx_unlock(&vp->v_interlock);
740			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
741			vp = NULL;
742		}
743	}
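	/*
	 * At this point vp is either NULL or a free vnode that has been
	 * removed from the free list with its interlock held.  A reused
	 * vnode has its old identity reclaimed here; otherwise a fresh
	 * vnode is allocated from the zone below.
	 */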
744	if (vp) {
745		vp->v_flag |= VDOOMED;
746		vp->v_flag &= ~VFREE;
747		freevnodes--;
748		mtx_unlock(&vnode_free_list_mtx);
749		cache_purge(vp);
750		vp->v_lease = NULL;
751		if (vp->v_type != VBAD) {
752			vgonel(vp, td);
753		} else {
754			mtx_unlock(&vp->v_interlock);
755		}
756		vn_finished_write(vnmp);
757
758#ifdef INVARIANTS
759		{
760			int s;
761
762			if (vp->v_data)
763				panic("cleaned vnode isn't");
764			s = splbio();
765			if (vp->v_numoutput)
766				panic("Clean vnode has pending I/O's");
767			splx(s);
768			if (vp->v_writecount != 0)
769				panic("Non-zero write count");
770		}
771#endif
772		vp->v_flag = 0;
773		vp->v_lastw = 0;
774		vp->v_lasta = 0;
775		vp->v_cstart = 0;
776		vp->v_clen = 0;
777		vp->v_socket = 0;
778	} else {
779		mtx_unlock(&vnode_free_list_mtx);
780		vp = (struct vnode *) zalloc(vnode_zone);
781		bzero((char *) vp, sizeof *vp);
782		mtx_init(&vp->v_interlock, "vnode interlock", MTX_DEF);
783		vp->v_dd = vp;
784		mtx_init(&vp->v_pollinfo.vpi_lock, "vnode pollinfo", MTX_DEF);
785		cache_purge(vp);
786		LIST_INIT(&vp->v_cache_src);
787		TAILQ_INIT(&vp->v_cache_dst);
788		numvnodes++;
789	}
790
791	TAILQ_INIT(&vp->v_cleanblkhd);
792	TAILQ_INIT(&vp->v_dirtyblkhd);
793	vp->v_type = VNON;
794	vp->v_tag = tag;
795	vp->v_op = vops;
796	lockinit(&vp->v_lock, PVFS, "vnlock", VLKTIMEOUT, LK_NOPAUSE);
797	insmntque(vp, mp);
798	*vpp = vp;
799	vp->v_usecount = 1;
800	vp->v_data = 0;
801
802	splx(s);
803
804	vfs_object_create(vp, td, td->td_proc->p_ucred);
805
806#if 0
807	vnodeallocs++;
808	if (vnodeallocs % vnoderecycleperiod == 0 &&
809	    freevnodes < vnoderecycleminfreevn &&
810	    vnoderecyclemintotalvn < numvnodes) {
811		/* Recycle vnodes. */
812		cache_purgeleafdirs(vnoderecyclenumber);
813	}
814#endif
815
816	return (0);
817}
818
819/*
820 * Move a vnode from one mount queue to another.
821 */
822static void
823insmntque(vp, mp)
824	register struct vnode *vp;
825	register struct mount *mp;
826{
827
828	mtx_lock(&mntvnode_mtx);
829	/*
830	 * Delete from old mount point vnode list, if on one.
831	 */
832	if (vp->v_mount != NULL)
833		TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes);
834	/*
835	 * Insert into list of vnodes for the new mount point, if available.
836	 */
837	if ((vp->v_mount = mp) == NULL) {
838		mtx_unlock(&mntvnode_mtx);
839		return;
840	}
841	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
842	mtx_unlock(&mntvnode_mtx);
843}
844
845/*
846 * Update outstanding I/O count and do wakeup if requested.
847 */
848void
849vwakeup(bp)
850	register struct buf *bp;
851{
852	register struct vnode *vp;
853
854	bp->b_flags &= ~B_WRITEINPROG;
855	if ((vp = bp->b_vp)) {
856		vp->v_numoutput--;
857		if (vp->v_numoutput < 0)
858			panic("vwakeup: neg numoutput");
859		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
860			vp->v_flag &= ~VBWAIT;
861			wakeup((caddr_t) &vp->v_numoutput);
862		}
863	}
864}
865
866/*
867 * Flush out and invalidate all buffers associated with a vnode.
868 * Called with the underlying object locked.
869 */
870int
871vinvalbuf(vp, flags, cred, td, slpflag, slptimeo)
872	register struct vnode *vp;
873	int flags;
874	struct ucred *cred;
875	struct thread *td;
876	int slpflag, slptimeo;
877{
878	register struct buf *bp;
879	struct buf *nbp, *blist;
880	int s, error;
881	vm_object_t object;
882
883	GIANT_REQUIRED;
884
885	if (flags & V_SAVE) {
886		s = splbio();
887		while (vp->v_numoutput) {
888			vp->v_flag |= VBWAIT;
889			error = tsleep((caddr_t)&vp->v_numoutput,
890			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
891			if (error) {
892				splx(s);
893				return (error);
894			}
895		}
896		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
897			splx(s);
898			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, td)) != 0)
899				return (error);
900			s = splbio();
901			if (vp->v_numoutput > 0 ||
902			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
903				panic("vinvalbuf: dirty bufs");
904		}
905		splx(s);
906  	}
907	s = splbio();
908	for (;;) {
909		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
910		if (!blist)
911			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
912		if (!blist)
913			break;
914
915		for (bp = blist; bp; bp = nbp) {
916			nbp = TAILQ_NEXT(bp, b_vnbufs);
917			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
918				error = BUF_TIMELOCK(bp,
919				    LK_EXCLUSIVE | LK_SLEEPFAIL,
920				    "vinvalbuf", slpflag, slptimeo);
921				if (error == ENOLCK)
922					break;
923				splx(s);
924				return (error);
925			}
926			/*
927			 * XXX Since there are no node locks for NFS, I
928			 * believe there is a slight chance that a delayed
929			 * write will occur while sleeping just above, so
930			 * check for it.  Note that vfs_bio_awrite expects
931			 * buffers to reside on a queue, while BUF_WRITE and
932			 * brelse do not.
933			 */
934			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
935				(flags & V_SAVE)) {
936
937				if (bp->b_vp == vp) {
938					if (bp->b_flags & B_CLUSTEROK) {
939						BUF_UNLOCK(bp);
940						vfs_bio_awrite(bp);
941					} else {
942						bremfree(bp);
943						bp->b_flags |= B_ASYNC;
944						BUF_WRITE(bp);
945					}
946				} else {
947					bremfree(bp);
948					(void) BUF_WRITE(bp);
949				}
950				break;
951			}
952			bremfree(bp);
953			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
954			bp->b_flags &= ~B_ASYNC;
955			brelse(bp);
956		}
957	}
958
959	/*
960	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
961	 * have write I/O in-progress but if there is a VM object then the
962	 * VM object can also have read-I/O in-progress.
963	 */
964	do {
965		while (vp->v_numoutput > 0) {
966			vp->v_flag |= VBWAIT;
967			tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
968		}
969		if (VOP_GETVOBJECT(vp, &object) == 0) {
970			while (object->paging_in_progress)
971				vm_object_pip_sleep(object, "vnvlbx");
972		}
973	} while (vp->v_numoutput > 0);
974
975	splx(s);
976
977	/*
978	 * Destroy the copy in the VM cache, too.
979	 */
980	mtx_lock(&vp->v_interlock);
981	if (VOP_GETVOBJECT(vp, &object) == 0) {
982		vm_object_page_remove(object, 0, 0,
983			(flags & V_SAVE) ? TRUE : FALSE);
984	}
985	mtx_unlock(&vp->v_interlock);
986
987	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
988		panic("vinvalbuf: flush failed");
989	return (0);
990}
991
992/*
993 * Truncate a file's buffer and pages to a specified length.  This
994 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
995 * sync activity.
996 */
997int
998vtruncbuf(vp, cred, td, length, blksize)
999	register struct vnode *vp;
1000	struct ucred *cred;
1001	struct thread *td;
1002	off_t length;
1003	int blksize;
1004{
1005	register struct buf *bp;
1006	struct buf *nbp;
1007	int s, anyfreed;
1008	int trunclbn;
1009
1010	/*
1011	 * Round up to the *next* lbn.
1012	 */
1013	trunclbn = (length + blksize - 1) / blksize;
1014
1015	s = splbio();
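	/*
	 * Flush and release every buffer at or beyond the truncation
	 * point.  Whenever we have to sleep for a buffer lock the lists
	 * may have changed, so the scan is restarted from the top.
	 */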
1016restart:
1017	anyfreed = 1;
1018	for (;anyfreed;) {
1019		anyfreed = 0;
1020		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
1021			nbp = TAILQ_NEXT(bp, b_vnbufs);
1022			if (bp->b_lblkno >= trunclbn) {
1023				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
1024					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
1025					goto restart;
1026				} else {
1027					bremfree(bp);
1028					bp->b_flags |= (B_INVAL | B_RELBUF);
1029					bp->b_flags &= ~B_ASYNC;
1030					brelse(bp);
1031					anyfreed = 1;
1032				}
1033				if (nbp &&
1034				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1035				    (nbp->b_vp != vp) ||
1036				    (nbp->b_flags & B_DELWRI))) {
1037					goto restart;
1038				}
1039			}
1040		}
1041
1042		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1043			nbp = TAILQ_NEXT(bp, b_vnbufs);
1044			if (bp->b_lblkno >= trunclbn) {
1045				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
1046					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
1047					goto restart;
1048				} else {
1049					bremfree(bp);
1050					bp->b_flags |= (B_INVAL | B_RELBUF);
1051					bp->b_flags &= ~B_ASYNC;
1052					brelse(bp);
1053					anyfreed = 1;
1054				}
1055				if (nbp &&
1056				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1057				    (nbp->b_vp != vp) ||
1058				    (nbp->b_flags & B_DELWRI) == 0)) {
1059					goto restart;
1060				}
1061			}
1062		}
1063	}
1064
1065	if (length > 0) {
1066restartsync:
1067		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1068			nbp = TAILQ_NEXT(bp, b_vnbufs);
1069			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
1070				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
1071					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
1072					goto restart;
1073				} else {
1074					bremfree(bp);
1075					if (bp->b_vp == vp) {
1076						bp->b_flags |= B_ASYNC;
1077					} else {
1078						bp->b_flags &= ~B_ASYNC;
1079					}
1080					BUF_WRITE(bp);
1081				}
1082				goto restartsync;
1083			}
1084
1085		}
1086	}
1087
1088	while (vp->v_numoutput > 0) {
1089		vp->v_flag |= VBWAIT;
1090		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
1091	}
1092
1093	splx(s);
1094
1095	vnode_pager_setsize(vp, length);
1096
1097	return (0);
1098}
1099
1100/*
1101 * Associate a buffer with a vnode.
1102 */
1103void
1104bgetvp(vp, bp)
1105	register struct vnode *vp;
1106	register struct buf *bp;
1107{
1108	int s;
1109
1110	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
1111
1112	vhold(vp);
1113	bp->b_vp = vp;
1114	bp->b_dev = vn_todev(vp);
1115	/*
1116	 * Insert onto list for new vnode.
1117	 */
1118	s = splbio();
1119	bp->b_xflags |= BX_VNCLEAN;
1120	bp->b_xflags &= ~BX_VNDIRTY;
1121	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
1122	splx(s);
1123}
1124
1125/*
1126 * Disassociate a buffer from a vnode.
1127 */
1128void
1129brelvp(bp)
1130	register struct buf *bp;
1131{
1132	struct vnode *vp;
1133	struct buflists *listheadp;
1134	int s;
1135
1136	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1137
1138	/*
1139	 * Delete from old vnode list, if on one.
1140	 */
1141	vp = bp->b_vp;
1142	s = splbio();
1143	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
1144		if (bp->b_xflags & BX_VNDIRTY)
1145			listheadp = &vp->v_dirtyblkhd;
1146		else
1147			listheadp = &vp->v_cleanblkhd;
1148		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
1149		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1150	}
1151	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
1152		vp->v_flag &= ~VONWORKLST;
1153		LIST_REMOVE(vp, v_synclist);
1154	}
1155	splx(s);
1156	bp->b_vp = (struct vnode *) 0;
1157	vdrop(vp);
1158}
1159
1160/*
1161 * Add an item to the syncer work queue.
1162 */
1163static void
1164vn_syncer_add_to_worklist(struct vnode *vp, int delay)
1165{
1166	int s, slot;
1167
1168	s = splbio();
1169
1170	if (vp->v_flag & VONWORKLST) {
1171		LIST_REMOVE(vp, v_synclist);
1172	}
1173
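	/*
	 * Clamp the delay and select the bucket that many seconds ahead
	 * of the slot currently being processed; masking wraps the index
	 * around the circular queue.
	 */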
1174	if (delay > syncer_maxdelay - 2)
1175		delay = syncer_maxdelay - 2;
1176	slot = (syncer_delayno + delay) & syncer_mask;
1177
1178	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
1179	vp->v_flag |= VONWORKLST;
1180	splx(s);
1181}
1182
1183struct  proc *updateproc;
1184static void sched_sync __P((void));
1185static struct kproc_desc up_kp = {
1186	"syncer",
1187	sched_sync,
1188	&updateproc
1189};
1190SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
1191
1192/*
1193 * System filesystem synchronizer daemon.
1194 */
1195void
1196sched_sync(void)
1197{
1198	struct synclist *slp;
1199	struct vnode *vp;
1200	struct mount *mp;
1201	long starttime;
1202	int s;
1203	struct thread *td = &updateproc->p_thread;  /* XXXKSE */
1204
1205	mtx_lock(&Giant);
1206
1207	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, td->td_proc,
1208	    SHUTDOWN_PRI_LAST);
1209
1210	for (;;) {
1211		kthread_suspend_check(td->td_proc);
1212
1213		starttime = time_second;
1214
1215		/*
1216		 * Push files whose dirty time has expired.  Be careful
1217		 * of interrupt race on slp queue.
1218		 */
1219		s = splbio();
1220		slp = &syncer_workitem_pending[syncer_delayno];
1221		syncer_delayno += 1;
1222		if (syncer_delayno == syncer_maxdelay)
1223			syncer_delayno = 0;
1224		splx(s);
1225
1226		while ((vp = LIST_FIRST(slp)) != NULL) {
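			/*
			 * Only sync vnodes that are unlocked and whose
			 * filesystem is not suspending; anything skipped
			 * here is re-queued at a later slot below.
			 */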
1227			if (VOP_ISLOCKED(vp, NULL) == 0 &&
1228			    vn_start_write(vp, &mp, V_NOWAIT) == 0) {
1229				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1230				(void) VOP_FSYNC(vp, td->td_proc->p_ucred, MNT_LAZY, td);
1231				VOP_UNLOCK(vp, 0, td);
1232				vn_finished_write(mp);
1233			}
1234			s = splbio();
1235			if (LIST_FIRST(slp) == vp) {
1236				/*
1237				 * Note: v_tag VT_VFS vps can remain on the
1238				 * worklist too with no dirty blocks, but
1239				 * since sync_fsync() moves them to a different
1240				 * slot we are safe.
1241				 */
1242				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
1243				    !vn_isdisk(vp, NULL))
1244					panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
1245				/*
1246				 * Put us back on the worklist.  The worklist
1247				 * routine will remove us from our current
1248				 * position and then add us back in at a later
1249				 * position.
1250				 */
1251				vn_syncer_add_to_worklist(vp, syncdelay);
1252			}
1253			splx(s);
1254		}
1255
1256		/*
1257		 * Do soft update processing.
1258		 */
1259#ifdef SOFTUPDATES
1260		softdep_process_worklist(NULL);
1261#endif
1262
1263		/*
1264		 * The variable rushjob allows the kernel to speed up the
1265		 * processing of the filesystem syncer process. A rushjob
1266		 * value of N tells the filesystem syncer to process the next
1267		 * N seconds worth of work on its queue ASAP. Currently rushjob
1268		 * is used by the soft update code to speed up the filesystem
1269		 * syncer process when the incore state is getting so far
1270		 * ahead of the disk that the kernel memory pool is being
1271		 * threatened with exhaustion.
1272		 */
1273		if (rushjob > 0) {
1274			rushjob -= 1;
1275			continue;
1276		}
1277		/*
1278		 * If it has taken us less than a second to process the
1279		 * current work, then wait. Otherwise start right over
1280		 * again. We can still lose time if any single round
1281		 * takes more than two seconds, but it does not really
1282		 * matter as we are just trying to generally pace the
1283		 * filesystem activity.
1284		 */
1285		if (time_second == starttime)
1286			tsleep(&lbolt, PPAUSE, "syncer", 0);
1287	}
1288}
1289
1290/*
1291 * Request the syncer daemon to speed up its work.
1292 * We never push it to speed up more than half of its
1293 * normal turn time, otherwise it could take over the cpu.
1294 * XXXKSE  only one update?
1295 */
1296int
1297speedup_syncer()
1298{
1299
1300	mtx_lock_spin(&sched_lock);
1301	if (updateproc->p_thread.td_wchan == &lbolt) /* XXXKSE */
1302		setrunnable(&updateproc->p_thread);
1303	mtx_unlock_spin(&sched_lock);
1304	if (rushjob < syncdelay / 2) {
1305		rushjob += 1;
1306		stat_rush_requests += 1;
1307		return (1);
1308	}
1309	return(0);
1310}
1311
1312/*
1313 * Associate a p-buffer with a vnode.
1314 *
1315 * Also sets the B_PAGING flag to indicate that the vnode is not fully
1316 * associated with the buffer, i.e. the bp has not been linked into the
1317 * vnode or ref-counted.
1318 */
1319void
1320pbgetvp(vp, bp)
1321	register struct vnode *vp;
1322	register struct buf *bp;
1323{
1324
1325	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1326
1327	bp->b_vp = vp;
1328	bp->b_flags |= B_PAGING;
1329	bp->b_dev = vn_todev(vp);
1330}
1331
1332/*
1333 * Disassociate a p-buffer from a vnode.
1334 */
1335void
1336pbrelvp(bp)
1337	register struct buf *bp;
1338{
1339
1340	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1341
1342	/* XXX REMOVE ME */
1343	if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
1344		panic(
1345		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1346		    bp,
1347		    (int)bp->b_flags
1348		);
1349	}
1350	bp->b_vp = (struct vnode *) 0;
1351	bp->b_flags &= ~B_PAGING;
1352}
1353
1354/*
1355 * Change the vnode a pager buffer is associated with.
1356 */
1357void
1358pbreassignbuf(bp, newvp)
1359	struct buf *bp;
1360	struct vnode *newvp;
1361{
1362
1363	KASSERT(bp->b_flags & B_PAGING,
1364	    ("pbreassignbuf() on non phys bp %p", bp));
1365	bp->b_vp = newvp;
1366}
1367
1368/*
1369 * Reassign a buffer from one vnode to another.
1370 * Used to assign file specific control information
1371 * (indirect blocks) to the vnode to which they belong.
1372 */
1373void
1374reassignbuf(bp, newvp)
1375	register struct buf *bp;
1376	register struct vnode *newvp;
1377{
1378	struct buflists *listheadp;
1379	int delay;
1380	int s;
1381
1382	if (newvp == NULL) {
1383		printf("reassignbuf: NULL");
1384		return;
1385	}
1386	++reassignbufcalls;
1387
1388	/*
1389	 * B_PAGING flagged buffers cannot be reassigned because their vp
1390	 * is not fully linked in.
1391	 */
1392	if (bp->b_flags & B_PAGING)
1393		panic("cannot reassign paging buffer");
1394
1395	s = splbio();
1396	/*
1397	 * Delete from old vnode list, if on one.
1398	 */
1399	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
1400		if (bp->b_xflags & BX_VNDIRTY)
1401			listheadp = &bp->b_vp->v_dirtyblkhd;
1402		else
1403			listheadp = &bp->b_vp->v_cleanblkhd;
1404		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
1405		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1406		if (bp->b_vp != newvp) {
1407			vdrop(bp->b_vp);
1408			bp->b_vp = NULL;	/* for clarification */
1409		}
1410	}
1411	/*
1412	 * If dirty, put on list of dirty buffers; otherwise insert onto list
1413	 * of clean buffers.
1414	 */
1415	if (bp->b_flags & B_DELWRI) {
1416		struct buf *tbp;
1417
1418		listheadp = &newvp->v_dirtyblkhd;
1419		if ((newvp->v_flag & VONWORKLST) == 0) {
1420			switch (newvp->v_type) {
1421			case VDIR:
1422				delay = dirdelay;
1423				break;
1424			case VCHR:
1425				if (newvp->v_rdev->si_mountpoint != NULL) {
1426					delay = metadelay;
1427					break;
1428				}
1429				/* fall through */
1430			default:
1431				delay = filedelay;
1432			}
1433			vn_syncer_add_to_worklist(newvp, delay);
1434		}
1435		bp->b_xflags |= BX_VNDIRTY;
1436		tbp = TAILQ_FIRST(listheadp);
1437		if (tbp == NULL ||
1438		    bp->b_lblkno == 0 ||
1439		    (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
1440		    (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
1441			TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
1442			++reassignbufsortgood;
1443		} else if (bp->b_lblkno < 0) {
1444			TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
1445			++reassignbufsortgood;
1446		} else if (reassignbufmethod == 1) {
1447			/*
1448			 * New sorting algorithm, only handle sequential case,
1449			 * otherwise append to end (but before metadata)
1450			 */
1451			if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
1452			    (tbp->b_xflags & BX_VNDIRTY)) {
1453				/*
1454				 * Found the best place to insert the buffer
1455				 */
1456				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1457				++reassignbufsortgood;
1458			} else {
1459				/*
1460				 * Missed, append to end, but before meta-data.
1461				 * We know that the head buffer in the list is
1462				 * not meta-data due to prior conditionals.
1463				 *
1464				 * Indirect effects:  NFS second stage write
1465				 * tends to wind up here, giving maximum
1466				 * distance between the unstable write and the
1467				 * commit rpc.
1468				 */
1469				tbp = TAILQ_LAST(listheadp, buflists);
1470				while (tbp && tbp->b_lblkno < 0)
1471					tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
1472				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1473				++reassignbufsortbad;
1474			}
1475		} else {
1476			/*
1477			 * Old sorting algorithm, scan queue and insert
1478			 */
1479			struct buf *ttbp;
1480			while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
1481			    (ttbp->b_lblkno < bp->b_lblkno)) {
1482				++reassignbufloops;
1483				tbp = ttbp;
1484			}
1485			TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1486		}
1487	} else {
1488		bp->b_xflags |= BX_VNCLEAN;
1489		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
1490		if ((newvp->v_flag & VONWORKLST) &&
1491		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1492			newvp->v_flag &= ~VONWORKLST;
1493			LIST_REMOVE(newvp, v_synclist);
1494		}
1495	}
1496	if (bp->b_vp != newvp) {
1497		bp->b_vp = newvp;
1498		vhold(bp->b_vp);
1499	}
1500	splx(s);
1501}
1502
1503/*
1504 * Create a vnode for a device.
1505 * Used for mounting the root file system.
1506 */
1507int
1508bdevvp(dev, vpp)
1509	dev_t dev;
1510	struct vnode **vpp;
1511{
1512	register struct vnode *vp;
1513	struct vnode *nvp;
1514	int error;
1515
1516	if (dev == NODEV) {
1517		*vpp = NULLVP;
1518		return (ENXIO);
1519	}
1520	if (vfinddev(dev, VCHR, vpp))
1521		return (0);
1522	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
1523	if (error) {
1524		*vpp = NULLVP;
1525		return (error);
1526	}
1527	vp = nvp;
1528	vp->v_type = VCHR;
1529	addalias(vp, dev);
1530	*vpp = vp;
1531	return (0);
1532}
1533
1534/*
1535 * Add vnode to the alias list hung off the dev_t.
1536 *
1537 * The reason for this gunk is that multiple vnodes can reference
1538 * the same physical device, so checking vp->v_usecount to see
1539 * how many users there are is inadequate; the v_usecount values
1540 * of all the vnodes need to be accumulated.  vcount() does that.
1541 */
1542struct vnode *
1543addaliasu(nvp, nvp_rdev)
1544	struct vnode *nvp;
1545	udev_t nvp_rdev;
1546{
1547	struct vnode *ovp;
1548	vop_t **ops;
1549	dev_t dev;
1550
1551	if (nvp->v_type == VBLK)
1552		return (nvp);
1553	if (nvp->v_type != VCHR)
1554		panic("addaliasu on non-special vnode");
1555	dev = udev2dev(nvp_rdev, 0);
1556	/*
1557	 * Check to see if we have a bdevvp vnode with no associated
1558	 * filesystem. If so, we want to associate the filesystem of
1559	 * the newly created vnode with the bdevvp vnode and
1560	 * discard the newly created vnode rather than leaving the
1561	 * bdevvp vnode lying around with no associated filesystem.
1562	 */
1563	if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
1564		addalias(nvp, dev);
1565		return (nvp);
1566	}
1567	/*
1568	 * Discard unneeded vnode, but save its node specific data.
1569	 * Note that if there is a lock, it is carried over in the
1570	 * node specific data to the replacement vnode.
1571	 */
1572	vref(ovp);
1573	ovp->v_data = nvp->v_data;
1574	ovp->v_tag = nvp->v_tag;
1575	nvp->v_data = NULL;
1576	lockinit(&ovp->v_lock, PVFS, nvp->v_lock.lk_wmesg,
1577	    nvp->v_lock.lk_timo, nvp->v_lock.lk_flags & LK_EXTFLG_MASK);
1578	if (nvp->v_vnlock)
1579		ovp->v_vnlock = &ovp->v_lock;
1580	ops = ovp->v_op;
1581	ovp->v_op = nvp->v_op;
1582	if (VOP_ISLOCKED(nvp, curthread)) {
1583		VOP_UNLOCK(nvp, 0, curthread);
1584		vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curthread);
1585	}
1586	nvp->v_op = ops;
1587	insmntque(ovp, nvp->v_mount);
1588	vrele(nvp);
1589	vgone(nvp);
1590	return (ovp);
1591}
1592
1593/* This is a local helper function that does the same as addaliasu, but for a
1594 * dev_t instead of a udev_t. */
1595static void
1596addalias(nvp, dev)
1597	struct vnode *nvp;
1598	dev_t dev;
1599{
1600
1601	KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
1602	nvp->v_rdev = dev;
1603	mtx_lock(&spechash_mtx);
1604	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
1605	mtx_unlock(&spechash_mtx);
1606}
1607
1608/*
1609 * Grab a particular vnode from the free list, increment its
1610 * reference count and lock it. The vnode lock bit is set if the
1611 * vnode is being eliminated in vgone. The process is awakened
1612 * when the transition is completed, and an error returned to
1613 * indicate that the vnode is no longer usable (possibly having
1614 * been changed to a new file system type).
1615 */
1616int
1617vget(vp, flags, td)
1618	register struct vnode *vp;
1619	int flags;
1620	struct thread *td;
1621{
1622	int error;
1623
1624	/*
1625	 * If the vnode is in the process of being cleaned out for
1626	 * another use, we wait for the cleaning to finish and then
1627	 * return failure. Cleaning is determined by checking that
1628	 * the VXLOCK flag is set.
1629	 */
1630	if ((flags & LK_INTERLOCK) == 0)
1631		mtx_lock(&vp->v_interlock);
1632	if (vp->v_flag & VXLOCK) {
1633		if (vp->v_vxproc == curthread) {
1634			log(LOG_INFO, "VXLOCK interlock avoided\n");
1635		} else {
1636			vp->v_flag |= VXWANT;
1637			msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
1638			    "vget", 0);
1639			return (ENOENT);
1640		}
1641	}
1642
1643	vp->v_usecount++;
1644
1645	if (VSHOULDBUSY(vp))
1646		vbusy(vp);
1647	if (flags & LK_TYPE_MASK) {
1648		if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) {
1649			/*
1650			 * must expand vrele here because we do not want
1651			 * to call VOP_INACTIVE if the reference count
1652			 * drops back to zero since it was never really
1653			 * active. We must remove it from the free list
1654			 * before sleeping so that multiple processes do
1655			 * not try to recycle it.
1656			 */
1657			mtx_lock(&vp->v_interlock);
1658			vp->v_usecount--;
1659			if (VSHOULDFREE(vp))
1660				vfree(vp);
1661			mtx_unlock(&vp->v_interlock);
1662		}
1663		return (error);
1664	}
1665	mtx_unlock(&vp->v_interlock);
1666	return (0);
1667}
1668
1669/*
1670 * Increase the reference count of a vnode.
1671 */
1672void
1673vref(struct vnode *vp)
1674{
1675	mtx_lock(&vp->v_interlock);
1676	vp->v_usecount++;
1677	mtx_unlock(&vp->v_interlock);
1678}
1679
1680/*
1681 * Vnode put/release.
1682 * If count drops to zero, call inactive routine and return to freelist.
1683 */
1684void
1685vrele(vp)
1686	struct vnode *vp;
1687{
1688	struct thread *td = curthread;	/* XXX */
1689
1690	KASSERT(vp != NULL, ("vrele: null vp"));
1691
1692	mtx_lock(&vp->v_interlock);
1693
1694	/* Skip this v_writecount check if we're going to panic below. */
1695	KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
1696	    ("vrele: missed vn_close"));
1697
1698	if (vp->v_usecount > 1) {
1699
1700		vp->v_usecount--;
1701		mtx_unlock(&vp->v_interlock);
1702
1703		return;
1704	}
1705
1706	if (vp->v_usecount == 1) {
1707		vp->v_usecount--;
1708		if (VSHOULDFREE(vp))
1709			vfree(vp);
1710	/*
1711	 * If we are doing a vput, the node is already locked, and we must
1712	 * call VOP_INACTIVE with the node locked.  So, in the case of
1713	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1714	 */
1715		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) {
1716			VOP_INACTIVE(vp, td);
1717		}
1718
1719	} else {
1720#ifdef DIAGNOSTIC
1721		vprint("vrele: negative ref count", vp);
1722		mtx_unlock(&vp->v_interlock);
1723#endif
1724		panic("vrele: negative ref cnt");
1725	}
1726}
1727
1728/*
1729 * Release an already locked vnode.  This gives the same effect as
1730 * unlock+vrele(), but takes less time and avoids releasing and
1731 * re-acquiring the lock (as vrele() acquires the lock internally).
1732 */
1733void
1734vput(vp)
1735	struct vnode *vp;
1736{
1737	struct thread *td = curthread;	/* XXX */
1738
1739	GIANT_REQUIRED;
1740
1741	KASSERT(vp != NULL, ("vput: null vp"));
1742	mtx_lock(&vp->v_interlock);
1743	/* Skip this v_writecount check if we're going to panic below. */
1744	KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
1745	    ("vput: missed vn_close"));
1746
1747	if (vp->v_usecount > 1) {
1748		vp->v_usecount--;
1749		VOP_UNLOCK(vp, LK_INTERLOCK, td);
1750		return;
1751	}
1752
1753	if (vp->v_usecount == 1) {
1754		vp->v_usecount--;
1755		if (VSHOULDFREE(vp))
1756			vfree(vp);
1757	/*
1758	 * If we are doing a vput, the node is already locked, and we must
1759	 * call VOP_INACTIVE with the node locked.  So, in the case of
1760	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1761	 */
1762		mtx_unlock(&vp->v_interlock);
1763		VOP_INACTIVE(vp, td);
1764
1765	} else {
1766#ifdef DIAGNOSTIC
1767		vprint("vput: negative ref count", vp);
1768#endif
1769		panic("vput: negative ref cnt");
1770	}
1771}
1772
1773/*
1774 * Somebody doesn't want the vnode recycled.
1775 */
1776void
1777vhold(vp)
1778	register struct vnode *vp;
1779{
1780	int s;
1781
1782  	s = splbio();
1783	vp->v_holdcnt++;
1784	if (VSHOULDBUSY(vp))
1785		vbusy(vp);
1786	splx(s);
1787}
1788
1789/*
1790 * Note that there is one less who cares about this vnode.  vdrop() is the
1791 * opposite of vhold().
1792 */
1793void
1794vdrop(vp)
1795	register struct vnode *vp;
1796{
1797	int s;
1798
1799	s = splbio();
1800	if (vp->v_holdcnt <= 0)
1801		panic("vdrop: holdcnt");
1802	vp->v_holdcnt--;
1803	if (VSHOULDFREE(vp))
1804		vfree(vp);
1805	splx(s);
1806}
1807
1808/*
1809 * Remove any vnodes in the vnode table belonging to mount point mp.
1810 *
1811 * If FORCECLOSE is not specified, there should not be any active ones,
1812 * return error if any are found (nb: this is a user error, not a
1813 * system error). If FORCECLOSE is specified, detach any active vnodes
1814 * that are found.
1815 *
1816 * If WRITECLOSE is set, only flush out regular file vnodes open for
1817 * writing.
1818 *
1819 * SKIPSYSTEM causes any vnodes marked VSYSTEM to be skipped.
1820 *
1821 * `rootrefs' specifies the base reference count for the root vnode
1822 * of this filesystem. The root vnode is considered busy if its
1823 * v_usecount exceeds this value. On a successful return, vflush()
1824 * will call vrele() on the root vnode exactly rootrefs times.
1825 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
1826 * be zero.
1827 */
1828#ifdef DIAGNOSTIC
1829static int busyprt = 0;		/* print out busy vnodes */
1830SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
1831#endif
1832
1833int
1834vflush(mp, rootrefs, flags)
1835	struct mount *mp;
1836	int rootrefs;
1837	int flags;
1838{
1839	struct thread *td = curthread;	/* XXX */
1840	struct vnode *vp, *nvp, *rootvp = NULL;
1841	int busy = 0, error;
1842
1843	if (rootrefs > 0) {
1844		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
1845		    ("vflush: bad args"));
1846		/*
1847		 * Get the filesystem root vnode. We can vput() it
1848		 * immediately, since with rootrefs > 0, it won't go away.
1849		 */
1850		if ((error = VFS_ROOT(mp, &rootvp)) != 0)
1851			return (error);
1852		vput(rootvp);
1853	}
1854	mtx_lock(&mntvnode_mtx);
1855loop:
1856	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp; vp = nvp) {
1857		/*
1858		 * Make sure this vnode wasn't reclaimed in getnewvnode().
1859		 * Start over if it has (it won't be on the list anymore).
1860		 */
1861		if (vp->v_mount != mp)
1862			goto loop;
1863		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
1864
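		/*
		 * The next list entry was sampled above, so the mount
		 * vnode list lock can be dropped while the vnode
		 * interlock is taken.
		 */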
1865		mtx_unlock(&mntvnode_mtx);
1866		mtx_lock(&vp->v_interlock);
1867		/*
1868		 * Skip over any vnodes marked VSYSTEM.
1869		 */
1870		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
1871			mtx_unlock(&vp->v_interlock);
1872			mtx_lock(&mntvnode_mtx);
1873			continue;
1874		}
1875		/*
1876		 * If WRITECLOSE is set, only flush out regular file vnodes
1877		 * open for writing.
1878		 */
1879		if ((flags & WRITECLOSE) &&
1880		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
1881			mtx_unlock(&vp->v_interlock);
1882			mtx_lock(&mntvnode_mtx);
1883			continue;
1884		}
1885
1886		/*
1887		 * With v_usecount == 0, all we need to do is clear out the
1888		 * vnode data structures and we are done.
1889		 */
1890		if (vp->v_usecount == 0) {
1891			vgonel(vp, td);
1892			mtx_lock(&mntvnode_mtx);
1893			continue;
1894		}
1895
1896		/*
1897		 * If FORCECLOSE is set, forcibly close the vnode. For block
1898		 * or character devices, revert to an anonymous device. For
1899		 * all other files, just kill them.
1900		 */
1901		if (flags & FORCECLOSE) {
1902			if (vp->v_type != VCHR) {
1903				vgonel(vp, td);
1904			} else {
1905				vclean(vp, 0, td);
1906				vp->v_op = spec_vnodeop_p;
1907				insmntque(vp, (struct mount *) 0);
1908			}
1909			mtx_lock(&mntvnode_mtx);
1910			continue;
1911		}
1912#ifdef DIAGNOSTIC
1913		if (busyprt)
1914			vprint("vflush: busy vnode", vp);
1915#endif
1916		mtx_unlock(&vp->v_interlock);
1917		mtx_lock(&mntvnode_mtx);
1918		busy++;
1919	}
1920	mtx_unlock(&mntvnode_mtx);
1921	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
1922		/*
1923		 * If just the root vnode is busy, and if its refcount
1924		 * is equal to `rootrefs', then go ahead and kill it.
1925		 */
1926		mtx_lock(&rootvp->v_interlock);
1927		KASSERT(busy > 0, ("vflush: not busy"));
1928		KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
1929		if (busy == 1 && rootvp->v_usecount == rootrefs) {
1930			vgonel(rootvp, td);
1931			busy = 0;
1932		} else
1933			mtx_unlock(&rootvp->v_interlock);
1934	}
1935	if (busy)
1936		return (EBUSY);
1937	for (; rootrefs > 0; rootrefs--)
1938		vrele(rootvp);
1939	return (0);
1940}
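
/*
 * Illustrative sketch (not part of this file): a filesystem's unmount
 * routine would typically map MNT_FORCE onto FORCECLOSE and then call
 * vflush().  The "myfs" names below are hypothetical.
 *
 *	static int
 *	myfs_unmount(struct mount *mp, int mntflags, struct thread *td)
 *	{
 *		int flags = 0, error;
 *
 *		if (mntflags & MNT_FORCE)
 *			flags |= FORCECLOSE;
 *		error = vflush(mp, 0, flags);
 *		if (error)
 *			return (error);
 *		... release filesystem-private state ...
 *		return (0);
 *	}
 *
 * A filesystem that keeps extra references on its root vnode would pass
 * that count as `rootrefs' instead of 0, as described above.
 */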
1941
1942/*
1943 * Disassociate the underlying file system from a vnode.
1944 */
1945static void
1946vclean(vp, flags, td)
1947	struct vnode *vp;
1948	int flags;
1949	struct thread *td;
1950{
1951	int active;
1952
1953	/*
1954	 * Check to see if the vnode is in use. If so, we have to reference it
1955	 * before we clean it out so that its count cannot fall to zero and
1956	 * generate a race against ourselves to recycle it.
1957	 */
1958	if ((active = vp->v_usecount))
1959		vp->v_usecount++;
1960
1961	/*
1962	 * Prevent the vnode from being recycled or brought into use while we
1963	 * clean it out.
1964	 */
1965	if (vp->v_flag & VXLOCK)
1966		panic("vclean: deadlock");
1967	vp->v_flag |= VXLOCK;
1968	vp->v_vxproc = curthread;
1969	/*
1970	 * Even if the count is zero, the VOP_INACTIVE routine may still
1971	 * have the object locked while it cleans it out. The VOP_LOCK
1972	 * ensures that the VOP_INACTIVE routine is done with its work.
1973	 * For active vnodes, it ensures that no other activity can
1974	 * occur while the underlying object is being cleaned out.
1975	 */
1976	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
1977
1978	/*
1979	 * Clean out any buffers associated with the vnode.
1980	 * If the flush fails, just toss the buffers.
1981	 */
1982	if (flags & DOCLOSE) {
1983		if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL)
1984			(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
1985		if (vinvalbuf(vp, V_SAVE, NOCRED, td, 0, 0) != 0)
1986			vinvalbuf(vp, 0, NOCRED, td, 0, 0);
1987	}
1988
1989	VOP_DESTROYVOBJECT(vp);
1990
1991	/*
1992	 * If purging an active vnode, it must be closed and
1993	 * deactivated before being reclaimed. Note that the
1994	 * VOP_INACTIVE will unlock the vnode.
1995	 */
1996	if (active) {
1997		if (flags & DOCLOSE)
1998			VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
1999		VOP_INACTIVE(vp, td);
2000	} else {
2001		/*
2002		 * Any other processes trying to obtain this lock must first
2003		 * wait for VXLOCK to clear, then call the new lock operation.
2004		 */
2005		VOP_UNLOCK(vp, 0, td);
2006	}
2007	/*
2008	 * Reclaim the vnode.
2009	 */
2010	if (VOP_RECLAIM(vp, td))
2011		panic("vclean: cannot reclaim");
2012
2013	if (active) {
2014		/*
2015		 * Inline copy of vrele() since VOP_INACTIVE
2016		 * has already been called.
2017		 */
2018		mtx_lock(&vp->v_interlock);
2019		if (--vp->v_usecount <= 0) {
2020#ifdef DIAGNOSTIC
2021			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
2022				vprint("vclean: bad ref count", vp);
2023				panic("vclean: ref cnt");
2024			}
2025#endif
2026			vfree(vp);
2027		}
2028		mtx_unlock(&vp->v_interlock);
2029	}
2030
2031	cache_purge(vp);
2032	vp->v_vnlock = NULL;
2033	lockdestroy(&vp->v_lock);
2034
2035	if (VSHOULDFREE(vp))
2036		vfree(vp);
2037
2038	/*
2039	 * Done with purge, notify sleepers of the grim news.
2040	 */
2041	vp->v_op = dead_vnodeop_p;
2042	vn_pollgone(vp);
2043	vp->v_tag = VT_NON;
2044	vp->v_flag &= ~VXLOCK;
2045	vp->v_vxproc = NULL;
2046	if (vp->v_flag & VXWANT) {
2047		vp->v_flag &= ~VXWANT;
2048		wakeup((caddr_t) vp);
2049	}
2050}
2051
2052/*
2053 * Eliminate all activity associated with the requested vnode
2054 * and with all vnodes aliased to the requested vnode.
2055 */
2056int
2057vop_revoke(ap)
2058	struct vop_revoke_args /* {
2059		struct vnode *a_vp;
2060		int a_flags;
2061	} */ *ap;
2062{
2063	struct vnode *vp, *vq;
2064	dev_t dev;
2065
2066	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
2067
2068	vp = ap->a_vp;
2069	/*
2070	 * If a vgone (or vclean) is already in progress,
2071	 * wait until it is done and return.
2072	 */
2073	if (vp->v_flag & VXLOCK) {
2074		vp->v_flag |= VXWANT;
2075		msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
2076		    "vop_revokeall", 0);
2077		return (0);
2078	}
2079	dev = vp->v_rdev;
2080	for (;;) {
2081		mtx_lock(&spechash_mtx);
2082		vq = SLIST_FIRST(&dev->si_hlist);
2083		mtx_unlock(&spechash_mtx);
2084		if (!vq)
2085			break;
2086		vgone(vq);
2087	}
2088	return (0);
2089}
2090
2091/*
2092 * Recycle an unused vnode to the front of the free list.
2093 * Release the passed interlock if the vnode will be recycled.
2094 */
2095int
2096vrecycle(vp, inter_lkp, td)
2097	struct vnode *vp;
2098	struct mtx *inter_lkp;
2099	struct thread *td;
2100{
2101
2102	mtx_lock(&vp->v_interlock);
2103	if (vp->v_usecount == 0) {
2104		if (inter_lkp) {
2105			mtx_unlock(inter_lkp);
2106		}
2107		vgonel(vp, td);
2108		return (1);
2109	}
2110	mtx_unlock(&vp->v_interlock);
2111	return (0);
2112}
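
/*
 * Illustrative sketch (not part of this file): a filesystem's
 * VOP_INACTIVE routine may use vrecycle() to push a vnode whose
 * underlying object is gone straight back onto the free list,
 * e.g. (hypothetical code):
 *
 *	if (underlying_object_is_gone)
 *		vrecycle(vp, NULL, td);
 *
 * Passing NULL for `inter_lkp' means there is no additional lock to
 * release when the vnode is recycled.
 */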
2113
2114/*
2115 * Eliminate all activity associated with a vnode
2116 * in preparation for reuse.
2117 */
2118void
2119vgone(vp)
2120	register struct vnode *vp;
2121{
2122	struct thread *td = curthread;	/* XXX */
2123
2124	mtx_lock(&vp->v_interlock);
2125	vgonel(vp, td);
2126}
2127
2128/*
2129 * vgone, with the vp interlock held.
2130 */
2131void
2132vgonel(vp, td)
2133	struct vnode *vp;
2134	struct thread *td;
2135{
2136	int s;
2137
2138	/*
2139	 * If a vgone (or vclean) is already in progress,
2140	 * wait until it is done and return.
2141	 */
2142	if (vp->v_flag & VXLOCK) {
2143		vp->v_flag |= VXWANT;
2144		msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
2145		    "vgone", 0);
2146		return;
2147	}
2148
2149	/*
2150	 * Clean out the filesystem specific data.
2151	 */
2152	vclean(vp, DOCLOSE, td);
2153	mtx_lock(&vp->v_interlock);
2154
2155	/*
2156	 * Delete from old mount point vnode list, if on one.
2157	 */
2158	if (vp->v_mount != NULL)
2159		insmntque(vp, (struct mount *)0);
2160	/*
2161	 * If special device, remove it from special device alias list
2162	 * if it is on one.
2163	 */
2164	if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) {
2165		mtx_lock(&spechash_mtx);
2166		SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
2167		freedev(vp->v_rdev);
2168		mtx_unlock(&spechash_mtx);
2169		vp->v_rdev = NULL;
2170	}
2171
2172	/*
2173	 * If it is on the freelist and not already at the head,
2174	 * move it to the head of the list. The test of the
2175	 * VDOOMED flag and the reference count of zero is because
2176	 * it will be removed from the free list by getnewvnode,
2177	 * but will not have its reference count incremented until
2178	 * after calling vgone. If the reference count were
2179	 * incremented first, vgone would (incorrectly) try to
2180	 * close the previous instance of the underlying object.
2181	 */
2182	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
2183		s = splbio();
2184		mtx_lock(&vnode_free_list_mtx);
2185		if (vp->v_flag & VFREE)
2186			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2187		else
2188			freevnodes++;
2189		vp->v_flag |= VFREE;
2190		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2191		mtx_unlock(&vnode_free_list_mtx);
2192		splx(s);
2193	}
2194
2195	vp->v_type = VBAD;
2196	mtx_unlock(&vp->v_interlock);
2197}
2198
2199/*
2200 * Lookup a vnode by device number.
2201 */
2202int
2203vfinddev(dev, type, vpp)
2204	dev_t dev;
2205	enum vtype type;
2206	struct vnode **vpp;
2207{
2208	struct vnode *vp;
2209
2210	mtx_lock(&spechash_mtx);
2211	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
2212		if (type == vp->v_type) {
2213			*vpp = vp;
2214			mtx_unlock(&spechash_mtx);
2215			return (1);
2216		}
2217	}
2218	mtx_unlock(&spechash_mtx);
2219	return (0);
2220}
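
/*
 * Illustrative sketch (not part of this file): vfinddev() answers
 * whether some vnode already references a given device, e.g.:
 *
 *	struct vnode *vp;
 *
 *	if (vfinddev(dev, VCHR, &vp)) {
 *		... a VCHR vnode for `dev' exists; use `vp' ...
 *	}
 *
 * It returns 1 and fills in *vpp when a match is found, 0 otherwise.
 */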
2221
2222/*
2223 * Calculate the total number of references to a special device.
2224 */
2225int
2226vcount(vp)
2227	struct vnode *vp;
2228{
2229	struct vnode *vq;
2230	int count;
2231
2232	count = 0;
2233	mtx_lock(&spechash_mtx);
2234	SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext)
2235		count += vq->v_usecount;
2236	mtx_unlock(&spechash_mtx);
2237	return (count);
2238}
2239
2240/*
2241 * Same as vcount(), but taking the dev_t as the argument.
2242 */
2243int
2244count_dev(dev)
2245	dev_t dev;
2246{
2247	struct vnode *vp;
2248
2249	vp = SLIST_FIRST(&dev->si_hlist);
2250	if (vp == NULL)
2251		return (0);
2252	return (vcount(vp));
2253}
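
/*
 * Illustrative sketch (not part of this file): a device close routine
 * commonly uses vcount() to decide whether this is the last close of
 * the device, e.g.:
 *
 *	if (vcount(vp) > 1)
 *		return (0);		(not the last close)
 *	... perform last-close processing ...
 */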
2254
2255/*
2256 * Print out a description of a vnode.
2257 */
2258static char *typename[] =
2259{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
2260
2261void
2262vprint(label, vp)
2263	char *label;
2264	struct vnode *vp;
2265{
2266	char buf[96];
2267
2268	if (label != NULL)
2269		printf("%s: %p: ", label, (void *)vp);
2270	else
2271		printf("%p: ", (void *)vp);
2272	printf("type %s, usecount %d, writecount %d, refcount %d,",
2273	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
2274	    vp->v_holdcnt);
2275	buf[0] = '\0';
2276	if (vp->v_flag & VROOT)
2277		strcat(buf, "|VROOT");
2278	if (vp->v_flag & VTEXT)
2279		strcat(buf, "|VTEXT");
2280	if (vp->v_flag & VSYSTEM)
2281		strcat(buf, "|VSYSTEM");
2282	if (vp->v_flag & VXLOCK)
2283		strcat(buf, "|VXLOCK");
2284	if (vp->v_flag & VXWANT)
2285		strcat(buf, "|VXWANT");
2286	if (vp->v_flag & VBWAIT)
2287		strcat(buf, "|VBWAIT");
2288	if (vp->v_flag & VDOOMED)
2289		strcat(buf, "|VDOOMED");
2290	if (vp->v_flag & VFREE)
2291		strcat(buf, "|VFREE");
2292	if (vp->v_flag & VOBJBUF)
2293		strcat(buf, "|VOBJBUF");
2294	if (buf[0] != '\0')
2295		printf(" flags (%s)", &buf[1]);
2296	if (vp->v_data == NULL) {
2297		printf("\n");
2298	} else {
2299		printf("\n\t");
2300		VOP_PRINT(vp);
2301	}
2302}
2303
2304#ifdef DDB
2305#include <ddb/ddb.h>
2306/*
2307 * List all of the locked vnodes in the system.
2308 * Called when debugging the kernel.
2309 */
2310DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
2311{
2312	struct thread *td = curthread;	/* XXX */
2313	struct mount *mp, *nmp;
2314	struct vnode *vp;
2315
2316	printf("Locked vnodes\n");
2317	mtx_lock(&mountlist_mtx);
2318	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2319		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
2320			nmp = TAILQ_NEXT(mp, mnt_list);
2321			continue;
2322		}
2323		mtx_lock(&mntvnode_mtx);
2324		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2325			if (VOP_ISLOCKED(vp, NULL))
2326				vprint((char *)0, vp);
2327		}
2328		mtx_unlock(&mntvnode_mtx);
2329		mtx_lock(&mountlist_mtx);
2330		nmp = TAILQ_NEXT(mp, mnt_list);
2331		vfs_unbusy(mp, td);
2332	}
2333	mtx_unlock(&mountlist_mtx);
2334}
2335#endif
2336
2337/*
2338 * Top level filesystem related information gathering.
2339 */
2340static int	sysctl_ovfs_conf __P((SYSCTL_HANDLER_ARGS));
2341
2342static int
2343vfs_sysctl(SYSCTL_HANDLER_ARGS)
2344{
2345	int *name = (int *)arg1 - 1;	/* XXX */
2346	u_int namelen = arg2 + 1;	/* XXX */
2347	struct vfsconf *vfsp;
2348
2349#if 1 || defined(COMPAT_PRELITE2)
2350	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2351	if (namelen == 1)
2352		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2353#endif
2354
2355	/* XXX the below code does not compile; vfs_sysctl does not exist. */
2356#ifdef notyet
2357	/* all sysctl names at this level are at least name and field */
2358	if (namelen < 2)
2359		return (ENOTDIR);		/* overloaded */
2360	if (name[0] != VFS_GENERIC) {
2361		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2362			if (vfsp->vfc_typenum == name[0])
2363				break;
2364		if (vfsp == NULL)
2365			return (EOPNOTSUPP);
2366		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
2367		    oldp, oldlenp, newp, newlen, td));
2368	}
2369#endif
2370	switch (name[1]) {
2371	case VFS_MAXTYPENUM:
2372		if (namelen != 2)
2373			return (ENOTDIR);
2374		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2375	case VFS_CONF:
2376		if (namelen != 3)
2377			return (ENOTDIR);	/* overloaded */
2378		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2379			if (vfsp->vfc_typenum == name[2])
2380				break;
2381		if (vfsp == NULL)
2382			return (EOPNOTSUPP);
2383		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
2384	}
2385	return (EOPNOTSUPP);
2386}
2387
2388SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
2389	"Generic filesystem");
2390
2391#if 1 || defined(COMPAT_PRELITE2)
2392
2393static int
2394sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
2395{
2396	int error;
2397	struct vfsconf *vfsp;
2398	struct ovfsconf ovfs;
2399
2400	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
2401		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
2402		strcpy(ovfs.vfc_name, vfsp->vfc_name);
2403		ovfs.vfc_index = vfsp->vfc_typenum;
2404		ovfs.vfc_refcount = vfsp->vfc_refcount;
2405		ovfs.vfc_flags = vfsp->vfc_flags;
2406		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2407		if (error)
2408			return error;
2409	}
2410	return 0;
2411}
2412
2413#endif /* 1 || COMPAT_PRELITE2 */
2414
2415#if COMPILING_LINT
2416#define KINFO_VNODESLOP	10
2417/*
2418 * Dump vnode list (via sysctl).
2419 * Copyout address of vnode followed by vnode.
2420 */
2421/* ARGSUSED */
2422static int
2423sysctl_vnode(SYSCTL_HANDLER_ARGS)
2424{
2425	struct thread *td = curthread;	/* XXX */
2426	struct mount *mp, *nmp;
2427	struct vnode *nvp, *vp;
2428	int error;
2429
2430#define VPTRSZ	sizeof (struct vnode *)
2431#define VNODESZ	sizeof (struct vnode)
2432
2433	req->lock = 0;
2434	if (!req->oldptr) /* Make an estimate */
2435		return (SYSCTL_OUT(req, 0,
2436			(numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
2437
2438	mtx_lock(&mountlist_mtx);
2439	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2440		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
2441			nmp = TAILQ_NEXT(mp, mnt_list);
2442			continue;
2443		}
2444		mtx_lock(&mntvnode_mtx);
2445again:
2446		for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
2447		     vp != NULL;
2448		     vp = nvp) {
2449			/*
2450			 * Check that the vp is still associated with
2451			 * this filesystem.  RACE: could have been
2452			 * recycled onto the same filesystem.
2453			 */
2454			if (vp->v_mount != mp)
2455				goto again;
2456			nvp = TAILQ_NEXT(vp, v_nmntvnodes);
2457			mtx_unlock(&mntvnode_mtx);
2458			if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
2459			    (error = SYSCTL_OUT(req, vp, VNODESZ)))
2460				return (error);
2461			mtx_lock(&mntvnode_mtx);
2462		}
2463		mtx_unlock(&mntvnode_mtx);
2464		mtx_lock(&mountlist_mtx);
2465		nmp = TAILQ_NEXT(mp, mnt_list);
2466		vfs_unbusy(mp, td);
2467	}
2468	mtx_unlock(&mountlist_mtx);
2469
2470	return (0);
2471}
2472
2473/*
2474 * XXX
2475 * Exporting the vnode list on large systems causes them to crash.
2476 * Exporting the vnode list on medium systems causes sysctl to coredump.
2477 */
2478SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
2479	0, 0, sysctl_vnode, "S,vnode", "");
2480#endif
2481
2482/*
2483 * Check to see if a filesystem is mounted on a block device.
2484 */
2485int
2486vfs_mountedon(vp)
2487	struct vnode *vp;
2488{
2489
2490	if (vp->v_rdev->si_mountpoint != NULL)
2491		return (EBUSY);
2492	return (0);
2493}
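
/*
 * Illustrative sketch (not part of this file): mount code checks a
 * candidate device vnode with vfs_mountedon() before using it, e.g.:
 *
 *	if ((error = vfs_mountedon(devvp)) != 0)
 *		return (error);
 *
 * where `devvp' is the vnode of the device being mounted (hypothetical
 * variable name).
 */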
2494
2495/*
2496 * Unmount all filesystems. The list is traversed in reverse order
2497 * of mounting to avoid dependencies.
2498 */
2499void
2500vfs_unmountall()
2501{
2502	struct mount *mp;
2503	struct thread *td;
2504	int error;
2505
2506	if (curthread != NULL)
2507		td = curthread;
2508	else
2509		td = &initproc->p_thread;	/* XXX XXX should this be proc0? */
2510	/*
2511	 * Since this only runs when rebooting, it is not interlocked.
2512	 */
2513	while (!TAILQ_EMPTY(&mountlist)) {
2514		mp = TAILQ_LAST(&mountlist, mntlist);
2515		error = dounmount(mp, MNT_FORCE, td);
2516		if (error) {
2517			TAILQ_REMOVE(&mountlist, mp, mnt_list);
2518			printf("unmount of %s failed (",
2519			    mp->mnt_stat.f_mntonname);
2520			if (error == EBUSY)
2521				printf("BUSY)\n");
2522			else
2523				printf("%d)\n", error);
2524		} else {
2525			/* The unmount has removed mp from the mountlist */
2526		}
2527	}
2528}
2529
2530/*
2531 * Perform msync on all vnodes under a mount point.
2532 * The mount point must be locked.
2533 */
2534void
2535vfs_msync(struct mount *mp, int flags)
2536{
2537	struct vnode *vp, *nvp;
2538	struct vm_object *obj;
2539	int tries;
2540
2541	GIANT_REQUIRED;
2542
2543	tries = 5;
2544	mtx_lock(&mntvnode_mtx);
2545loop:
2546	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) {
2547		if (vp->v_mount != mp) {
2548			if (--tries > 0)
2549				goto loop;
2550			break;
2551		}
2552		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
2553
2554		if (vp->v_flag & VXLOCK)	/* XXX: what if MNT_WAIT? */
2555			continue;
2556
2557		if (vp->v_flag & VNOSYNC)	/* unlinked, skip it */
2558			continue;
2559
2560		if ((vp->v_flag & VOBJDIRTY) &&
2561		    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
2562			mtx_unlock(&mntvnode_mtx);
2563			if (!vget(vp,
2564			    LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curthread)) {
2565				if (VOP_GETVOBJECT(vp, &obj) == 0) {
2566					vm_object_page_clean(obj, 0, 0,
2567					    flags == MNT_WAIT ?
2568					    OBJPC_SYNC : OBJPC_NOSYNC);
2569				}
2570				vput(vp);
2571			}
2572			mtx_lock(&mntvnode_mtx);
2573			if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
2574				if (--tries > 0)
2575					goto loop;
2576				break;
2577			}
2578		}
2579	}
2580	mtx_unlock(&mntvnode_mtx);
2581}
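
/*
 * Illustrative sketch (not part of this file): callers usually push
 * dirty mmap()ed pages with vfs_msync() just before syncing the
 * filesystem itself, much as sync_fsync() does below, e.g.:
 *
 *	vfs_msync(mp, MNT_NOWAIT);
 *	error = VFS_SYNC(mp, MNT_NOWAIT, cred, td);
 */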
2582
2583/*
2584 * Create the VM object needed for VMIO and mmap support.  This
2585 * is done for all VREG files in the system.  Some filesystems may
2586 * also take advantage of the additional metadata buffering capability
2587 * of the VMIO code by putting the device node into VMIO mode as well.
2588 *
2589 * vp must be locked when vfs_object_create is called.
2590 */
2591int
2592vfs_object_create(vp, td, cred)
2593	struct vnode *vp;
2594	struct thread *td;
2595	struct ucred *cred;
2596{
2597	GIANT_REQUIRED;
2598	return (VOP_CREATEVOBJECT(vp, cred, td));
2599}
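
/*
 * Illustrative sketch (not part of this file, and an assumption about
 * the caller): code that opens a regular file typically creates the
 * backing VM object once the vnode is locked, e.g.:
 *
 *	if (vp->v_type == VREG)
 *		vfs_object_create(vp, td, cred);
 */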
2600
2601/*
2602 * Mark a vnode as free, putting it up for recycling.
2603 */
2604void
2605vfree(vp)
2606	struct vnode *vp;
2607{
2608	int s;
2609
2610	s = splbio();
2611	mtx_lock(&vnode_free_list_mtx);
2612	KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free"));
2613	if (vp->v_flag & VAGE) {
2614		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2615	} else {
2616		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
2617	}
2618	freevnodes++;
2619	mtx_unlock(&vnode_free_list_mtx);
2620	vp->v_flag &= ~VAGE;
2621	vp->v_flag |= VFREE;
2622	splx(s);
2623}
2624
2625/*
2626 * Opposite of vfree() - mark a vnode as in use.
2627 */
2628void
2629vbusy(vp)
2630	struct vnode *vp;
2631{
2632	int s;
2633
2634	s = splbio();
2635	mtx_lock(&vnode_free_list_mtx);
2636	KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free"));
2637	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2638	freevnodes--;
2639	mtx_unlock(&vnode_free_list_mtx);
2640	vp->v_flag &= ~(VFREE|VAGE);
2641	splx(s);
2642}
2643
2644/*
2645 * Record a process's interest in events which might happen to
2646 * a vnode.  Because poll uses the historic select-style interface
2647 * internally, this routine serves as both the ``check for any
2648 * pending events'' and the ``record my interest in future events''
2649 * functions.  (These are done together, while the lock is held,
2650 * to avoid race conditions.)
2651 */
2652int
2653vn_pollrecord(vp, td, events)
2654	struct vnode *vp;
2655	struct thread *td;
2656	short events;
2657{
2658	mtx_lock(&vp->v_pollinfo.vpi_lock);
2659	if (vp->v_pollinfo.vpi_revents & events) {
2660		/*
2661		 * This leaves events we are not interested
2662		 * in available for the other process which
2663		 * presumably had requested them
2664		 * (otherwise they would never have been
2665		 * recorded).
2666		 */
2667		events &= vp->v_pollinfo.vpi_revents;
2668		vp->v_pollinfo.vpi_revents &= ~events;
2669
2670		mtx_unlock(&vp->v_pollinfo.vpi_lock);
2671		return events;
2672	}
2673	vp->v_pollinfo.vpi_events |= events;
2674	selrecord(td, &vp->v_pollinfo.vpi_selinfo);
2675	mtx_unlock(&vp->v_pollinfo.vpi_lock);
2676	return 0;
2677}
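
/*
 * Illustrative sketch (not part of this file): a VOP_POLL
 * implementation with no event sources of its own can simply record
 * the caller's interest, e.g. (hypothetical "myfs" code):
 *
 *	static int
 *	myfs_poll(struct vop_poll_args *ap)
 *	{
 *		return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
 *	}
 */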
2678
2679/*
2680 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
2681 * it is possible for us to miss an event due to race conditions, but
2682 * that condition is expected to be rare, so for the moment it is the
2683 * preferred interface.
2684 */
2685void
2686vn_pollevent(vp, events)
2687	struct vnode *vp;
2688	short events;
2689{
2690	mtx_lock(&vp->v_pollinfo.vpi_lock);
2691	if (vp->v_pollinfo.vpi_events & events) {
2692		/*
2693		 * We clear vpi_events so that we don't
2694		 * call selwakeup() twice if two events are
2695		 * posted before the polling process(es) is
2696		 * awakened.  This also ensures that we take at
2697		 * most one selwakeup() if the polling process
2698		 * is no longer interested.  However, it does
2699		 * mean that only one event can be noticed at
2700		 * a time.  (Perhaps we should only clear those
2701		 * event bits which we note?) XXX
2702		 */
2703		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
2704		vp->v_pollinfo.vpi_revents |= events;
2705		selwakeup(&vp->v_pollinfo.vpi_selinfo);
2706	}
2707	mtx_unlock(&vp->v_pollinfo.vpi_lock);
2708}
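
/*
 * Illustrative sketch (not part of this file): filesystem code reports
 * an event on a vnode, for example when new data becomes readable,
 * with something like:
 *
 *	vn_pollevent(vp, POLLIN | POLLRDNORM);
 */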
2709
2710#define VN_KNOTE(vp, b) \
2711	KNOTE((struct klist *)&vp->v_pollinfo.vpi_selinfo.si_note, (b))
2712
2713/*
2714 * Wake up anyone polling on vp because it is being revoked.
2715 * This depends on dead_poll() returning POLLHUP for correct
2716 * behavior.
2717 */
2718void
2719vn_pollgone(vp)
2720	struct vnode *vp;
2721{
2722	mtx_lock(&vp->v_pollinfo.vpi_lock);
2723	VN_KNOTE(vp, NOTE_REVOKE);
2724	if (vp->v_pollinfo.vpi_events) {
2725		vp->v_pollinfo.vpi_events = 0;
2726		selwakeup(&vp->v_pollinfo.vpi_selinfo);
2727	}
2728	mtx_unlock(&vp->v_pollinfo.vpi_lock);
2729}
2730
2731
2732
2733/*
2734 * Routine to create and manage a filesystem syncer vnode.
2735 */
2736#define sync_close ((int (*) __P((struct  vop_close_args *)))nullop)
2737static int	sync_fsync __P((struct  vop_fsync_args *));
2738static int	sync_inactive __P((struct  vop_inactive_args *));
2739static int	sync_reclaim  __P((struct  vop_reclaim_args *));
2740#define sync_lock ((int (*) __P((struct  vop_lock_args *)))vop_nolock)
2741#define sync_unlock ((int (*) __P((struct  vop_unlock_args *)))vop_nounlock)
2742static int	sync_print __P((struct vop_print_args *));
2743#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)
2744
2745static vop_t **sync_vnodeop_p;
2746static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
2747	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
2748	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
2749	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
2750	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
2751	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
2752	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
2753	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
2754	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
2755	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
2756	{ NULL, NULL }
2757};
2758static struct vnodeopv_desc sync_vnodeop_opv_desc =
2759	{ &sync_vnodeop_p, sync_vnodeop_entries };
2760
2761VNODEOP_SET(sync_vnodeop_opv_desc);
2762
2763/*
2764 * Create a new filesystem syncer vnode for the specified mount point.
2765 */
2766int
2767vfs_allocate_syncvnode(mp)
2768	struct mount *mp;
2769{
2770	struct vnode *vp;
2771	static long start, incr, next;
2772	int error;
2773
2774	/* Allocate a new vnode */
2775	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
2776		mp->mnt_syncer = NULL;
2777		return (error);
2778	}
2779	vp->v_type = VNON;
2780	/*
2781	 * Place the vnode onto the syncer worklist. We attempt to
2782	 * scatter them about on the list so that they will go off
2783	 * at evenly distributed times even if all the filesystems
2784	 * are mounted at once.
2785	 */
2786	next += incr;
2787	if (next == 0 || next > syncer_maxdelay) {
2788		start /= 2;
2789		incr /= 2;
2790		if (start == 0) {
2791			start = syncer_maxdelay / 2;
2792			incr = syncer_maxdelay;
2793		}
2794		next = start;
2795	}
2796	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
2797	mp->mnt_syncer = vp;
2798	return (0);
2799}
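
/*
 * Illustrative sketch (not part of this file, and an assumption about
 * the mount path): a writable filesystem is normally given a syncer
 * vnode at mount time, roughly:
 *
 *	if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL)
 *		error = vfs_allocate_syncvnode(mp);
 */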
2800
2801/*
2802 * Do a lazy sync of the filesystem.
2803 */
2804static int
2805sync_fsync(ap)
2806	struct vop_fsync_args /* {
2807		struct vnode *a_vp;
2808		struct ucred *a_cred;
2809		int a_waitfor;
2810		struct thread *a_td;
2811	} */ *ap;
2812{
2813	struct vnode *syncvp = ap->a_vp;
2814	struct mount *mp = syncvp->v_mount;
2815	struct thread *td = ap->a_td;
2816	int asyncflag;
2817
2818	/*
2819	 * We only need to do something if this is a lazy evaluation.
2820	 */
2821	if (ap->a_waitfor != MNT_LAZY)
2822		return (0);
2823
2824	/*
2825	 * Move ourselves to the back of the sync list.
2826	 */
2827	vn_syncer_add_to_worklist(syncvp, syncdelay);
2828
2829	/*
2830	 * Walk the list of vnodes pushing all that are dirty and
2831	 * not already on the sync list.
2832	 */
2833	mtx_lock(&mountlist_mtx);
2834	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
2835		mtx_unlock(&mountlist_mtx);
2836		return (0);
2837	}
2838	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
2839		vfs_unbusy(mp, td);
2840		return (0);
2841	}
2842	asyncflag = mp->mnt_flag & MNT_ASYNC;
2843	mp->mnt_flag &= ~MNT_ASYNC;
2844	vfs_msync(mp, MNT_NOWAIT);
2845	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, td);
2846	if (asyncflag)
2847		mp->mnt_flag |= MNT_ASYNC;
2848	vn_finished_write(mp);
2849	vfs_unbusy(mp, td);
2850	return (0);
2851}
2852
2853/*
2854 * The syncer vnode is no longer referenced.
2855 */
2856static int
2857sync_inactive(ap)
2858	struct vop_inactive_args /* {
2859		struct vnode *a_vp;
2860		struct thread *a_td;
2861	} */ *ap;
2862{
2863
2864	vgone(ap->a_vp);
2865	return (0);
2866}
2867
2868/*
2869 * The syncer vnode is no longer needed and is being decommissioned.
2870 *
2871 * Modifications to the worklist must be protected at splbio().
2872 */
2873static int
2874sync_reclaim(ap)
2875	struct vop_reclaim_args /* {
2876		struct vnode *a_vp;
2877	} */ *ap;
2878{
2879	struct vnode *vp = ap->a_vp;
2880	int s;
2881
2882	s = splbio();
2883	vp->v_mount->mnt_syncer = NULL;
2884	if (vp->v_flag & VONWORKLST) {
2885		LIST_REMOVE(vp, v_synclist);
2886		vp->v_flag &= ~VONWORKLST;
2887	}
2888	splx(s);
2889
2890	return (0);
2891}
2892
2893/*
2894 * Print out a syncer vnode.
2895 */
2896static int
2897sync_print(ap)
2898	struct vop_print_args /* {
2899		struct vnode *a_vp;
2900	} */ *ap;
2901{
2902	struct vnode *vp = ap->a_vp;
2903
2904	printf("syncer vnode");
2905	if (vp->v_vnlock != NULL)
2906		lockmgr_printinfo(vp->v_vnlock);
2907	printf("\n");
2908	return (0);
2909}
2910
2911/*
2912 * Extract the dev_t from a VCHR vnode.
2913 */
2914dev_t
2915vn_todev(vp)
2916	struct vnode *vp;
2917{
2918	if (vp->v_type != VCHR)
2919		return (NODEV);
2920	return (vp->v_rdev);
2921}
2922
2923/*
2924 * Check whether the vnode represents a disk device.
2925 */
2926int
2927vn_isdisk(vp, errp)
2928	struct vnode *vp;
2929	int *errp;
2930{
2931	struct cdevsw *cdevsw;
2932
2933	if (vp->v_type != VCHR) {
2934		if (errp != NULL)
2935			*errp = ENOTBLK;
2936		return (0);
2937	}
2938	if (vp->v_rdev == NULL) {
2939		if (errp != NULL)
2940			*errp = ENXIO;
2941		return (0);
2942	}
2943	cdevsw = devsw(vp->v_rdev);
2944	if (cdevsw == NULL) {
2945		if (errp != NULL)
2946			*errp = ENXIO;
2947		return (0);
2948	}
2949	if (!(cdevsw->d_flags & D_DISK)) {
2950		if (errp != NULL)
2951			*errp = ENOTBLK;
2952		return (0);
2953	}
2954	if (errp != NULL)
2955		*errp = 0;
2956	return (1);
2957}
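
/*
 * Illustrative sketch (not part of this file): mount code validates a
 * device vnode with vn_isdisk() before treating it as a disk, e.g.:
 *
 *	int error;
 *
 *	if (!vn_isdisk(devvp, &error))
 *		return (error);
 *
 * where `devvp' is the vnode of the device to be mounted (hypothetical
 * variable name).
 */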
2958
2959/*
2960 * Free data allocated by namei(); see namei(9) for details.
2961 */
2962void
2963NDFREE(ndp, flags)
2964     struct nameidata *ndp;
2965     const uint flags;
2966{
2967	if (!(flags & NDF_NO_FREE_PNBUF) &&
2968	    (ndp->ni_cnd.cn_flags & HASBUF)) {
2969		zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
2970		ndp->ni_cnd.cn_flags &= ~HASBUF;
2971	}
2972	if (!(flags & NDF_NO_DVP_UNLOCK) &&
2973	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
2974	    ndp->ni_dvp != ndp->ni_vp)
2975		VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread);
2976	if (!(flags & NDF_NO_DVP_RELE) &&
2977	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
2978		vrele(ndp->ni_dvp);
2979		ndp->ni_dvp = NULL;
2980	}
2981	if (!(flags & NDF_NO_VP_UNLOCK) &&
2982	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
2983		VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread);
2984	if (!(flags & NDF_NO_VP_RELE) &&
2985	    ndp->ni_vp) {
2986		vrele(ndp->ni_vp);
2987		ndp->ni_vp = NULL;
2988	}
2989	if (!(flags & NDF_NO_STARTDIR_RELE) &&
2990	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
2991		vrele(ndp->ni_startdir);
2992		ndp->ni_startdir = NULL;
2993	}
2994}
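
/*
 * Illustrative sketch (not part of this file): a typical namei()
 * consumer frees the pathname buffer with NDFREE() once the lookup
 * has succeeded, e.g. (`path' and `td' are the caller's own values):
 *
 *	struct nameidata nd;
 *	int error;
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, path, td);
 *	if ((error = namei(&nd)) != 0)
 *		return (error);
 *	NDFREE(&nd, NDF_ONLY_PNBUF);
 *	... use nd.ni_vp, then vput() it ...
 *
 * NDF_ONLY_PNBUF restricts NDFREE() to releasing just the pathname
 * buffer; see namei(9) and sys/namei.h.
 */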
2995
2996/*
2997 * Common file system object access control check routine.  Accepts a
2998 * vnode's type, "mode", uid and gid, requested access mode, credentials,
2999 * and optional call-by-reference privused argument allowing vaccess()
3000 * to indicate to the caller whether privilege was used to satisfy the
3001 * request.  Returns 0 on success, or an errno on failure.
3002 */
3003int
3004vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
3005	enum vtype type;
3006	mode_t file_mode;
3007	uid_t file_uid;
3008	gid_t file_gid;
3009	mode_t acc_mode;
3010	struct ucred *cred;
3011	int *privused;
3012{
3013	mode_t dac_granted;
3014#ifdef CAPABILITIES
3015	mode_t cap_granted;
3016#endif
3017
3018	/*
3019	 * Look for a normal, non-privileged way to access the file/directory
3020	 * as requested.  If it exists, go with that.
3021	 */
3022
3023	if (privused != NULL)
3024		*privused = 0;
3025
3026	dac_granted = 0;
3027
3028	/* Check the owner. */
3029	if (cred->cr_uid == file_uid) {
3030		dac_granted |= VADMIN;
3031		if (file_mode & S_IXUSR)
3032			dac_granted |= VEXEC;
3033		if (file_mode & S_IRUSR)
3034			dac_granted |= VREAD;
3035		if (file_mode & S_IWUSR)
3036			dac_granted |= VWRITE;
3037
3038		if ((acc_mode & dac_granted) == acc_mode)
3039			return (0);
3040
3041		goto privcheck;
3042	}
3043
3044	/* Otherwise, check the groups (first match) */
3045	if (groupmember(file_gid, cred)) {
3046		if (file_mode & S_IXGRP)
3047			dac_granted |= VEXEC;
3048		if (file_mode & S_IRGRP)
3049			dac_granted |= VREAD;
3050		if (file_mode & S_IWGRP)
3051			dac_granted |= VWRITE;
3052
3053		if ((acc_mode & dac_granted) == acc_mode)
3054			return (0);
3055
3056		goto privcheck;
3057	}
3058
3059	/* Otherwise, check everyone else. */
3060	if (file_mode & S_IXOTH)
3061		dac_granted |= VEXEC;
3062	if (file_mode & S_IROTH)
3063		dac_granted |= VREAD;
3064	if (file_mode & S_IWOTH)
3065		dac_granted |= VWRITE;
3066	if ((acc_mode & dac_granted) == acc_mode)
3067		return (0);
3068
3069privcheck:
3070	if (!suser_xxx(cred, NULL, PRISON_ROOT)) {
3071		/* XXX audit: privilege used */
3072		if (privused != NULL)
3073			*privused = 1;
3074		return (0);
3075	}
3076
3077#ifdef CAPABILITIES
3078	/*
3079	 * Build a capability mask to determine if the set of capabilities
3080	 * satisfies the requirements when combined with the granted mask
3081	 * from above.
3082	 * For each capability, if the capability is required, bitwise
3083	 * or the request type onto the cap_granted mask.
3084	 */
3085	cap_granted = 0;
3086
3087	if (type == VDIR) {
3088		/*
3089		 * For directories, use CAP_DAC_READ_SEARCH to satisfy
3090		 * VEXEC requests, instead of CAP_DAC_EXECUTE.
3091		 */
3092		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3093		    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3094			cap_granted |= VEXEC;
3095	} else {
3096		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3097		    !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT))
3098			cap_granted |= VEXEC;
3099	}
3100
3101	if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
3102	    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3103		cap_granted |= VREAD;
3104
3105	if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3106	    !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT))
3107		cap_granted |= VWRITE;
3108
3109	if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3110	    !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT))
3111		cap_granted |= VADMIN;
3112
3113	if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
3114		/* XXX audit: privilege used */
3115		if (privused != NULL)
3116			*privused = 1;
3117		return (0);
3118	}
3119#endif
3120
3121	return ((acc_mode & VADMIN) ? EPERM : EACCES);
3122}
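
/*
 * Illustrative sketch (not part of this file): a filesystem's
 * VOP_ACCESS routine typically ends by handing its on-disk ownership
 * and mode bits to vaccess(); "ip" below stands for a hypothetical
 * filesystem-specific inode:
 *
 *	return (vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
 *	    ap->a_mode, ap->a_cred, NULL));
 */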
3123
3124