vfs_subr.c revision 148167
/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 */

/*
 * External virtual filesystem routines
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/vfs_subr.c 148167 2005-07-20 01:43:27Z jeff $");

#include "opt_ddb.h"
#include "opt_mac.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/extattr.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/reboot.h>
#include <sys/sleepqueue.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/stdarg.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>
#include <vm/uma.h>

static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

static void	delmntque(struct vnode *vp);
static void	insmntque(struct vnode *vp, struct mount *mp);
static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
		    int slpflag, int slptimeo);
static void	syncer_shutdown(void *arg, int howto);
static int	vtryrecycle(struct vnode *vp);
static void	vbusy(struct vnode *vp);
static void	vdropl(struct vnode *vp);
static void	vinactive(struct vnode *, struct thread *);
static void	v_incr_usecount(struct vnode *);
static void	v_decr_usecount(struct vnode *);
static void	v_decr_useonly(struct vnode *);
static void	vfree(struct vnode *);
static void	vnlru_free(int);
static void	vdestroy(struct vnode *);
static void	vgonel(struct vnode *);
static void	vfs_knllock(void *arg);
static void	vfs_knlunlock(void *arg);
static int	vfs_knllocked(void *arg);


/*
 * Enable Giant pushdown based on whether or not the vm is mpsafe in this
 * build.  Without mpsafevm the buffer cache cannot run Giant free.
 */
#if defined(__alpha__) || defined(__amd64__) || defined(__i386__)
int mpsafe_vfs = 1;
#else
int mpsafe_vfs;
#endif
TUNABLE_INT("debug.mpsafevfs", &mpsafe_vfs);
SYSCTL_INT(_debug, OID_AUTO, mpsafevfs, CTLFLAG_RD, &mpsafe_vfs, 0,
    "MPSAFE VFS");

/*
 * Number of vnodes in existence.  Increased by getnewvnode() and
 * decreased by vdestroy().
 */
static unsigned long	numvnodes;

SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");

/*
 * Conversion tables for converting from vnode types to inode formats
 * and back.
 */
131941Snateenum vtype iftovt_tab[16] = {
1327767Sache	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
1337767Sache	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
1347767Sache};
1357767Sacheint vttoif_tab[9] = {
1367767Sache	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
1377767Sache	S_IFSOCK, S_IFIFO, S_IFMT,
138941Snate};
13982722Skris
140941Snate/*
141941Snate * List of vnodes that are ready for recycling.
14287208Smarkm */
143941Snatestatic TAILQ_HEAD(freelst, vnode) vnode_free_list;
14482722Skris
14582722Skris/*
14682722Skris * Free vnode target.  Free vnodes may simply be files which have been stat'd
14782722Skris * but not read.  This is somewhat common, and a small cache of such files
14882722Skris * should be kept to avoid recreation costs.
14982722Skris */
15082722Skrisstatic u_long wantfreevnodes;
151941SnateSYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
152941Snate/* Number of vnodes in the free list. */
153941Snatestatic u_long freevnodes;
154941SnateSYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
1557767Sache
156941Snate/*
157941Snate * Various variables used for debugging the new implementation of
158941Snate * reassignbuf().
159941Snate * XXX these are probably of (very) limited utility now.
1607767Sache */
1617767Sachestatic int reassignbufcalls;
162941SnateSYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
1637767Sache
16480294Sobrien/*
16580294Sobrien * Cache for the mount type id assigned to NFS.  This is used for
1667767Sache * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
1677767Sache */
1687767Sacheint	nfs_mount_type = -1;
169941Snate
17054158Scharnier/* To keep more than one thread at a time from running vfs_getnewfsid */
171941Snatestatic struct mtx mntid_mtx;
1727767Sache
1737767Sache/*
17410154Sache * Lock for any access to the following:
1757767Sache *	vnode_free_list
17654158Scharnier *	numvnodes
17710154Sache *	freevnodes
1787767Sache */
1797767Sachestatic struct mtx vnode_free_list_mtx;
18080294Sobrien
18180294Sobrien/* Publicly exported FS */
1827767Sachestruct nfs_public nfs_pub;
183941Snate
184941Snate/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
18510154Sachestatic uma_zone_t vnode_zone;
18610154Sachestatic uma_zone_t vnodepoll_zone;
18710154Sache
18810154Sache/* Set to 1 to print out reclaim of active vnodes */
18910154Sacheint	prtactive;
19010154Sache
/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed. To realize this,
 * we append vnodes to a "workitem" queue. When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds. Thus, metadata updates are
 * delayed only about half the time that file data is delayed.
 * Similarly, directory updates are more critical, so they are delayed
 * only about a third of the time that file data is delayed. Thus,
 * there are SYNCER_MAXDELAY queues that are processed round-robin at
 * a rate of one each second (driven off the filesystem syncer process).
 * The syncer_delayno variable indicates the next queue that is to be
 * processed.  Items that need to be processed soon are placed in this
 * queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is achieved by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */
2167767Sachestatic int syncer_delayno;
2177767Sachestatic long syncer_mask;
2187767SacheLIST_HEAD(synclist, bufobj);
2197767Sachestatic struct synclist *syncer_workitem_pending;
2207767Sache/*
2217767Sache * The sync_mtx protects:
2227767Sache *	bo->bo_synclist
2237767Sache *	sync_vnode_count
2247767Sache *	syncer_delayno
2257767Sache *	syncer_state
22610154Sache *	syncer_workitem_pending
22711760Sache *	syncer_worklist_len
22811760Sache *	rushjob
22911760Sache */
23011760Sachestatic struct mtx sync_mtx;
2317767Sache
2327767Sache#define SYNCER_MAXDELAY		32
2337767Sachestatic int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
2347767Sachestatic int syncdelay = 30;		/* max time to delay syncing data */
2357767Sachestatic int filedelay = 30;		/* time to delay syncing files */
2367767SacheSYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
237941Snatestatic int dirdelay = 29;		/* time to delay syncing directories */
2387767SacheSYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
239941Snatestatic int metadelay = 28;		/* time to delay syncing metadata */
2407767SacheSYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
241941Snatestatic int rushjob;		/* number of slots to run ASAP */
2427767Sachestatic int stat_rush_requests;	/* number of times I/O speeded up */
2437767SacheSYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
2447767Sache
2457767Sache/*
2467767Sache * When shutting down the syncer, run it at four times normal speed.
247941Snate */
2487767Sache#define SYNCER_SHUTDOWN_SPEEDUP		4
249941Snatestatic int sync_vnode_count;
2507767Sachestatic int syncer_worklist_len;
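/*
 * Syncer state: SYNCER_RUNNING during normal operation,
 * SYNCER_SHUTTING_DOWN while the worklist is drained at shutdown, and
 * SYNCER_FINAL_DELAY for the last few passes before the syncer thread
 * is suspended.
 */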
25154158Scharnierstatic enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
252941Snate    syncer_state;
2537767Sache
/*
 * Number of vnodes we want to exist at any one time.  This is mostly used
 * to size hash tables in vnode-related code.  (It is normally not used in
 * getnewvnode(), as wantfreevnodes is normally nonzero.)
 *
 * XXX desiredvnodes is historical cruft and should not exist.
 */
2617767Sacheint desiredvnodes;
2627767SacheSYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
2637767Sache    &desiredvnodes, 0, "Maximum number of vnodes");
2647767SacheSYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
2657767Sache    &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
2667767Sachestatic int vnlru_nowhere;
267941SnateSYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
26810154Sache    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
26954158Scharnier
270941Snate/* Hook for calling soft updates. */
27110154Sacheint (*softdep_process_worklist_hook)(struct mount *);
27210154Sache
273941Snate/*
27410154Sache * Macros to control when a vnode is freed and recycled.  All require
27510154Sache * the vnode interlock.
27610154Sache */
277941Snate#define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
27810154Sache#define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
27910154Sache#define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)
28054158Scharnier
28110154Sache
2827767Sache/*
2837767Sache * Initialize the vnode management data structures.
2847767Sache */
2857767Sache#ifndef	MAXVNODES_MAX
2867767Sache#define	MAXVNODES_MAX	100000
2877767Sache#endif
2887767Sachestatic void
28954158Scharniervntblinit(void *dummy __unused)
290941Snate{
2917767Sache
29254158Scharnier	/*
293941Snate	 * Desiredvnodes is a function of the physical memory size and
2947767Sache	 * the kernel's heap size.  Specifically, desiredvnodes scales
29554158Scharnier	 * in proportion to the physical memory size until two fifths
296941Snate	 * of the kernel's heap size is consumed by vnodes and vm
2977767Sache	 * objects.
298941Snate	 */
2998112Sache	desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
3008112Sache	    (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
3018112Sache	if (desiredvnodes > MAXVNODES_MAX) {
3028112Sache		if (bootverbose)
3038112Sache			printf("Reducing kern.maxvnodes %d -> %d\n",
3048112Sache			    desiredvnodes, MAXVNODES_MAX);
30510154Sache		desiredvnodes = MAXVNODES_MAX;
3067767Sache	}
3077767Sache	wantfreevnodes = desiredvnodes / 4;
3087767Sache	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
309941Snate	TAILQ_INIT(&vnode_free_list);
3107767Sache	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
3117767Sache	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
3127767Sache	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
3137767Sache	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
3147767Sache	      NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
3157767Sache	/*
316941Snate	 * Initialize the filesystem syncer.
3177767Sache	 */
31854158Scharnier	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
319941Snate		&syncer_mask);
3207767Sache	syncer_maxdelay = syncer_mask + 1;
3217767Sache	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
3227767Sache}
3237767SacheSYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)
3247767Sache
3257767Sache
326941Snate/*
32710154Sache * Mark a mount point as busy. Used to synchronize access and to delay
32822873Sdavidn * unmounting. Interlock is not released on failure.
3297767Sache */
33010154Sacheint
3317767Sachevfs_busy(mp, flags, interlkp, td)
3327767Sache	struct mount *mp;
3337767Sache	int flags;
334941Snate	struct mtx *interlkp;
3357767Sache	struct thread *td;
3367767Sache{
3377767Sache	int lkflags;
3387767Sache
33954158Scharnier	MNT_ILOCK(mp);
3407767Sache	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
34122873Sdavidn		if (flags & LK_NOWAIT) {
34222873Sdavidn			MNT_IUNLOCK(mp);
343941Snate			return (ENOENT);
3447767Sache		}
3457767Sache		if (interlkp)
3467767Sache			mtx_unlock(interlkp);
347941Snate		mp->mnt_kern_flag |= MNTK_MWAIT;
3487767Sache		/*
3497767Sache		 * Since all busy locks are shared except the exclusive
35054158Scharnier		 * lock granted when unmounting, the only place that a
3517767Sache		 * wakeup needs to be done is at the release of the
3527767Sache		 * exclusive lock at the end of dounmount.
3537767Sache		 */
3547767Sache		msleep(mp, MNT_MTX(mp), PVFS|PDROP, "vfs_busy", 0);
3557767Sache		if (interlkp)
3567767Sache			mtx_lock(interlkp);
357941Snate		return (ENOENT);
3587767Sache	}
3597767Sache	if (interlkp)
3607767Sache		mtx_unlock(interlkp);
3617767Sache	lkflags = LK_SHARED | LK_INTERLOCK;
3627767Sache	if (lockmgr(&mp->mnt_lock, lkflags, MNT_MTX(mp), td))
36387208Smarkm		panic("vfs_busy: unexpected lock failure");
3647767Sache	return (0);
3657767Sache}
3667767Sache
/*
 * Release a filesystem previously busied with vfs_busy().
 */
3707767Sachevoid
3717767Sachevfs_unbusy(mp, td)
372941Snate	struct mount *mp;
3737767Sache	struct thread *td;
3747767Sache{
3757767Sache
3767767Sache	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
3777767Sache}
3787767Sache
3797767Sache/*
3807767Sache * Lookup a mount point by filesystem identifier.
3817767Sache */
38210154Sachestruct mount *
38310154Sachevfs_getvfs(fsid)
38410154Sache	fsid_t *fsid;
38510154Sache{
38610154Sache	struct mount *mp;
38710154Sache
38810154Sache	mtx_lock(&mountlist_mtx);
38910154Sache	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
39010154Sache		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
39110154Sache		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
39210154Sache			mtx_unlock(&mountlist_mtx);
39310154Sache			return (mp);
3947767Sache		}
395941Snate	}
3967767Sache	mtx_unlock(&mountlist_mtx);
3977767Sache	return ((struct mount *) 0);
3987767Sache}
3997767Sache
/*
 * Check if a user can access privileged mount options.
 */
4037767Sacheint
4047767Sachevfs_suser(struct mount *mp, struct thread *td)
4057767Sache{
4067767Sache	int error;
4077767Sache
4087767Sache	if ((mp->mnt_flag & MNT_USER) == 0 ||
4097767Sache	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
4107767Sache		if ((error = suser(td)) != 0)
4117767Sache			return (error);
4127767Sache	}
4137767Sache	return (0);
4147767Sache}
41510154Sache
/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
 * support 16-bit device numbers.  We end up with unique val[0]'s for the
 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 *
 * Keep in mind that several mounts may be running in parallel.  Starting
 * the search one past where the previous search terminated is both a
 * micro-optimization and a defense against returning the same fsid to
 * different mounts.
 */
4287767Sachevoid
4297767Sachevfs_getnewfsid(mp)
43054158Scharnier	struct mount *mp;
43110154Sache{
4327767Sache	static u_int16_t mntid_base;
43354158Scharnier	fsid_t tfsid;
434941Snate	int mtype;
4357767Sache
436941Snate	mtx_lock(&mntid_mtx);
4377767Sache	mtype = mp->mnt_vfc->vfc_typenum;
4387767Sache	tfsid.val[1] = mtype;
439941Snate	mtype = (mtype & 0xFF) << 24;
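	/*
	 * val[0] carries the filesystem type in the top byte of the minor
	 * number and the 16-bit mntid split across bytes 2 and 0, giving
	 * the uniqueness properties described above.
	 */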
4407767Sache	for (;;) {
44154158Scharnier		tfsid.val[0] = makedev(255,
442941Snate		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
4437767Sache		mntid_base++;
44410154Sache		if (vfs_getvfs(&tfsid) == NULL)
445941Snate			break;
446941Snate	}
447941Snate	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
448941Snate	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
449941Snate	mtx_unlock(&mntid_mtx);
45010154Sache}
4517767Sache
4527767Sache/*
4537767Sache * Knob to control the precision of file timestamps:
4547767Sache *
4557767Sache *   0 = seconds only; nanoseconds zeroed.
4567767Sache *   1 = seconds and nanoseconds, accurate within 1/HZ.
4577767Sache *   2 = seconds and nanoseconds, truncated to microseconds.
4587767Sache * >=3 = seconds and nanoseconds, maximum precision.
4597767Sache */
46010154Sacheenum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
4617767Sache
4627767Sachestatic int timestamp_precision = TSP_SEC;
4637767SacheSYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
46440389Smckay    &timestamp_precision, 0, "");
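
/*
 * For example, "sysctl vfs.timestamp_precision=3" (from userland) selects
 * full nanosecond resolution for newly set timestamps.
 */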
46540389Smckay
46640389Smckay/*
46740389Smckay * Get a current timestamp.
468941Snate */
4697767Sachevoid
470941Snatevfs_timestamp(tsp)
4717767Sache	struct timespec *tsp;
47254158Scharnier{
473941Snate	struct timeval tv;
4747767Sache
47554158Scharnier	switch (timestamp_precision) {
476941Snate	case TSP_SEC:
47710154Sache		tsp->tv_sec = time_second;
4787767Sache		tsp->tv_nsec = 0;
4797767Sache		break;
4807767Sache	case TSP_HZ:
48154158Scharnier		getnanotime(tsp);
48210154Sache		break;
4837767Sache	case TSP_USEC:
4847767Sache		microtime(&tv);
4857767Sache		TIMEVAL_TO_TIMESPEC(&tv, tsp);
4867767Sache		break;
4877767Sache	case TSP_NSEC:
4887767Sache	default:
4897767Sache		nanotime(tsp);
490941Snate		break;
49110154Sache	}
4927767Sache}
493941Snate
4947767Sache/*
4957767Sache * Set vnode attributes to VNOVAL
496941Snate */
4977767Sachevoid
4987767Sachevattr_null(vap)
49987208Smarkm	struct vattr *vap;
5007767Sache{
5017767Sache
5027767Sache	vap->va_type = VNON;
5037767Sache	vap->va_size = VNOVAL;
5047767Sache	vap->va_bytes = VNOVAL;
505941Snate	vap->va_mode = VNOVAL;
50610154Sache	vap->va_nlink = VNOVAL;
50710154Sache	vap->va_uid = VNOVAL;
50810154Sache	vap->va_gid = VNOVAL;
50910154Sache	vap->va_fsid = VNOVAL;
51010154Sache	vap->va_fileid = VNOVAL;
51110154Sache	vap->va_blocksize = VNOVAL;
5127767Sache	vap->va_rdev = VNOVAL;
5137767Sache	vap->va_atime.tv_sec = VNOVAL;
514941Snate	vap->va_atime.tv_nsec = VNOVAL;
515941Snate	vap->va_mtime.tv_sec = VNOVAL;
516941Snate	vap->va_mtime.tv_nsec = VNOVAL;
51710154Sache	vap->va_ctime.tv_sec = VNOVAL;
518941Snate	vap->va_ctime.tv_nsec = VNOVAL;
5197767Sache	vap->va_birthtime.tv_sec = VNOVAL;
5207767Sache	vap->va_birthtime.tv_nsec = VNOVAL;
5217767Sache	vap->va_flags = VNOVAL;
5227767Sache	vap->va_gen = VNOVAL;
52310154Sache	vap->va_vaflags = 0;
52410154Sache}
52510154Sache
52610154Sache/*
/*
 * This routine is called when we have too many vnodes.  It attempts
 * to recycle a portion of the given mount point's vnodes, and will
 * potentially free vnodes that still have VM backing store (VM backing
 * store is typically the cause of a vnode blowout so we want to do
 * this).  Therefore, this operation is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed:
 * the buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.   It is not
 * desirable to reuse such vnodes.  These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 */
54210154Sachevlrureclaim(struct mount *mp)
54310154Sache{
54410154Sache	struct thread *td;
54554158Scharnier	struct vnode *vp;
54610154Sache	int done;
54710154Sache	int trigger;
54810154Sache	int usevnodes;
5497860Sache	int count;
55010154Sache
	/*
	 * Calculate the trigger point; don't allow user
	 * screwups to blow us up.   This prevents us from
	 * recycling vnodes with lots of resident pages.  We
	 * aren't trying to free memory; we are trying to
	 * free vnodes.
	 */
55810154Sache	usevnodes = desiredvnodes;
55910154Sache	if (usevnodes <= 0)
56010154Sache		usevnodes = 1;
56110154Sache	trigger = cnt.v_page_count * 2 / usevnodes;
56210154Sache	done = 0;
56310154Sache	td = curthread;
56410154Sache	vn_start_write(NULL, &mp, V_WAIT);
56510154Sache	MNT_ILOCK(mp);
56610154Sache	count = mp->mnt_nvnodelistsize / 10 + 1;
56710154Sache	while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
56810154Sache		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
56910154Sache		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
57010154Sache		--count;
57110154Sache		if (!VI_TRYLOCK(vp))
57210154Sache			continue;
57310154Sache		/*
57410154Sache		 * If it's been deconstructed already, it's still
57510154Sache		 * referenced, or it exceeds the trigger, skip it.
57610154Sache		 */
57710154Sache		if ((vp->v_iflag & VI_DOOMED) != 0 || vp->v_usecount ||
57810154Sache		    !LIST_EMPTY(&(vp)->v_cache_src) || (vp->v_object != NULL &&
57954158Scharnier		    vp->v_object->resident_page_count > trigger)) {
58010154Sache			VI_UNLOCK(vp);
58110154Sache			continue;
58210154Sache		}
58310154Sache		MNT_IUNLOCK(mp);
58410154Sache		vholdl(vp);
58510154Sache		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE, td)) {
58610154Sache			vdrop(vp);
58710154Sache			MNT_ILOCK(mp);
58854158Scharnier			continue;
58954158Scharnier		}
59010154Sache		VI_LOCK(vp);
59110154Sache		vgonel(vp);
5927860Sache		VOP_UNLOCK(vp, 0, td);
5937767Sache		vdropl(vp);
5947767Sache		done++;
595941Snate		MNT_ILOCK(mp);
596941Snate	}
5977767Sache	MNT_IUNLOCK(mp);
598941Snate	vn_finished_write(mp);
5997767Sache	return done;
6007767Sache}
6017767Sache
6027767Sache/*
603941Snate * Attempt to keep the free list at wantfreevnodes length.
6047767Sache */
60587208Smarkmstatic void
6067767Sachevnlru_free(int count)
6077767Sache{
608941Snate	struct vnode *vp;
6097767Sache
610941Snate	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
6117767Sache	for (; count > 0; count--) {
6127767Sache		vp = TAILQ_FIRST(&vnode_free_list);
6137767Sache		/*
6147767Sache		 * The list can be modified while the free_list_mtx
6157767Sache		 * has been dropped and vp could be NULL here.
6167767Sache		 */
617941Snate		if (!vp)
61882722Skris			break;
61982722Skris		VNASSERT(vp->v_op != NULL, vp,
6207767Sache		    ("vnlru_free: vnode already reclaimed."));
6217767Sache		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
6227767Sache		/*
6237767Sache		 * Don't recycle if we can't get the interlock.
6247767Sache		 */
6257767Sache		if (!VI_TRYLOCK(vp)) {
6267767Sache			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
6277767Sache			continue;
6287767Sache		}
6297767Sache		VNASSERT(VCANRECYCLE(vp), vp,
6307767Sache		    ("vp inconsistent on freelist"));
6317767Sache		freevnodes--;
6327767Sache		vp->v_iflag &= ~VI_FREE;
6337767Sache		vholdl(vp);
634941Snate		mtx_unlock(&vnode_free_list_mtx);
6357767Sache		VI_UNLOCK(vp);
6367767Sache		vtryrecycle(vp);
		/*
		 * If the recycle succeeded, this vdrop will actually free
		 * the vnode.  If not, it will simply be placed back on
		 * the free list.
		 */
6427767Sache		vdrop(vp);
643941Snate		mtx_lock(&vnode_free_list_mtx);
6447767Sache	}
6457767Sache}
/*
 * Attempt to recycle vnodes in a context that is always safe to block.
 * Calling vlrureclaim() from the bowels of filesystem code has some
 * interesting deadlock problems.
 */
65110154Sachestatic struct proc *vnlruproc;
6527767Sachestatic int vnlruproc_sig;
6537767Sache
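/*
 * Kernel thread body: keep the free list trimmed toward wantfreevnodes
 * and, when the vnode count approaches desiredvnodes, walk the mount
 * list reclaiming vnodes via vlrureclaim().
 */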
6547767Sachestatic void
655941Snatevnlru_proc(void)
6567767Sache{
6577767Sache	struct mount *mp, *nmp;
6587767Sache	int done;
659941Snate	struct proc *p = vnlruproc;
6607767Sache	struct thread *td = FIRST_THREAD_IN_PROC(p);
6617767Sache
662941Snate	mtx_lock(&Giant);
6637767Sache
6647767Sache	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
6657767Sache	    SHUTDOWN_PRI_FIRST);
666941Snate
6677767Sache	for (;;) {
6687767Sache		kthread_suspend_check(p);
6697767Sache		mtx_lock(&vnode_free_list_mtx);
670941Snate		if (freevnodes > wantfreevnodes)
6717767Sache			vnlru_free(freevnodes - wantfreevnodes);
6727767Sache		if (numvnodes <= desiredvnodes * 9 / 10) {
6737767Sache			vnlruproc_sig = 0;
674941Snate			wakeup(&vnlruproc_sig);
6757767Sache			msleep(vnlruproc, &vnode_free_list_mtx,
6767767Sache			    PVFS|PDROP, "vlruwt", hz);
6777767Sache			continue;
678941Snate		}
6797767Sache		mtx_unlock(&vnode_free_list_mtx);
6807767Sache		done = 0;
6817767Sache		mtx_lock(&mountlist_mtx);
682941Snate		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
6837767Sache			int vfsunlocked;
6847767Sache			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
6857767Sache				nmp = TAILQ_NEXT(mp, mnt_list);
686941Snate				continue;
6877767Sache			}
6887767Sache			if (!VFS_NEEDSGIANT(mp)) {
6897767Sache				mtx_unlock(&Giant);
690941Snate				vfsunlocked = 1;
69110154Sache			} else
69210154Sache				vfsunlocked = 0;
69310154Sache			done += vlrureclaim(mp);
69410154Sache			if (vfsunlocked)
69510154Sache				mtx_lock(&Giant);
6967767Sache			mtx_lock(&mountlist_mtx);
6977767Sache			nmp = TAILQ_NEXT(mp, mnt_list);
6987767Sache			vfs_unbusy(mp, td);
6997767Sache		}
7007767Sache		mtx_unlock(&mountlist_mtx);
7017767Sache		if (done == 0) {
702941Snate#if 0
7037767Sache			/* These messages are temporary debugging aids */
70482722Skris			if (vnlru_nowhere < 5)
70582722Skris				printf("vnlru process getting nowhere..\n");
70682722Skris			else if (vnlru_nowhere == 5)
707941Snate				printf("vnlru process messages stopped.\n");
7087767Sache#endif
7097767Sache			vnlru_nowhere++;
7107767Sache			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
71154158Scharnier		} else
7127767Sache			uio_yield();
7137767Sache	}
714941Snate}
7157767Sache
7167767Sachestatic struct kproc_desc vnlru_kp = {
7177767Sache	"vnlru",
7187767Sache	vnlru_proc,
7197767Sache	&vnlruproc
7207767Sache};
7217767SacheSYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
7227767Sache
7237767Sache/*
72410154Sache * Routines having to do with the management of the vnode table.
7257767Sache */
7267767Sache
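/*
 * Free a vnode that is no longer referenced.  The vnode must not be on
 * the free list, must have no remaining holds, uses, or buffers, and is
 * returned to the vnode zone.
 */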
72710154Sachestatic void
72810154Sachevdestroy(struct vnode *vp)
72910154Sache{
73010154Sache	struct bufobj *bo;
73110154Sache
7327767Sache	CTR1(KTR_VFS, "vdestroy vp %p", vp);
7337767Sache	mtx_lock(&vnode_free_list_mtx);
7347767Sache	numvnodes--;
7357767Sache	mtx_unlock(&vnode_free_list_mtx);
7367767Sache	bo = &vp->v_bufobj;
7377767Sache	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
738941Snate	    ("cleaned vnode still on the free list."));
7397767Sache	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
7407767Sache	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
7417767Sache	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
7427767Sache	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
7437767Sache	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
7447767Sache	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
7457767Sache	VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL"));
7467767Sache	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
7477767Sache	VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL"));
7487767Sache	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
7497767Sache	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
7507767Sache#ifdef MAC
7517767Sache	mac_destroy_vnode(vp);
75210154Sache#endif
7537767Sache	if (vp->v_pollinfo != NULL) {
7547767Sache		knlist_destroy(&vp->v_pollinfo->vpi_selinfo.si_note);
7557767Sache		mtx_destroy(&vp->v_pollinfo->vpi_lock);
7567767Sache		uma_zfree(vnodepoll_zone, vp->v_pollinfo);
7577767Sache	}
7587767Sache#ifdef INVARIANTS
7597767Sache	/* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */
7607767Sache	vp->v_op = NULL;
7617767Sache#endif
7627767Sache	lockdestroy(vp->v_vnlock);
76354158Scharnier	mtx_destroy(&vp->v_interlock);
7647767Sache	uma_zfree(vnode_zone, vp);
7657767Sache}
7667767Sache
767941Snate/*
768 * Try to recycle a freed vnode.  We abort if anyone picks up a reference
769 * before we actually vgone().  This function must be called with the vnode
770 * held to prevent the vnode from being returned to the free list midway
771 * through vgone().
772 */
773static int
774vtryrecycle(struct vnode *vp)
775{
776	struct thread *td = curthread;
777	struct mount *vnmp;
778
779	CTR1(KTR_VFS, "vtryrecycle: trying vp %p", vp);
780	VNASSERT(vp->v_holdcnt, vp,
781	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
	/*
	 * This vnode may be found and locked via some other list; if so we
	 * can't recycle it yet.
	 */
786	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
787		return (EWOULDBLOCK);
788	/*
789	 * Don't recycle if its filesystem is being suspended.
790	 */
791	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
792		VOP_UNLOCK(vp, 0, td);
793		return (EBUSY);
794	}
795	/*
796	 * If we got this far, we need to acquire the interlock and see if
797	 * anyone picked up this vnode from another list.  If not, we will
798	 * mark it with DOOMED via vgonel() so that anyone who does find it
799	 * will skip over it.
800	 */
801	VI_LOCK(vp);
802	if (vp->v_usecount) {
803		VOP_UNLOCK(vp, LK_INTERLOCK, td);
804		vn_finished_write(vnmp);
805		return (EBUSY);
806	}
807	if ((vp->v_iflag & VI_DOOMED) == 0)
808		vgonel(vp);
809	VOP_UNLOCK(vp, LK_INTERLOCK, td);
810	vn_finished_write(vnmp);
811	CTR1(KTR_VFS, "vtryrecycle: recycled vp %p", vp);
812	return (0);
813}
814
/*
 * Allocate a new vnode.  The free list is only trimmed here as a side
 * effect of keeping the total vnode count near its limits.
 */
818int
819getnewvnode(tag, mp, vops, vpp)
820	const char *tag;
821	struct mount *mp;
822	struct vop_vector *vops;
823	struct vnode **vpp;
824{
825	struct vnode *vp = NULL;
826	struct bufobj *bo;
827
828	mtx_lock(&vnode_free_list_mtx);
	/*
	 * Lend our context to reclaim vnodes if the free list has grown
	 * past its target.
	 */
832	if (freevnodes > wantfreevnodes)
833		vnlru_free(1);
834	/*
835	 * Wait for available vnodes.
836	 */
837	if (numvnodes > desiredvnodes) {
838		if (vnlruproc_sig == 0) {
839			vnlruproc_sig = 1;      /* avoid unnecessary wakeups */
840			wakeup(vnlruproc);
841		}
842		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
843		    "vlruwk", hz);
844#if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
845		if (numvnodes > desiredvnodes) {
846			mtx_unlock(&vnode_free_list_mtx);
847			return (ENFILE);
848		}
849#endif
850	}
851	numvnodes++;
852	mtx_unlock(&vnode_free_list_mtx);
853	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
854	/*
855	 * Setup locks.
856	 */
857	vp->v_vnlock = &vp->v_lock;
858	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
859	/*
860	 * By default, don't allow shared locks unless filesystems
861	 * opt-in.
862	 */
863	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE);
864	/*
865	 * Initialize bufobj.
866	 */
867	bo = &vp->v_bufobj;
868	bo->__bo_vnode = vp;
869	bo->bo_mtx = &vp->v_interlock;
870	bo->bo_ops = &buf_ops_bio;
871	bo->bo_private = vp;
872	TAILQ_INIT(&bo->bo_clean.bv_hd);
873	TAILQ_INIT(&bo->bo_dirty.bv_hd);
874	/*
875	 * Initialize namecache.
876	 */
877	LIST_INIT(&vp->v_cache_src);
878	TAILQ_INIT(&vp->v_cache_dst);
879	/*
880	 * Finalize various vnode identity bits.
881	 */
882	vp->v_type = VNON;
883	vp->v_tag = tag;
884	vp->v_op = vops;
885	v_incr_usecount(vp);
886	vp->v_data = 0;
887#ifdef MAC
888	mac_init_vnode(vp);
889	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
890		mac_associate_vnode_singlelabel(mp, vp);
891	else if (mp == NULL)
892		printf("NULL mp in getnewvnode()\n");
893#endif
894	delmntque(vp);
895	if (mp != NULL) {
896		insmntque(vp, mp);
897		bo->bo_bsize = mp->mnt_stat.f_iosize;
898	}
899
900	CTR2(KTR_VFS, "getnewvnode: mp %p vp %p", mp, vp);
901	*vpp = vp;
902	return (0);
903}
904
905/*
906 * Delete from old mount point vnode list, if on one.
907 */
908static void
909delmntque(struct vnode *vp)
910{
911	struct mount *mp;
912
913	if (vp->v_mount == NULL)
914		return;
915	mp = vp->v_mount;
916	MNT_ILOCK(mp);
917	vp->v_mount = NULL;
918	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
919		("bad mount point vnode list size"));
920	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
921	mp->mnt_nvnodelistsize--;
922	MNT_IUNLOCK(mp);
923}
924
925/*
926 * Insert into list of vnodes for the new mount point, if available.
927 */
928static void
929insmntque(struct vnode *vp, struct mount *mp)
930{
931
932	vp->v_mount = mp;
933	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
934	MNT_ILOCK(vp->v_mount);
935	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
936	mp->mnt_nvnodelistsize++;
937	MNT_IUNLOCK(vp->v_mount);
938}
939
940/*
941 * Flush out and invalidate all buffers associated with a bufobj
942 * Called with the underlying object locked.
943 */
944int
945bufobj_invalbuf(struct bufobj *bo, int flags, struct thread *td, int slpflag, int slptimeo)
946{
947	int error;
948
949	BO_LOCK(bo);
950	if (flags & V_SAVE) {
951		error = bufobj_wwait(bo, slpflag, slptimeo);
952		if (error) {
953			BO_UNLOCK(bo);
954			return (error);
955		}
956		if (bo->bo_dirty.bv_cnt > 0) {
957			BO_UNLOCK(bo);
958			if ((error = BO_SYNC(bo, MNT_WAIT, td)) != 0)
959				return (error);
960			/*
961			 * XXX We could save a lock/unlock if this was only
962			 * enabled under INVARIANTS
963			 */
964			BO_LOCK(bo);
965			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
966				panic("vinvalbuf: dirty bufs");
967		}
968	}
969	/*
970	 * If you alter this loop please notice that interlock is dropped and
971	 * reacquired in flushbuflist.  Special care is needed to ensure that
972	 * no race conditions occur from this.
973	 */
974	do {
975		error = flushbuflist(&bo->bo_clean,
976		    flags, bo, slpflag, slptimeo);
977		if (error == 0)
978			error = flushbuflist(&bo->bo_dirty,
979			    flags, bo, slpflag, slptimeo);
980		if (error != 0 && error != EAGAIN) {
981			BO_UNLOCK(bo);
982			return (error);
983		}
984	} while (error != 0);
985
986	/*
987	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
988	 * have write I/O in-progress but if there is a VM object then the
989	 * VM object can also have read-I/O in-progress.
990	 */
991	do {
992		bufobj_wwait(bo, 0, 0);
993		BO_UNLOCK(bo);
994		if (bo->bo_object != NULL) {
995			VM_OBJECT_LOCK(bo->bo_object);
996			vm_object_pip_wait(bo->bo_object, "bovlbx");
997			VM_OBJECT_UNLOCK(bo->bo_object);
998		}
999		BO_LOCK(bo);
1000	} while (bo->bo_numoutput > 0);
1001	BO_UNLOCK(bo);
1002
1003	/*
1004	 * Destroy the copy in the VM cache, too.
1005	 */
1006	if (bo->bo_object != NULL) {
1007		VM_OBJECT_LOCK(bo->bo_object);
1008		vm_object_page_remove(bo->bo_object, 0, 0,
1009			(flags & V_SAVE) ? TRUE : FALSE);
1010		VM_OBJECT_UNLOCK(bo->bo_object);
1011	}
1012
1013#ifdef INVARIANTS
1014	BO_LOCK(bo);
1015	if ((flags & (V_ALT | V_NORMAL)) == 0 &&
1016	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
1017		panic("vinvalbuf: flush failed");
1018	BO_UNLOCK(bo);
1019#endif
1020	return (0);
1021}
1022
1023/*
1024 * Flush out and invalidate all buffers associated with a vnode.
1025 * Called with the underlying object locked.
1026 */
1027int
1028vinvalbuf(struct vnode *vp, int flags, struct thread *td, int slpflag, int slptimeo)
1029{
1030
1031	CTR2(KTR_VFS, "vinvalbuf vp %p flags %d", vp, flags);
1032	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
1033	return (bufobj_invalbuf(&vp->v_bufobj, flags, td, slpflag, slptimeo));
1034}
1035
1036/*
1037 * Flush out buffers on the specified list.
1038 *
1039 */
1040static int
1041flushbuflist(bufv, flags, bo, slpflag, slptimeo)
1042	struct bufv *bufv;
1043	int flags;
1044	struct bufobj *bo;
1045	int slpflag, slptimeo;
1046{
1047	struct buf *bp, *nbp;
1048	int retval, error;
1049
1050	ASSERT_BO_LOCKED(bo);
1051
1052	retval = 0;
1053	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
1054		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1055		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1056			continue;
1057		}
1058		retval = EAGAIN;
1059		error = BUF_TIMELOCK(bp,
1060		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo),
1061		    "flushbuf", slpflag, slptimeo);
1062		if (error) {
1063			BO_LOCK(bo);
1064			return (error != ENOLCK ? error : EAGAIN);
1065		}
1066		KASSERT(bp->b_bufobj == bo,
1067	            ("bp %p wrong b_bufobj %p should be %p",
1068		    bp, bp->b_bufobj, bo));
1069		if (bp->b_bufobj != bo) {	/* XXX: necessary ? */
1070			BUF_UNLOCK(bp);
1071			BO_LOCK(bo);
1072			return (EAGAIN);
1073		}
1074		/*
1075		 * XXX Since there are no node locks for NFS, I
1076		 * believe there is a slight chance that a delayed
1077		 * write will occur while sleeping just above, so
1078		 * check for it.
1079		 */
1080		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1081		    (flags & V_SAVE)) {
1082			bremfree(bp);
1083			bp->b_flags |= B_ASYNC;
1084			bwrite(bp);
1085			BO_LOCK(bo);
1086			return (EAGAIN);	/* XXX: why not loop ? */
1087		}
1088		bremfree(bp);
1089		bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
1090		bp->b_flags &= ~B_ASYNC;
1091		brelse(bp);
1092		BO_LOCK(bo);
1093	}
1094	return (retval);
1095}
1096
1097/*
 * Truncate a file's buffers and pages to a specified length.  This
1099 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1100 * sync activity.
1101 */
1102int
1103vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td, off_t length, int blksize)
1104{
1105	struct buf *bp, *nbp;
1106	int anyfreed;
1107	int trunclbn;
1108	struct bufobj *bo;
1109
1110	CTR2(KTR_VFS, "vtruncbuf vp %p length %jd", vp, length);
1111	/*
1112	 * Round up to the *next* lbn.
1113	 */
1114	trunclbn = (length + blksize - 1) / blksize;
1115
1116	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1117restart:
1118	VI_LOCK(vp);
1119	bo = &vp->v_bufobj;
1120	anyfreed = 1;
1121	for (;anyfreed;) {
1122		anyfreed = 0;
1123		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
1124			if (bp->b_lblkno < trunclbn)
1125				continue;
1126			if (BUF_LOCK(bp,
1127			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1128			    VI_MTX(vp)) == ENOLCK)
1129				goto restart;
1130
1131			bremfree(bp);
1132			bp->b_flags |= (B_INVAL | B_RELBUF);
1133			bp->b_flags &= ~B_ASYNC;
1134			brelse(bp);
1135			anyfreed = 1;
1136
1137			if (nbp != NULL &&
1138			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1139			    (nbp->b_vp != vp) ||
1140			    (nbp->b_flags & B_DELWRI))) {
1141				goto restart;
1142			}
1143			VI_LOCK(vp);
1144		}
1145
1146		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1147			if (bp->b_lblkno < trunclbn)
1148				continue;
1149			if (BUF_LOCK(bp,
1150			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1151			    VI_MTX(vp)) == ENOLCK)
1152				goto restart;
1153			bremfree(bp);
1154			bp->b_flags |= (B_INVAL | B_RELBUF);
1155			bp->b_flags &= ~B_ASYNC;
1156			brelse(bp);
1157			anyfreed = 1;
1158			if (nbp != NULL &&
1159			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1160			    (nbp->b_vp != vp) ||
1161			    (nbp->b_flags & B_DELWRI) == 0)) {
1162				goto restart;
1163			}
1164			VI_LOCK(vp);
1165		}
1166	}
1167
1168	if (length > 0) {
1169restartsync:
1170		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1171			if (bp->b_lblkno > 0)
1172				continue;
1173			/*
1174			 * Since we hold the vnode lock this should only
1175			 * fail if we're racing with the buf daemon.
1176			 */
1177			if (BUF_LOCK(bp,
1178			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1179			    VI_MTX(vp)) == ENOLCK) {
1180				goto restart;
1181			}
1182			VNASSERT((bp->b_flags & B_DELWRI), vp,
1183			    ("buf(%p) on dirty queue without DELWRI", bp));
1184
1185			bremfree(bp);
1186			bawrite(bp);
1187			VI_LOCK(vp);
1188			goto restartsync;
1189		}
1190	}
1191
1192	bufobj_wwait(bo, 0, 0);
1193	VI_UNLOCK(vp);
1194	vnode_pager_setsize(vp, length);
1195
1196	return (0);
1197}
1198
1199/*
1200 * buf_splay() - splay tree core for the clean/dirty list of buffers in
1201 * 		 a vnode.
1202 *
1203 *	NOTE: We have to deal with the special case of a background bitmap
1204 *	buffer, a situation where two buffers will have the same logical
1205 *	block offset.  We want (1) only the foreground buffer to be accessed
1206 *	in a lookup and (2) must differentiate between the foreground and
1207 *	background buffer in the splay tree algorithm because the splay
1208 *	tree cannot normally handle multiple entities with the same 'index'.
1209 *	We accomplish this by adding differentiating flags to the splay tree's
1210 *	numerical domain.
1211 */
1212static
1213struct buf *
1214buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
1215{
1216	struct buf dummy;
1217	struct buf *lefttreemax, *righttreemin, *y;
1218
1219	if (root == NULL)
1220		return (NULL);
1221	lefttreemax = righttreemin = &dummy;
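	/*
	 * Top-down splay: descend toward lblkno, linking subtrees that
	 * compare greater onto righttreemin and subtrees that compare
	 * less onto lefttreemax, then reassemble around the last node
	 * visited so it becomes the new root.
	 */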
1222	for (;;) {
1223		if (lblkno < root->b_lblkno ||
1224		    (lblkno == root->b_lblkno &&
1225		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1226			if ((y = root->b_left) == NULL)
1227				break;
1228			if (lblkno < y->b_lblkno) {
1229				/* Rotate right. */
1230				root->b_left = y->b_right;
1231				y->b_right = root;
1232				root = y;
1233				if ((y = root->b_left) == NULL)
1234					break;
1235			}
1236			/* Link into the new root's right tree. */
1237			righttreemin->b_left = root;
1238			righttreemin = root;
1239		} else if (lblkno > root->b_lblkno ||
1240		    (lblkno == root->b_lblkno &&
1241		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
1242			if ((y = root->b_right) == NULL)
1243				break;
1244			if (lblkno > y->b_lblkno) {
1245				/* Rotate left. */
1246				root->b_right = y->b_left;
1247				y->b_left = root;
1248				root = y;
1249				if ((y = root->b_right) == NULL)
1250					break;
1251			}
1252			/* Link into the new root's left tree. */
1253			lefttreemax->b_right = root;
1254			lefttreemax = root;
1255		} else {
1256			break;
1257		}
1258		root = y;
1259	}
1260	/* Assemble the new root. */
1261	lefttreemax->b_right = root->b_left;
1262	righttreemin->b_left = root->b_right;
1263	root->b_left = dummy.b_right;
1264	root->b_right = dummy.b_left;
1265	return (root);
1266}
1267
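/*
 * Remove a buffer from its bufobj's clean or dirty list and splay tree,
 * clearing the BX_VNDIRTY/BX_VNCLEAN flag that marked its membership.
 */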
1268static void
1269buf_vlist_remove(struct buf *bp)
1270{
1271	struct buf *root;
1272	struct bufv *bv;
1273
1274	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1275	ASSERT_BO_LOCKED(bp->b_bufobj);
1276	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
1277	    (BX_VNDIRTY|BX_VNCLEAN),
1278	    ("buf_vlist_remove: Buf %p is on two lists", bp));
1279	if (bp->b_xflags & BX_VNDIRTY)
1280		bv = &bp->b_bufobj->bo_dirty;
1281	else
1282		bv = &bp->b_bufobj->bo_clean;
1283	if (bp != bv->bv_root) {
1284		root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
1285		KASSERT(root == bp, ("splay lookup failed in remove"));
1286	}
1287	if (bp->b_left == NULL) {
1288		root = bp->b_right;
1289	} else {
1290		root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
1291		root->b_right = bp->b_right;
1292	}
1293	bv->bv_root = root;
1294	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
1295	bv->bv_cnt--;
1296	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1297}
1298
1299/*
1300 * Add the buffer to the sorted clean or dirty block list using a
1301 * splay tree algorithm.
1302 *
1303 * NOTE: xflags is passed as a constant, optimizing this inline function!
1304 */
1305static void
1306buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
1307{
1308	struct buf *root;
1309	struct bufv *bv;
1310
1311	ASSERT_BO_LOCKED(bo);
1312	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1313	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
1314	bp->b_xflags |= xflags;
1315	if (xflags & BX_VNDIRTY)
1316		bv = &bo->bo_dirty;
1317	else
1318		bv = &bo->bo_clean;
1319
1320	root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
1321	if (root == NULL) {
1322		bp->b_left = NULL;
1323		bp->b_right = NULL;
1324		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
1325	} else if (bp->b_lblkno < root->b_lblkno ||
1326	    (bp->b_lblkno == root->b_lblkno &&
1327	    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1328		bp->b_left = root->b_left;
1329		bp->b_right = root;
1330		root->b_left = NULL;
1331		TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
1332	} else {
1333		bp->b_right = root->b_right;
1334		bp->b_left = root;
1335		root->b_right = NULL;
1336		TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs);
1337	}
1338	bv->bv_cnt++;
1339	bv->bv_root = bp;
1340}
1341
1342/*
1343 * Lookup a buffer using the splay tree.  Note that we specifically avoid
1344 * shadow buffers used in background bitmap writes.
1345 *
 * This code isn't quite as efficient as it could be because we are maintaining
1347 * two sorted lists and do not know which list the block resides in.
1348 *
1349 * During a "make buildworld" the desired buffer is found at one of
1350 * the roots more than 60% of the time.  Thus, checking both roots
1351 * before performing either splay eliminates unnecessary splays on the
1352 * first tree splayed.
1353 */
1354struct buf *
1355gbincore(struct bufobj *bo, daddr_t lblkno)
1356{
1357	struct buf *bp;
1358
1359	ASSERT_BO_LOCKED(bo);
1360	if ((bp = bo->bo_clean.bv_root) != NULL &&
1361	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1362		return (bp);
1363	if ((bp = bo->bo_dirty.bv_root) != NULL &&
1364	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1365		return (bp);
1366	if ((bp = bo->bo_clean.bv_root) != NULL) {
1367		bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp);
1368		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1369			return (bp);
1370	}
1371	if ((bp = bo->bo_dirty.bv_root) != NULL) {
1372		bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp);
1373		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1374			return (bp);
1375	}
1376	return (NULL);
1377}
1378
1379/*
1380 * Associate a buffer with a vnode.
1381 */
1382void
1383bgetvp(struct vnode *vp, struct buf *bp)
1384{
1385
1386	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
1387
1388	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
1389	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
1390	    ("bgetvp: bp already attached! %p", bp));
1391
1392	ASSERT_VI_LOCKED(vp, "bgetvp");
1393	vholdl(vp);
1394	bp->b_vp = vp;
1395	bp->b_bufobj = &vp->v_bufobj;
1396	/*
1397	 * Insert onto list for new vnode.
1398	 */
1399	buf_vlist_add(bp, &vp->v_bufobj, BX_VNCLEAN);
1400}
1401
1402/*
1403 * Disassociate a buffer from a vnode.
1404 */
1405void
1406brelvp(struct buf *bp)
1407{
1408	struct bufobj *bo;
1409	struct vnode *vp;
1410
1411	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1412	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1413
1414	/*
1415	 * Delete from old vnode list, if on one.
1416	 */
1417	vp = bp->b_vp;		/* XXX */
1418	bo = bp->b_bufobj;
1419	BO_LOCK(bo);
1420	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1421		buf_vlist_remove(bp);
1422	else
1423		panic("brelvp: Buffer %p not on queue.", bp);
1424	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1425		bo->bo_flag &= ~BO_ONWORKLST;
1426		mtx_lock(&sync_mtx);
1427		LIST_REMOVE(bo, bo_synclist);
1428 		syncer_worklist_len--;
1429		mtx_unlock(&sync_mtx);
1430	}
1431	bp->b_vp = NULL;
1432	bp->b_bufobj = NULL;
1433	vdropl(vp);
1434}
1435
1436/*
1437 * Add an item to the syncer work queue.
1438 */
1439static void
1440vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
1441{
1442	int slot;
1443
1444	ASSERT_BO_LOCKED(bo);
1445
1446	mtx_lock(&sync_mtx);
1447	if (bo->bo_flag & BO_ONWORKLST)
1448		LIST_REMOVE(bo, bo_synclist);
1449	else {
1450		bo->bo_flag |= BO_ONWORKLST;
1451 		syncer_worklist_len++;
1452	}
1453
1454	if (delay > syncer_maxdelay - 2)
1455		delay = syncer_maxdelay - 2;
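	/*
	 * Hash the request "delay" seconds ahead of the current slot;
	 * syncer_mask wraps the index within the power-of-two sized
	 * worklist array allocated by hashinit().
	 */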
1456	slot = (syncer_delayno + delay) & syncer_mask;
1457
1458	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
1459	mtx_unlock(&sync_mtx);
1460}
1461
1462static int
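/*
 * Sysctl handler that reports the number of vnodes on the worklist,
 * excluding the per-mount syncer vnodes themselves.
 */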
1463sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
1464{
1465	int error, len;
1466
1467	mtx_lock(&sync_mtx);
1468	len = syncer_worklist_len - sync_vnode_count;
1469	mtx_unlock(&sync_mtx);
1470	error = SYSCTL_OUT(req, &len, sizeof(len));
1471	return (error);
1472}
1473
1474SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
1475    sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
1476
1477struct  proc *updateproc;
1478static void sched_sync(void);
1479static struct kproc_desc up_kp = {
1480	"syncer",
1481	sched_sync,
1482	&updateproc
1483};
1484SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
1485
1486static int
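/*
 * Sync one vnode from the syncer worklist.  Returns 0 if the vnode was
 * synced and 1 if it was busy and should be moved to a later slot.
 * Called and returns with sync_mtx held; the lock is dropped while the
 * vnode is being synced.
 */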
1487sync_vnode(struct bufobj *bo, struct thread *td)
1488{
1489	struct vnode *vp;
1490	struct mount *mp;
1491
1492	vp = bo->__bo_vnode; 	/* XXX */
1493	if (VOP_ISLOCKED(vp, NULL) != 0)
1494		return (1);
1495	if (VI_TRYLOCK(vp) == 0)
1496		return (1);
1497	/*
1498	 * We use vhold in case the vnode does not
1499	 * successfully sync.  vhold prevents the vnode from
1500	 * going away when we unlock the sync_mtx so that
1501	 * we can acquire the vnode interlock.
1502	 */
1503	vholdl(vp);
1504	mtx_unlock(&sync_mtx);
1505	VI_UNLOCK(vp);
1506	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1507		vdrop(vp);
1508		mtx_lock(&sync_mtx);
1509		return (1);
1510	}
1511	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1512	(void) VOP_FSYNC(vp, MNT_LAZY, td);
1513	VOP_UNLOCK(vp, 0, td);
1514	vn_finished_write(mp);
1515	VI_LOCK(vp);
1516	if ((bo->bo_flag & BO_ONWORKLST) != 0) {
1517		/*
1518		 * Put us back on the worklist.  The worklist
1519		 * routine will remove us from our current
1520		 * position and then add us back in at a later
1521		 * position.
1522		 */
1523		vn_syncer_add_to_worklist(bo, syncdelay);
1524	}
1525	vdropl(vp);
1526	mtx_lock(&sync_mtx);
1527	return (0);
1528}
1529
1530/*
1531 * System filesystem synchronizer daemon.
1532 */
1533static void
1534sched_sync(void)
1535{
1536	struct synclist *next;
1537	struct synclist *slp;
1538	struct bufobj *bo;
1539	long starttime;
1540	struct thread *td = FIRST_THREAD_IN_PROC(updateproc);
1541	static int dummychan;
1542	int last_work_seen;
1543	int net_worklist_len;
1544	int syncer_final_iter;
1545	int first_printf;
1546	int error;
1547
1548	mtx_lock(&Giant);
1549	last_work_seen = 0;
1550	syncer_final_iter = 0;
1551	first_printf = 1;
1552	syncer_state = SYNCER_RUNNING;
1553	starttime = time_second;
1554
1555	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
1556	    SHUTDOWN_PRI_LAST);
1557
1558	for (;;) {
1559		mtx_lock(&sync_mtx);
1560		if (syncer_state == SYNCER_FINAL_DELAY &&
1561		    syncer_final_iter == 0) {
1562			mtx_unlock(&sync_mtx);
1563			kthread_suspend_check(td->td_proc);
1564			mtx_lock(&sync_mtx);
1565		}
1566		net_worklist_len = syncer_worklist_len - sync_vnode_count;
1567		if (syncer_state != SYNCER_RUNNING &&
1568		    starttime != time_second) {
1569			if (first_printf) {
1570				printf("\nSyncing disks, vnodes remaining...");
1571				first_printf = 0;
1572			}
1573			printf("%d ", net_worklist_len);
1574		}
1575		starttime = time_second;
1576
1577		/*
1578		 * Push files whose dirty time has expired.  Be careful
1579		 * of interrupt race on slp queue.
1580		 *
1581		 * Skip over empty worklist slots when shutting down.
1582		 */
1583		do {
1584			slp = &syncer_workitem_pending[syncer_delayno];
1585			syncer_delayno += 1;
1586			if (syncer_delayno == syncer_maxdelay)
1587				syncer_delayno = 0;
1588			next = &syncer_workitem_pending[syncer_delayno];
1589			/*
			 * If the worklist has wrapped since the time
			 * it was emptied of all but syncer vnodes,
1592			 * switch to the FINAL_DELAY state and run
1593			 * for one more second.
1594			 */
1595			if (syncer_state == SYNCER_SHUTTING_DOWN &&
1596			    net_worklist_len == 0 &&
1597			    last_work_seen == syncer_delayno) {
1598				syncer_state = SYNCER_FINAL_DELAY;
1599				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
1600			}
1601		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
1602		    syncer_worklist_len > 0);
1603
1604		/*
1605		 * Keep track of the last time there was anything
1606		 * on the worklist other than syncer vnodes.
1607		 * Return to the SHUTTING_DOWN state if any
1608		 * new work appears.
1609		 */
1610		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
1611			last_work_seen = syncer_delayno;
1612		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
1613			syncer_state = SYNCER_SHUTTING_DOWN;
1614		while ((bo = LIST_FIRST(slp)) != NULL) {
1615			error = sync_vnode(bo, td);
1616			if (error == 1) {
1617				LIST_REMOVE(bo, bo_synclist);
1618				LIST_INSERT_HEAD(next, bo, bo_synclist);
1619				continue;
1620			}
1621		}
1622		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
1623			syncer_final_iter--;
1624		mtx_unlock(&sync_mtx);
1625
1626		/*
1627		 * Do soft update processing.
1628		 */
1629		if (softdep_process_worklist_hook != NULL)
1630			(*softdep_process_worklist_hook)(NULL);
1631
1632		/*
1633		 * The variable rushjob allows the kernel to speed up the
1634		 * processing of the filesystem syncer process. A rushjob
1635		 * value of N tells the filesystem syncer to process the next
1636		 * N seconds worth of work on its queue ASAP. Currently rushjob
1637		 * is used by the soft update code to speed up the filesystem
1638		 * syncer process when the incore state is getting so far
1639		 * ahead of the disk that the kernel memory pool is being
1640		 * threatened with exhaustion.
1641		 */
1642		mtx_lock(&sync_mtx);
1643		if (rushjob > 0) {
1644			rushjob -= 1;
1645			mtx_unlock(&sync_mtx);
1646			continue;
1647		}
1648		mtx_unlock(&sync_mtx);
1649		/*
1650		 * Just sleep for a short period of time between
1651		 * iterations when shutting down, to allow some I/O
1652		 * to happen.
1653		 *
1654		 * If it has taken us less than a second to process the
1655		 * current work, then wait. Otherwise start right over
1656		 * again. We can still lose time if any single round
1657		 * takes more than two seconds, but it does not really
1658		 * matter as we are just trying to generally pace the
1659		 * filesystem activity.
1660		 */
1661		if (syncer_state != SYNCER_RUNNING)
1662			tsleep(&dummychan, PPAUSE, "syncfnl",
1663			    hz / SYNCER_SHUTDOWN_SPEEDUP);
1664		else if (time_second == starttime)
1665			tsleep(&lbolt, PPAUSE, "syncer", 0);
1666	}
1667}
1668
1669/*
1670 * Request the syncer daemon to speed up its work.
1671 * We never push it to speed up more than half of its
1672 * normal turn time; otherwise it could take over the CPU.
1673 */
1674int
1675speedup_syncer()
1676{
1677	struct thread *td;
1678	int ret = 0;
1679
1680	td = FIRST_THREAD_IN_PROC(updateproc);
1681	sleepq_remove(td, &lbolt);
1682	mtx_lock(&sync_mtx);
1683	if (rushjob < syncdelay / 2) {
1684		rushjob += 1;
1685		stat_rush_requests += 1;
1686		ret = 1;
1687	}
1688	mtx_unlock(&sync_mtx);
1689	return (ret);
1690}
1691
1692/*
1693 * Tell the syncer to speed up its work and run through its work
1694 * list several times, then tell it to shut down.
1695 */
1696static void
1697syncer_shutdown(void *arg, int howto)
1698{
1699	struct thread *td;
1700
1701	if (howto & RB_NOSYNC)
1702		return;
1703	td = FIRST_THREAD_IN_PROC(updateproc);
1704	sleepq_remove(td, &lbolt);
1705	mtx_lock(&sync_mtx);
1706	syncer_state = SYNCER_SHUTTING_DOWN;
1707	rushjob = 0;
1708	mtx_unlock(&sync_mtx);
1709	kproc_shutdown(arg, howto);
1710}
1711
1712/*
1713 * Reassign a buffer from one vnode to another.
1714 * Used to assign file specific control information
1715 * (indirect blocks) to the vnode to which they belong.
1716 */
1717void
1718reassignbuf(struct buf *bp)
1719{
1720	struct vnode *vp;
1721	struct bufobj *bo;
1722	int delay;
1723#ifdef INVARIANTS
1724	struct bufv *bv;
1725#endif
1726
1727	vp = bp->b_vp;
1728	bo = bp->b_bufobj;
1729	++reassignbufcalls;
1730
1731	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
1732	    bp, bp->b_vp, bp->b_flags);
1733	/*
1734	 * B_PAGING flagged buffers cannot be reassigned because their vp
1735	 * is not fully linked in.
1736	 */
1737	if (bp->b_flags & B_PAGING)
1738		panic("cannot reassign paging buffer");
1739
1740	/*
1741	 * Delete from old vnode list, if on one.
1742	 */
1743	VI_LOCK(vp);
1744	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1745		buf_vlist_remove(bp);
1746	else
1747		panic("reassignbuf: Buffer %p not on queue.", bp);
1748	/*
1749	 * If dirty, put on list of dirty buffers; otherwise insert onto list
1750	 * of clean buffers.
1751	 */
1752	if (bp->b_flags & B_DELWRI) {
1753		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
1754			switch (vp->v_type) {
1755			case VDIR:
1756				delay = dirdelay;
1757				break;
1758			case VCHR:
1759				delay = metadelay;
1760				break;
1761			default:
1762				delay = filedelay;
1763			}
1764			vn_syncer_add_to_worklist(bo, delay);
1765		}
1766		buf_vlist_add(bp, bo, BX_VNDIRTY);
1767	} else {
1768		buf_vlist_add(bp, bo, BX_VNCLEAN);
1769
1770		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1771			mtx_lock(&sync_mtx);
1772			LIST_REMOVE(bo, bo_synclist);
1773 			syncer_worklist_len--;
1774			mtx_unlock(&sync_mtx);
1775			bo->bo_flag &= ~BO_ONWORKLST;
1776		}
1777	}
1778#ifdef INVARIANTS
1779	bv = &bo->bo_clean;
1780	bp = TAILQ_FIRST(&bv->bv_hd);
1781	KASSERT(bp == NULL || bp->b_bufobj == bo,
1782	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
1783	bp = TAILQ_LAST(&bv->bv_hd, buflists);
1784	KASSERT(bp == NULL || bp->b_bufobj == bo,
1785	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
1786	bv = &bo->bo_dirty;
1787	bp = TAILQ_FIRST(&bv->bv_hd);
1788	KASSERT(bp == NULL || bp->b_bufobj == bo,
1789	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
1790	bp = TAILQ_LAST(&bv->bv_hd, buflists);
1791	KASSERT(bp == NULL || bp->b_bufobj == bo,
1792	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
1793#endif
1794	VI_UNLOCK(vp);
1795}
1796
1797/*
1798 * Increment the use and hold counts on the vnode, taking care to reference
1799 * the driver's usecount if this is a chardev.  The vholdl() will remove
1800 * the vnode from the free list if it is presently free.  Requires the
1801 * vnode interlock and returns with it held.
1802 */
1803static void
1804v_incr_usecount(struct vnode *vp)
1805{
1806
1807	CTR3(KTR_VFS, "v_incr_usecount: vp %p holdcnt %d usecount %d\n",
1808	    vp, vp->v_holdcnt, vp->v_usecount);
1809	vp->v_usecount++;
1810	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
1811		dev_lock();
1812		vp->v_rdev->si_usecount++;
1813		dev_unlock();
1814	}
1815	vholdl(vp);
1816}
1817
1818/*
1819 * Decrement the vnode use and hold count along with the driver's usecount
1820 * if this is a chardev.  The vdropl() below releases the vnode interlock
1821 * as it may free the vnode.
1822 */
1823static void
1824v_decr_usecount(struct vnode *vp)
1825{
1826
1827	CTR3(KTR_VFS, "v_decr_usecount: vp %p holdcnt %d usecount %d\n",
1828	    vp, vp->v_holdcnt, vp->v_usecount);
1829	ASSERT_VI_LOCKED(vp, __FUNCTION__);
1830	VNASSERT(vp->v_usecount > 0, vp,
1831	    ("v_decr_usecount: negative usecount"));
1832	vp->v_usecount--;
1833	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
1834		dev_lock();
1835		vp->v_rdev->si_usecount--;
1836		dev_unlock();
1837	}
1838	vdropl(vp);
1839}
1840
1841/*
1842 * Decrement only the use count and driver use count.  This is intended to
1843 * be paired with a follow-on vdropl() to release the remaining hold count.
1844 * In this way we may vgone() a vnode with a 0 usecount without risk of
1845 * having it end up on a free list because the hold count is kept above 0.
1846 */
1847static void
1848v_decr_useonly(struct vnode *vp)
1849{
1850
1851	CTR3(KTR_VFS, "v_decr_useonly: vp %p holdcnt %d usecount %d\n",
1852	    vp, vp->v_holdcnt, vp->v_usecount);
1853	ASSERT_VI_LOCKED(vp, __FUNCTION__);
1854	VNASSERT(vp->v_usecount > 0, vp,
1855	    ("v_decr_useonly: negative usecount"));
1856	vp->v_usecount--;
1857	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
1858		dev_lock();
1859		vp->v_rdev->si_usecount--;
1860		dev_unlock();
1861	}
1862}
1863
1864/*
1865 * Grab a particular vnode from the free list, increment its
1866 * reference count and lock it. The vnode lock bit is set if the
1867 * vnode is being eliminated in vgone. The process is awakened
1868 * when the transition is completed, and an error returned to
1869 * indicate that the vnode is no longer usable (possibly having
1870 * been changed to a new filesystem type).
1871 */
1872int
1873vget(vp, flags, td)
1874	struct vnode *vp;
1875	int flags;
1876	struct thread *td;
1877{
1878	int oweinact;
1879	int oldflags;
1880	int error;
1881
1882	error = 0;
1883	oldflags = flags;
1884	oweinact = 0;
1885	if ((flags & LK_INTERLOCK) == 0)
1886		VI_LOCK(vp);
1887	/*
1888	 * If the inactive call was deferred because vput() was called
1889	 * with a shared lock, we have to do it here before another thread
1890	 * gets a reference to data that should be dead.
1891	 */
1892	if (vp->v_iflag & VI_OWEINACT) {
1893		if (flags & LK_NOWAIT) {
1894			VI_UNLOCK(vp);
1895			return (EBUSY);
1896		}
1897		flags &= ~LK_TYPE_MASK;
1898		flags |= LK_EXCLUSIVE;
1899		oweinact = 1;
1900	}
1901	v_incr_usecount(vp);
1902	if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) {
1903		VI_LOCK(vp);
1904		/*
1905		 * must expand vrele here because we do not want
1906		 * to call VOP_INACTIVE if the reference count
1907		 * drops back to zero since it was never really
1908		 * active.
1909		 */
1910		v_decr_usecount(vp);
1911		return (error);
1912	}
1913	if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
1914		panic("vget: vn_lock failed to return ENOENT\n");
1915	if (oweinact) {
1916		VI_LOCK(vp);
1917		if (vp->v_iflag & VI_OWEINACT)
1918			vinactive(vp, td);
1919		VI_UNLOCK(vp);
1920		if ((oldflags & LK_TYPE_MASK) == 0)
1921			VOP_UNLOCK(vp, 0, td);
1922	}
1923	return (0);
1924}
1925
1926/*
1927 * Increase the reference count of a vnode.
1928 */
1929void
1930vref(struct vnode *vp)
1931{
1932
1933	VI_LOCK(vp);
1934	v_incr_usecount(vp);
1935	VI_UNLOCK(vp);
1936}
1937
1938/*
1939 * Return reference count of a vnode.
1940 *
1941 * The results of this call are only guaranteed when some mechanism other
1942 * than the VI lock is used to stop other processes from gaining references
1943 * to the vnode.  This may be the case if the caller holds the only reference.
1944 * This is also useful when stale data is acceptable as race conditions may
1945 * be accounted for by some other means.
1946 */
1947int
1948vrefcnt(struct vnode *vp)
1949{
1950	int usecnt;
1951
1952	VI_LOCK(vp);
1953	usecnt = vp->v_usecount;
1954	VI_UNLOCK(vp);
1955
1956	return (usecnt);
1957}
1958
1959
1960/*
1961 * Vnode put/release.
1962 * If count drops to zero, call inactive routine and return to freelist.
1963 */
1964void
1965vrele(vp)
1966	struct vnode *vp;
1967{
1968	struct thread *td = curthread;	/* XXX */
1969
1970	KASSERT(vp != NULL, ("vrele: null vp"));
1971
1972	VI_LOCK(vp);
1973
1974	/* Skip this v_writecount check if we're going to panic below. */
1975	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
1976	    ("vrele: missed vn_close"));
1977
1978	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
1979	    vp->v_usecount == 1)) {
1980		v_decr_usecount(vp);
1981		return;
1982	}
1983	if (vp->v_usecount != 1) {
1984#ifdef DIAGNOSTIC
1985		vprint("vrele: negative ref count", vp);
1986#endif
1987		VI_UNLOCK(vp);
1988		panic("vrele: negative ref cnt");
1989	}
1990	/*
1991	 * We want to hold the vnode until the inactive finishes to
1992	 * prevent vgone() races.  We drop the use count here and the
1993	 * hold count below when we're done.
1994	 */
1995	v_decr_useonly(vp);
1996	/*
1997	 * We must call VOP_INACTIVE with the node locked. Mark
1998	 * as VI_DOINGINACT to avoid recursion.
1999	 */
2000	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) {
2001		VI_LOCK(vp);
2002		vinactive(vp, td);
2003		VOP_UNLOCK(vp, 0, td);
2004	} else
2005		VI_LOCK(vp);
2006	vdropl(vp);
2007}
2008
2009/*
2010 * Release an already locked vnode.  This gives the same effect as
2011 * unlock+vrele(), but takes less time and avoids releasing and
2012 * re-acquiring the lock (as vrele() acquires the lock internally).
2013 */
2014void
2015vput(vp)
2016	struct vnode *vp;
2017{
2018	struct thread *td = curthread;	/* XXX */
2019	int error;
2020
2021	KASSERT(vp != NULL, ("vput: null vp"));
2022	ASSERT_VOP_LOCKED(vp, "vput");
2023	VI_LOCK(vp);
2024	/* Skip this v_writecount check if we're going to panic below. */
2025	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
2026	    ("vput: missed vn_close"));
2027	error = 0;
2028
2029	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2030	    vp->v_usecount == 1)) {
2031		VOP_UNLOCK(vp, 0, td);
2032		v_decr_usecount(vp);
2033		return;
2034	}
2035
2036	if (vp->v_usecount != 1) {
2037#ifdef DIAGNOSTIC
2038		vprint("vput: negative ref count", vp);
2039#endif
2040		panic("vput: negative ref cnt");
2041	}
2042	/*
2043	 * We want to hold the vnode until the inactive finishes to
2044	 * prevent vgone() races.  We drop the use count here and the
2045	 * hold count below when we're done.
2046	 */
2047	v_decr_useonly(vp);
2048	vp->v_iflag |= VI_OWEINACT;
2049	if (VOP_ISLOCKED(vp, NULL) != LK_EXCLUSIVE) {
2050		error = VOP_LOCK(vp, LK_EXCLUPGRADE|LK_INTERLOCK|LK_NOWAIT, td);
2051		VI_LOCK(vp);
2052		if (error)
2053			goto done;
2054	}
2055	if (vp->v_iflag & VI_OWEINACT)
2056		vinactive(vp, td);
2057	VOP_UNLOCK(vp, 0, td);
2058done:
2059	vdropl(vp);
2060}
2061
2062/*
2063 * Somebody doesn't want the vnode recycled.
2064 */
2065void
2066vhold(struct vnode *vp)
2067{
2068
2069	VI_LOCK(vp);
2070	vholdl(vp);
2071	VI_UNLOCK(vp);
2072}
2073
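/*
 * Increment the hold count with the vnode interlock already held, taking
 * the vnode off the free list via vbusy() if it should no longer be there.
 */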
2074void
2075vholdl(struct vnode *vp)
2076{
2077
2078	vp->v_holdcnt++;
2079	if (VSHOULDBUSY(vp))
2080		vbusy(vp);
2081}
2082
2083/*
2084 * Note that there is one fewer holder who cares about this vnode.  vdrop()
2085 * is the opposite of vhold().
2086 */
2087void
2088vdrop(struct vnode *vp)
2089{
2090
2091	VI_LOCK(vp);
2092	vdropl(vp);
2093}
2094
2095/*
2096 * Drop the hold count of the vnode.  If this is the last hold on the
2097 * vnode, we free it if it has been vgone'd; otherwise it is placed on
2098 * the free list.
2099 */
2100static void
2101vdropl(struct vnode *vp)
2102{
2103
2104	if (vp->v_holdcnt <= 0)
2105		panic("vdrop: holdcnt %d", vp->v_holdcnt);
2106	vp->v_holdcnt--;
2107	if (vp->v_holdcnt == 0) {
2108		if (vp->v_iflag & VI_DOOMED) {
2109			vdestroy(vp);
2110			return;
2111		} else
2112			vfree(vp);
2113	}
2114	VI_UNLOCK(vp);
2115}
2116
2117/*
2118 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
2119 * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
2120 * OWEINACT tracks whether a vnode missed a call to inactive due to a
2121 * failed lock upgrade.
2122 */
2123static void
2124vinactive(struct vnode *vp, struct thread *td)
2125{
2126
2127	ASSERT_VOP_LOCKED(vp, "vinactive");
2128	ASSERT_VI_LOCKED(vp, "vinactive");
2129	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
2130	    ("vinactive: recursed on VI_DOINGINACT"));
2131	vp->v_iflag |= VI_DOINGINACT;
2132	vp->v_iflag &= ~VI_OWEINACT;
2133	VI_UNLOCK(vp);
2134	VOP_INACTIVE(vp, td);
2135	VI_LOCK(vp);
2136	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
2137	    ("vinactive: lost VI_DOINGINACT"));
2138	vp->v_iflag &= ~VI_DOINGINACT;
2139}
2140
2141/*
2142 * Remove any vnodes in the vnode table belonging to mount point mp.
2143 *
2144 * If FORCECLOSE is not specified, there should not be any active ones,
2145 * return error if any are found (nb: this is a user error, not a
2146 * system error). If FORCECLOSE is specified, detach any active vnodes
2147 * that are found.
2148 *
2149 * If WRITECLOSE is set, only flush out regular file vnodes open for
2150 * writing.
2151 *
2152 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2153 *
2154 * `rootrefs' specifies the base reference count for the root vnode
2155 * of this filesystem. The root vnode is considered busy if its
2156 * v_usecount exceeds this value. On a successful return, vflush()
2157 * will call vrele() on the root vnode exactly rootrefs times.
2158 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2159 * be zero.
2160 */
2161#ifdef DIAGNOSTIC
2162static int busyprt = 0;		/* print out busy vnodes */
2163SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
2164#endif
2165
2166int
2167vflush(mp, rootrefs, flags, td)
2168	struct mount *mp;
2169	int rootrefs;
2170	int flags;
2171	struct thread *td;
2172{
2173	struct vnode *vp, *nvp, *rootvp = NULL;
2174	struct vattr vattr;
2175	int busy = 0, error;
2176
2177	CTR1(KTR_VFS, "vflush: mp %p", mp);
2178	if (rootrefs > 0) {
2179		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2180		    ("vflush: bad args"));
2181		/*
2182		 * Get the filesystem root vnode. We can vput() it
2183		 * immediately, since with rootrefs > 0, it won't go away.
2184		 */
2185		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp, td)) != 0)
2186			return (error);
2187		vput(rootvp);
2188
2189	}
2190	MNT_ILOCK(mp);
2191loop:
2192	MNT_VNODE_FOREACH(vp, mp, nvp) {
2193
2194		VI_LOCK(vp);
2195		vholdl(vp);
2196		MNT_IUNLOCK(mp);
2197		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td);
2198		if (error) {
2199			vdrop(vp);
2200			MNT_ILOCK(mp);
2201			goto loop;
2202		}
2203		/*
2204		 * Skip over a vnodes marked VV_SYSTEM.
2205		 * Skip over vnodes marked VV_SYSTEM.
2206		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2207			VOP_UNLOCK(vp, 0, td);
2208			vdrop(vp);
2209			MNT_ILOCK(mp);
2210			continue;
2211		}
2212		/*
2213		 * If WRITECLOSE is set, flush out unlinked but still open
2214		 * files (even if open only for reading) and regular file
2215		 * vnodes open for writing.
2216		 */
2217		if (flags & WRITECLOSE) {
2218			error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
2219			VI_LOCK(vp);
2220
2221			if ((vp->v_type == VNON ||
2222			    (error == 0 && vattr.va_nlink > 0)) &&
2223			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
2224				VOP_UNLOCK(vp, 0, td);
2225				vdropl(vp);
2226				MNT_ILOCK(mp);
2227				continue;
2228			}
2229		} else
2230			VI_LOCK(vp);
2231		/*
2232		 * With v_usecount == 0, all we need to do is clear out the
2233		 * vnode data structures and we are done.
2234		 *
2235		 * If FORCECLOSE is set, forcibly close the vnode.
2236		 */
2237		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
2238			VNASSERT(vp->v_usecount == 0 ||
2239			    (vp->v_type != VCHR && vp->v_type != VBLK), vp,
2240			    ("device VNODE %p is FORCECLOSED", vp));
2241			vgonel(vp);
2242		} else {
2243			busy++;
2244#ifdef DIAGNOSTIC
2245			if (busyprt)
2246				vprint("vflush: busy vnode", vp);
2247#endif
2248		}
2249		VOP_UNLOCK(vp, 0, td);
2250		vdropl(vp);
2251		MNT_ILOCK(mp);
2252	}
2253	MNT_IUNLOCK(mp);
2254	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2255		/*
2256		 * If just the root vnode is busy, and if its refcount
2257		 * is equal to `rootrefs', then go ahead and kill it.
2258		 */
2259		VI_LOCK(rootvp);
2260		KASSERT(busy > 0, ("vflush: not busy"));
2261		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
2262		    ("vflush: usecount %d < rootrefs %d",
2263		     rootvp->v_usecount, rootrefs));
2264		if (busy == 1 && rootvp->v_usecount == rootrefs) {
2265			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK, td);
2266			vgone(rootvp);
2267			VOP_UNLOCK(rootvp, 0, td);
2268			busy = 0;
2269		} else
2270			VI_UNLOCK(rootvp);
2271	}
2272	if (busy)
2273		return (EBUSY);
2274	for (; rootrefs > 0; rootrefs--)
2275		vrele(rootvp);
2276	return (0);
2277}
2278
2279/*
2280 * Recycle an unused vnode to the front of the free list.
2281 */
2282int
2283vrecycle(struct vnode *vp, struct thread *td)
2284{
2285	int recycled;
2286
2287	ASSERT_VOP_LOCKED(vp, "vrecycle");
2288	recycled = 0;
2289	VI_LOCK(vp);
2290	if (vp->v_usecount == 0) {
2291		recycled = 1;
2292		vgonel(vp);
2293	}
2294	VI_UNLOCK(vp);
2295	return (recycled);
2296}
2297
2298/*
2299 * Eliminate all activity associated with a vnode
2300 * in preparation for reuse.
2301 */
2302void
2303vgone(struct vnode *vp)
2304{
2305	VI_LOCK(vp);
2306	vgonel(vp);
2307	VI_UNLOCK(vp);
2308}
2309
2310/*
2311 * vgone, with the vp interlock held.
2312 */
2313void
2314vgonel(struct vnode *vp)
2315{
2316	struct thread *td;
2317	int oweinact;
2318	int active;
2319
2320	CTR1(KTR_VFS, "vgonel: vp %p", vp);
2321	ASSERT_VOP_LOCKED(vp, "vgonel");
2322	ASSERT_VI_LOCKED(vp, "vgonel");
2323#if 0
2324	/* XXX Need to fix ttyvp before I enable this. */
2325	VNASSERT(vp->v_holdcnt, vp,
2326	    ("vgonel: vp %p has no reference.", vp));
2327#endif
2328	td = curthread;
2329
2330	/*
2331	 * Don't vgonel if we're already doomed.
2332	 */
2333	if (vp->v_iflag & VI_DOOMED) {
2334		VI_UNLOCK(vp);
2335		return;
2336	}
2337	vp->v_iflag |= VI_DOOMED;
2338	/*
2339	 * Check to see if the vnode is in use.  If so, we have to call
2340	 * VOP_CLOSE() and VOP_INACTIVE().
2341	 */
2342	active = vp->v_usecount;
2343	oweinact = (vp->v_iflag & VI_OWEINACT);
2344	VI_UNLOCK(vp);
2345	/*
2346	 * Clean out any buffers associated with the vnode.
2347	 * If the flush fails, just toss the buffers.
2348	 */
2349	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
2350		(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
2351	if (vinvalbuf(vp, V_SAVE, td, 0, 0) != 0)
2352		vinvalbuf(vp, 0, td, 0, 0);
2353
2354	/*
2355	 * If purging an active vnode, it must be closed and
2356	 * deactivated before being reclaimed.
2357	 */
2358	if (active)
2359		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2360	if (oweinact || active) {
2361		VI_LOCK(vp);
2362		if ((vp->v_iflag & VI_DOINGINACT) == 0)
2363			vinactive(vp, td);
2364		VI_UNLOCK(vp);
2365	}
2366	/*
2367	 * Reclaim the vnode.
2368	 */
2369	if (VOP_RECLAIM(vp, td))
2370		panic("vgone: cannot reclaim");
2371	VNASSERT(vp->v_object == NULL, vp,
2372	    ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
2373	/*
2374	 * Delete from old mount point vnode list.
2375	 */
2376	delmntque(vp);
2377	cache_purge(vp);
2378	/*
2379	 * Done with purge, reset to the standard lock and invalidate
2380	 * the vnode.
2381	 */
2382	VI_LOCK(vp);
2383	vp->v_vnlock = &vp->v_lock;
2384	vp->v_op = &dead_vnodeops;
2385	vp->v_tag = "none";
2386	vp->v_type = VBAD;
2387}
2388
2389/*
2390 * Calculate the total number of references to a special device.
2391 */
2392int
2393vcount(vp)
2394	struct vnode *vp;
2395{
2396	int count;
2397
2398	dev_lock();
2399	count = vp->v_rdev->si_usecount;
2400	dev_unlock();
2401	return (count);
2402}
2403
2404/*
2405 * Same as above, but using the struct cdev *as argument
2406 */
2407int
2408count_dev(dev)
2409	struct cdev *dev;
2410{
2411	int count;
2412
2413	dev_lock();
2414	count = dev->si_usecount;
2415	dev_unlock();
2416	return(count);
2417}
2418
2419/*
2420 * Print out a description of a vnode.
2421 */
2422static char *typename[] =
2423{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
2424
2425void
2426vn_printf(struct vnode *vp, const char *fmt, ...)
2427{
2428	va_list ap;
2429	char buf[96];
2430
2431	va_start(ap, fmt);
2432	vprintf(fmt, ap);
2433	va_end(ap);
2434	printf("%p: ", (void *)vp);
2435	printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
2436	printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
2437	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
2438	buf[0] = '\0';
2439	buf[1] = '\0';
2440	if (vp->v_vflag & VV_ROOT)
2441		strcat(buf, "|VV_ROOT");
2442	if (vp->v_vflag & VV_TEXT)
2443		strcat(buf, "|VV_TEXT");
2444	if (vp->v_vflag & VV_SYSTEM)
2445		strcat(buf, "|VV_SYSTEM");
2446	if (vp->v_iflag & VI_DOOMED)
2447		strcat(buf, "|VI_DOOMED");
2448	if (vp->v_iflag & VI_FREE)
2449		strcat(buf, "|VI_FREE");
2450	printf("    flags (%s)\n", buf + 1);
2451	if (mtx_owned(VI_MTX(vp)))
2452		printf(" VI_LOCKed");
2453	if (vp->v_object != NULL)
2454		printf("    v_object %p ref %d pages %d\n",
2455		    vp->v_object, vp->v_object->ref_count,
2456		    vp->v_object->resident_page_count);
2457	printf("    ");
2458	lockmgr_printinfo(vp->v_vnlock);
2459	printf("\n");
2460	if (vp->v_data != NULL)
2461		VOP_PRINT(vp);
2462}
2463
2464#ifdef DDB
2465#include <ddb/ddb.h>
2466/*
2467 * List all of the locked vnodes in the system.
2468 * Called when debugging the kernel.
2469 */
2470DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
2471{
2472	struct mount *mp, *nmp;
2473	struct vnode *vp;
2474
2475	/*
2476	 * Note: because this is DDB, we can't obey the locking semantics
2477	 * for these structures, which means we could catch an inconsistent
2478	 * state and dereference a nasty pointer.  Not much to be done
2479	 * about that.
2480	 */
2481	printf("Locked vnodes\n");
2482	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2483		nmp = TAILQ_NEXT(mp, mnt_list);
2484		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2485			if (VOP_ISLOCKED(vp, NULL))
2486				vprint("", vp);
2487		}
2489	}
2490}
2491#endif
2492
2493/*
2494 * Fill in a struct xvfsconf based on a struct vfsconf.
2495 */
2496static void
2497vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
2498{
2499
2500	strcpy(xvfsp->vfc_name, vfsp->vfc_name);
2501	xvfsp->vfc_typenum = vfsp->vfc_typenum;
2502	xvfsp->vfc_refcount = vfsp->vfc_refcount;
2503	xvfsp->vfc_flags = vfsp->vfc_flags;
2504	/*
2505	 * These are unused in userland; we keep them
2506	 * so as not to break binary compatibility.
2507	 */
2508	xvfsp->vfc_vfsops = NULL;
2509	xvfsp->vfc_next = NULL;
2510}
2511
2512/*
2513 * Top level filesystem related information gathering.
2514 */
2515static int
2516sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
2517{
2518	struct vfsconf *vfsp;
2519	struct xvfsconf xvfsp;
2520	int error;
2521
2522	error = 0;
2523	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
2524		bzero(&xvfsp, sizeof(xvfsp));
2525		vfsconf2x(vfsp, &xvfsp);
2526		error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp);
2527		if (error)
2528			break;
2529	}
2530	return (error);
2531}
2532
2533SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist,
2534    "S,xvfsconf", "List of all configured filesystems");
2535
2536#ifndef BURN_BRIDGES
2537static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
2538
2539static int
2540vfs_sysctl(SYSCTL_HANDLER_ARGS)
2541{
2542	int *name = (int *)arg1 - 1;	/* XXX */
2543	u_int namelen = arg2 + 1;	/* XXX */
2544	struct vfsconf *vfsp;
2545	struct xvfsconf xvfsp;
2546
2547	printf("WARNING: userland calling deprecated sysctl, "
2548	    "please rebuild world\n");
2549
2550#if 1 || defined(COMPAT_PRELITE2)
2551	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2552	if (namelen == 1)
2553		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2554#endif
2555
2556	switch (name[1]) {
2557	case VFS_MAXTYPENUM:
2558		if (namelen != 2)
2559			return (ENOTDIR);
2560		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2561	case VFS_CONF:
2562		if (namelen != 3)
2563			return (ENOTDIR);	/* overloaded */
2564		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
2565			if (vfsp->vfc_typenum == name[2])
2566				break;
2567		if (vfsp == NULL)
2568			return (EOPNOTSUPP);
2569		bzero(&xvfsp, sizeof(xvfsp));
2570		vfsconf2x(vfsp, &xvfsp);
2571		return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
2572	}
2573	return (EOPNOTSUPP);
2574}
2575
2576static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP,
2577	vfs_sysctl, "Generic filesystem");
2578
2579#if 1 || defined(COMPAT_PRELITE2)
2580
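/*
 * Pre-Lite2 compatible variant of the conflist sysctl: report every
 * configured filesystem using the old struct ovfsconf layout.
 */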
2581static int
2582sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
2583{
2584	int error;
2585	struct vfsconf *vfsp;
2586	struct ovfsconf ovfs;
2587
2588	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
2589		bzero(&ovfs, sizeof(ovfs));
2590		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
2591		strcpy(ovfs.vfc_name, vfsp->vfc_name);
2592		ovfs.vfc_index = vfsp->vfc_typenum;
2593		ovfs.vfc_refcount = vfsp->vfc_refcount;
2594		ovfs.vfc_flags = vfsp->vfc_flags;
2595		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2596		if (error)
2597			return error;
2598	}
2599	return 0;
2600}
2601
2602#endif /* 1 || COMPAT_PRELITE2 */
2603#endif /* !BURN_BRIDGES */
2604
2605#define KINFO_VNODESLOP		10
2606#ifdef notyet
2607/*
2608 * Dump vnode list (via sysctl).
2609 */
2610/* ARGSUSED */
2611static int
2612sysctl_vnode(SYSCTL_HANDLER_ARGS)
2613{
2614	struct xvnode *xvn;
2615	struct thread *td = req->td;
2616	struct mount *mp;
2617	struct vnode *vp;
2618	int error, len, n;
2619
2620	/*
2621	 * Stale numvnodes access is not fatal here.
2622	 */
2623	req->lock = 0;
2624	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
2625	if (!req->oldptr)
2626		/* Make an estimate */
2627		return (SYSCTL_OUT(req, 0, len));
2628
2629	error = sysctl_wire_old_buffer(req, 0);
2630	if (error != 0)
2631		return (error);
2632	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
2633	n = 0;
2634	mtx_lock(&mountlist_mtx);
2635	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2636		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
2637			continue;
2638		MNT_ILOCK(mp);
2639		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2640			if (n == len)
2641				break;
2642			vref(vp);
2643			xvn[n].xv_size = sizeof *xvn;
2644			xvn[n].xv_vnode = vp;
2645			xvn[n].xv_id = 0;	/* XXX compat */
2646#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
2647			XV_COPY(usecount);
2648			XV_COPY(writecount);
2649			XV_COPY(holdcnt);
2650			XV_COPY(mount);
2651			XV_COPY(numoutput);
2652			XV_COPY(type);
2653#undef XV_COPY
2654			xvn[n].xv_flag = vp->v_vflag;
2655
2656			switch (vp->v_type) {
2657			case VREG:
2658			case VDIR:
2659			case VLNK:
2660				break;
2661			case VBLK:
2662			case VCHR:
2663				if (vp->v_rdev == NULL) {
2664					vrele(vp);
2665					continue;
2666				}
2667				xvn[n].xv_dev = dev2udev(vp->v_rdev);
2668				break;
2669			case VSOCK:
2670				xvn[n].xv_socket = vp->v_socket;
2671				break;
2672			case VFIFO:
2673				xvn[n].xv_fifo = vp->v_fifoinfo;
2674				break;
2675			case VNON:
2676			case VBAD:
2677			default:
2678				/* shouldn't happen? */
2679				vrele(vp);
2680				continue;
2681			}
2682			vrele(vp);
2683			++n;
2684		}
2685		MNT_IUNLOCK(mp);
2686		mtx_lock(&mountlist_mtx);
2687		vfs_unbusy(mp, td);
2688		if (n == len)
2689			break;
2690	}
2691	mtx_unlock(&mountlist_mtx);
2692
2693	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
2694	free(xvn, M_TEMP);
2695	return (error);
2696}
2697
2698SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
2699	0, 0, sysctl_vnode, "S,xvnode", "");
2700#endif
2701
2702/*
2703 * Unmount all filesystems. The list is traversed in reverse order
2704 * of mounting to avoid dependencies.
2705 */
2706void
2707vfs_unmountall()
2708{
2709	struct mount *mp;
2710	struct thread *td;
2711	int error;
2712
2713	KASSERT(curthread != NULL, ("vfs_unmountall: NULL curthread"));
2714	td = curthread;
2715	/*
2716	 * Since this only runs when rebooting, it is not interlocked.
2717	 */
2718	while(!TAILQ_EMPTY(&mountlist)) {
2719		mp = TAILQ_LAST(&mountlist, mntlist);
2720		error = dounmount(mp, MNT_FORCE, td);
2721		if (error) {
2722			TAILQ_REMOVE(&mountlist, mp, mnt_list);
2723			printf("unmount of %s failed (",
2724			    mp->mnt_stat.f_mntonname);
2725			if (error == EBUSY)
2726				printf("BUSY)\n");
2727			else
2728				printf("%d)\n", error);
2729		} else {
2730			/* The unmount has removed mp from the mountlist */
2731		}
2732	}
2733}
2734
2735/*
2736 * Perform msync on all vnodes under a mount point.
2737 * The mount point must be locked.
2738 */
2739void
2740vfs_msync(struct mount *mp, int flags)
2741{
2742	struct vnode *vp, *nvp;
2743	struct vm_object *obj;
2744	int tries;
2745
2746	tries = 5;
2747	MNT_ILOCK(mp);
2748loop:
2749	TAILQ_FOREACH_SAFE(vp, &mp->mnt_nvnodelist, v_nmntvnodes, nvp) {
2750		if (vp->v_mount != mp) {
2751			if (--tries > 0)
2752				goto loop;
2753			break;
2754		}
2755
2756		VI_LOCK(vp);
2757		if ((vp->v_iflag & VI_OBJDIRTY) &&
2758		    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
2759			MNT_IUNLOCK(mp);
2760			if (!vget(vp,
2761			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
2762			    curthread)) {
2763				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
2764					vput(vp);
2765					MNT_ILOCK(mp);
2766					continue;
2767				}
2768
2769				obj = vp->v_object;
2770				if (obj != NULL) {
2771					VM_OBJECT_LOCK(obj);
2772					vm_object_page_clean(obj, 0, 0,
2773					    flags == MNT_WAIT ?
2774					    OBJPC_SYNC : OBJPC_NOSYNC);
2775					VM_OBJECT_UNLOCK(obj);
2776				}
2777				vput(vp);
2778			}
2779			MNT_ILOCK(mp);
2780			if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
2781				if (--tries > 0)
2782					goto loop;
2783				break;
2784			}
2785		} else
2786			VI_UNLOCK(vp);
2787	}
2788	MNT_IUNLOCK(mp);
2789}
2790
2791/*
2792 * Mark a vnode as free, putting it up for recycling.
2793 */
2794static void
2795vfree(struct vnode *vp)
2796{
2797
2798	CTR1(KTR_VFS, "vfree vp %p", vp);
2799	ASSERT_VI_LOCKED(vp, "vfree");
2800	mtx_lock(&vnode_free_list_mtx);
2801	VNASSERT(vp->v_op != NULL, vp, ("vfree: vnode already reclaimed."));
2802	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free"));
2803	VNASSERT(VSHOULDFREE(vp), vp, ("vfree: freeing when we shouldn't"));
2804	VNASSERT((vp->v_iflag & VI_DOOMED) == 0, vp,
2805	    ("vfree: Freeing doomed vnode"));
2806	if (vp->v_iflag & VI_AGE) {
2807		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2808	} else {
2809		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
2810	}
2811	freevnodes++;
2812	vp->v_iflag &= ~VI_AGE;
2813	vp->v_iflag |= VI_FREE;
2814	mtx_unlock(&vnode_free_list_mtx);
2815}
2816
2817/*
2818 * Opposite of vfree() - mark a vnode as in use.
2819 */
2820static void
2821vbusy(struct vnode *vp)
2822{
2823	CTR1(KTR_VFS, "vbusy vp %p", vp);
2824	ASSERT_VI_LOCKED(vp, "vbusy");
2825	VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free"));
2826	VNASSERT(vp->v_op != NULL, vp, ("vbusy: vnode already reclaimed."));
2827
2828	mtx_lock(&vnode_free_list_mtx);
2829	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2830	freevnodes--;
2831	vp->v_iflag &= ~(VI_FREE|VI_AGE);
2832	mtx_unlock(&vnode_free_list_mtx);
2833}
2834
2835/*
2836 * Initialize per-vnode helper structure to hold poll-related state.
2837 */
2838void
2839v_addpollinfo(struct vnode *vp)
2840{
2841	struct vpollinfo *vi;
2842
2843	vi = uma_zalloc(vnodepoll_zone, M_WAITOK);
2844	if (vp->v_pollinfo != NULL) {
2845		uma_zfree(vnodepoll_zone, vi);
2846		return;
2847	}
2848	vp->v_pollinfo = vi;
2849	mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
2850	knlist_init(&vp->v_pollinfo->vpi_selinfo.si_note, vp, vfs_knllock,
2851	    vfs_knlunlock, vfs_knllocked);
2852}
2853
2854/*
2855 * Record a process's interest in events which might happen to
2856 * a vnode.  Because poll uses the historic select-style interface
2857 * internally, this routine serves as both the ``check for any
2858 * pending events'' and the ``record my interest in future events''
2859 * functions.  (These are done together, while the lock is held,
2860 * to avoid race conditions.)
2861 */
2862int
2863vn_pollrecord(vp, td, events)
2864	struct vnode *vp;
2865	struct thread *td;
2866	short events;
2867{
2868
2869	if (vp->v_pollinfo == NULL)
2870		v_addpollinfo(vp);
2871	mtx_lock(&vp->v_pollinfo->vpi_lock);
2872	if (vp->v_pollinfo->vpi_revents & events) {
2873		/*
2874		 * This leaves events we are not interested
2875		 * in available for the other process which
2876		 * presumably had requested them
2877		 * (otherwise they would never have been
2878		 * recorded).
2879		 */
2880		events &= vp->v_pollinfo->vpi_revents;
2881		vp->v_pollinfo->vpi_revents &= ~events;
2882
2883		mtx_unlock(&vp->v_pollinfo->vpi_lock);
2884		return events;
2885	}
2886	vp->v_pollinfo->vpi_events |= events;
2887	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
2888	mtx_unlock(&vp->v_pollinfo->vpi_lock);
2889	return 0;
2890}
2891
2892/*
2893 * Routine to create and manage a filesystem syncer vnode.
2894 */
2895#define sync_close ((int (*)(struct  vop_close_args *))nullop)
2896static int	sync_fsync(struct  vop_fsync_args *);
2897static int	sync_inactive(struct  vop_inactive_args *);
2898static int	sync_reclaim(struct  vop_reclaim_args *);
2899
2900static struct vop_vector sync_vnodeops = {
2901	.vop_bypass =	VOP_EOPNOTSUPP,
2902	.vop_close =	sync_close,		/* close */
2903	.vop_fsync =	sync_fsync,		/* fsync */
2904	.vop_inactive =	sync_inactive,	/* inactive */
2905	.vop_reclaim =	sync_reclaim,	/* reclaim */
2906	.vop_lock =	vop_stdlock,	/* lock */
2907	.vop_unlock =	vop_stdunlock,	/* unlock */
2908	.vop_islocked =	vop_stdislocked,	/* islocked */
2909};
2910
2911/*
2912 * Create a new filesystem syncer vnode for the specified mount point.
2913 */
2914int
2915vfs_allocate_syncvnode(mp)
2916	struct mount *mp;
2917{
2918	struct vnode *vp;
2919	static long start, incr, next;
2920	int error;
2921
2922	/* Allocate a new vnode */
2923	if ((error = getnewvnode("syncer", mp, &sync_vnodeops, &vp)) != 0) {
2924		mp->mnt_syncer = NULL;
2925		return (error);
2926	}
2927	vp->v_type = VNON;
2928	/*
2929	 * Place the vnode onto the syncer worklist. We attempt to
2930	 * scatter them about on the list so that they will go off
2931	 * at evenly distributed times even if all the filesystems
2932	 * are mounted at once.
2933	 */
2934	next += incr;
2935	if (next == 0 || next > syncer_maxdelay) {
2936		start /= 2;
2937		incr /= 2;
2938		if (start == 0) {
2939			start = syncer_maxdelay / 2;
2940			incr = syncer_maxdelay;
2941		}
2942		next = start;
2943	}
2944	VI_LOCK(vp);
2945	vn_syncer_add_to_worklist(&vp->v_bufobj,
2946	    syncdelay > 0 ? next % syncdelay : 0);
2947	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
2948	mtx_lock(&sync_mtx);
2949	sync_vnode_count++;
2950	mtx_unlock(&sync_mtx);
2951	VI_UNLOCK(vp);
2952	mp->mnt_syncer = vp;
2953	return (0);
2954}
2955
2956/*
2957 * Do a lazy sync of the filesystem.
2958 */
2959static int
2960sync_fsync(ap)
2961	struct vop_fsync_args /* {
2962		struct vnode *a_vp;
2963		struct ucred *a_cred;
2964		int a_waitfor;
2965		struct thread *a_td;
2966	} */ *ap;
2967{
2968	struct vnode *syncvp = ap->a_vp;
2969	struct mount *mp = syncvp->v_mount;
2970	struct thread *td = ap->a_td;
2971	int error, asyncflag;
2972	struct bufobj *bo;
2973
2974	/*
2975	 * We only need to do something if this is a lazy evaluation.
2976	 */
2977	if (ap->a_waitfor != MNT_LAZY)
2978		return (0);
2979
2980	/*
2981	 * Move ourselves to the back of the sync list.
2982	 */
2983	bo = &syncvp->v_bufobj;
2984	BO_LOCK(bo);
2985	vn_syncer_add_to_worklist(bo, syncdelay);
2986	BO_UNLOCK(bo);
2987
2988	/*
2989	 * Walk the list of vnodes pushing all that are dirty and
2990	 * not already on the sync list.
2991	 */
2992	mtx_lock(&mountlist_mtx);
2993	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
2994		mtx_unlock(&mountlist_mtx);
2995		return (0);
2996	}
2997	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
2998		vfs_unbusy(mp, td);
2999		return (0);
3000	}
3001	asyncflag = mp->mnt_flag & MNT_ASYNC;
3002	mp->mnt_flag &= ~MNT_ASYNC;
3003	vfs_msync(mp, MNT_NOWAIT);
3004	error = VFS_SYNC(mp, MNT_LAZY, td);
3005	if (asyncflag)
3006		mp->mnt_flag |= MNT_ASYNC;
3007	vn_finished_write(mp);
3008	vfs_unbusy(mp, td);
3009	return (error);
3010}
3011
3012/*
3013 * The syncer vnode is no longer referenced.
3014 */
3015static int
3016sync_inactive(ap)
3017	struct vop_inactive_args /* {
3018		struct vnode *a_vp;
3019		struct thread *a_td;
3020	} */ *ap;
3021{
3022
3023	vgone(ap->a_vp);
3024	return (0);
3025}
3026
3027/*
3028 * The syncer vnode is no longer needed and is being decommissioned.
3029 *
3030 * Modifications to the worklist must be protected by sync_mtx.
3031 */
3032static int
3033sync_reclaim(ap)
3034	struct vop_reclaim_args /* {
3035		struct vnode *a_vp;
3036	} */ *ap;
3037{
3038	struct vnode *vp = ap->a_vp;
3039	struct bufobj *bo;
3040
3041	VI_LOCK(vp);
3042	bo = &vp->v_bufobj;
3043	vp->v_mount->mnt_syncer = NULL;
3044	if (bo->bo_flag & BO_ONWORKLST) {
3045		mtx_lock(&sync_mtx);
3046		LIST_REMOVE(bo, bo_synclist);
3047 		syncer_worklist_len--;
3048		sync_vnode_count--;
3049		mtx_unlock(&sync_mtx);
3050		bo->bo_flag &= ~BO_ONWORKLST;
3051	}
3052	VI_UNLOCK(vp);
3053
3054	return (0);
3055}
3056
3057/*
3058 * Check if vnode represents a disk device
3059 */
3060int
3061vn_isdisk(vp, errp)
3062	struct vnode *vp;
3063	int *errp;
3064{
3065	int error;
3066
3067	error = 0;
3068	dev_lock();
3069	if (vp->v_type != VCHR)
3070		error = ENOTBLK;
3071	else if (vp->v_rdev == NULL)
3072		error = ENXIO;
3073	else if (vp->v_rdev->si_devsw == NULL)
3074		error = ENXIO;
3075	else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
3076		error = ENOTBLK;
3077	dev_unlock();
3078	if (errp != NULL)
3079		*errp = error;
3080	return (error == 0);
3081}
3082
3083/*
3084 * Common filesystem object access control check routine.  Accepts a
3085 * vnode's type, "mode", uid and gid, requested access mode, credentials,
3086 * and optional call-by-reference privused argument allowing vaccess()
3087 * to indicate to the caller whether privilege was used to satisfy the
3088 * request (obsoleted).  Returns 0 on success, or an errno on failure.
3089 */
3090int
3091vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
3092	enum vtype type;
3093	mode_t file_mode;
3094	uid_t file_uid;
3095	gid_t file_gid;
3096	mode_t acc_mode;
3097	struct ucred *cred;
3098	int *privused;
3099{
3100	mode_t dac_granted;
3101#ifdef CAPABILITIES
3102	mode_t cap_granted;
3103#endif
3104
3105	/*
3106	 * Look for a normal, non-privileged way to access the file/directory
3107	 * as requested.  If it exists, go with that.
3108	 */
3109
3110	if (privused != NULL)
3111		*privused = 0;
3112
3113	dac_granted = 0;
3114
3115	/* Check the owner. */
3116	if (cred->cr_uid == file_uid) {
3117		dac_granted |= VADMIN;
3118		if (file_mode & S_IXUSR)
3119			dac_granted |= VEXEC;
3120		if (file_mode & S_IRUSR)
3121			dac_granted |= VREAD;
3122		if (file_mode & S_IWUSR)
3123			dac_granted |= (VWRITE | VAPPEND);
3124
3125		if ((acc_mode & dac_granted) == acc_mode)
3126			return (0);
3127
3128		goto privcheck;
3129	}
3130
3131	/* Otherwise, check the groups (first match) */
3132	if (groupmember(file_gid, cred)) {
3133		if (file_mode & S_IXGRP)
3134			dac_granted |= VEXEC;
3135		if (file_mode & S_IRGRP)
3136			dac_granted |= VREAD;
3137		if (file_mode & S_IWGRP)
3138			dac_granted |= (VWRITE | VAPPEND);
3139
3140		if ((acc_mode & dac_granted) == acc_mode)
3141			return (0);
3142
3143		goto privcheck;
3144	}
3145
3146	/* Otherwise, check everyone else. */
3147	if (file_mode & S_IXOTH)
3148		dac_granted |= VEXEC;
3149	if (file_mode & S_IROTH)
3150		dac_granted |= VREAD;
3151	if (file_mode & S_IWOTH)
3152		dac_granted |= (VWRITE | VAPPEND);
3153	if ((acc_mode & dac_granted) == acc_mode)
3154		return (0);
3155
3156privcheck:
3157	if (!suser_cred(cred, SUSER_ALLOWJAIL)) {
3158		/* XXX audit: privilege used */
3159		if (privused != NULL)
3160			*privused = 1;
3161		return (0);
3162	}
3163
3164#ifdef CAPABILITIES
3165	/*
3166	 * Build a capability mask to determine if the set of capabilities
3167	 * satisfies the requirements when combined with the granted mask
3168	 * from above.
3169	 * For each capability, if the capability is required, bitwise
3170	 * or the request type onto the cap_granted mask.
3171	 */
3172	cap_granted = 0;
3173
3174	if (type == VDIR) {
3175		/*
3176		 * For directories, use CAP_DAC_READ_SEARCH to satisfy
3177		 * VEXEC requests, instead of CAP_DAC_EXECUTE.
3178		 */
3179		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3180		    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL))
3181			cap_granted |= VEXEC;
3182	} else {
3183		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3184		    !cap_check(cred, NULL, CAP_DAC_EXECUTE, SUSER_ALLOWJAIL))
3185			cap_granted |= VEXEC;
3186	}
3187
3188	if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
3189	    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL))
3190		cap_granted |= VREAD;
3191
3192	if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3193	    !cap_check(cred, NULL, CAP_DAC_WRITE, SUSER_ALLOWJAIL))
3194		cap_granted |= (VWRITE | VAPPEND);
3195
3196	if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3197	    !cap_check(cred, NULL, CAP_FOWNER, SUSER_ALLOWJAIL))
3198		cap_granted |= VADMIN;
3199
3200	if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
3201		/* XXX audit: privilege used */
3202		if (privused != NULL)
3203			*privused = 1;
3204		return (0);
3205	}
3206#endif
3207
3208	return ((acc_mode & VADMIN) ? EPERM : EACCES);
3209}
3210
3211/*
3212 * Credential check based on process requesting service, and per-attribute
3213 * permissions.
3214 */
3215int
3216extattr_check_cred(struct vnode *vp, int attrnamespace,
3217    struct ucred *cred, struct thread *td, int access)
3218{
3219
3220	/*
3221	 * Kernel-invoked requests always succeed.
3222	 */
3223	if (cred == NOCRED)
3224		return (0);
3225
3226	/*
3227	 * Do not allow privileged processes in jail to directly
3228	 * manipulate system attributes.
3229	 *
3230	 * XXX What capability should apply here?
3231	 * Probably CAP_SYS_SETFFLAG.
3232	 */
3233	switch (attrnamespace) {
3234	case EXTATTR_NAMESPACE_SYSTEM:
3235		/* Potentially should be: return (EPERM); */
3236		return (suser_cred(cred, 0));
3237	case EXTATTR_NAMESPACE_USER:
3238		return (VOP_ACCESS(vp, access, cred, td));
3239	default:
3240		return (EPERM);
3241	}
3242}
3243
3244#ifdef DEBUG_VFS_LOCKS
3245/*
3246 * This only exists to suppress warnings from unlocked specfs accesses.  It is
3247 * no longer OK to have an unlocked VFS.
3248 */
3249#define	IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD)
3250
3251int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
3252SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "");
3253
3254int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
3255SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "");
3256
3257int vfs_badlock_print = 1;	/* Print lock violations. */
3258SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "");
3259
3260#ifdef KDB
3261int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
3262SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "");
3263#endif
3264
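/*
 * Report a vnode locking violation: optionally print a backtrace, print the
 * offending vnode and message, and drop into the debugger if requested.
 */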
3265static void
3266vfs_badlock(const char *msg, const char *str, struct vnode *vp)
3267{
3268
3269#ifdef KDB
3270	if (vfs_badlock_backtrace)
3271		kdb_backtrace();
3272#endif
3273	if (vfs_badlock_print)
3274		printf("%s: %p %s\n", str, (void *)vp, msg);
3275	if (vfs_badlock_ddb)
3276		kdb_enter("lock violation");
3277}
3278
3279void
3280assert_vi_locked(struct vnode *vp, const char *str)
3281{
3282
3283	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
3284		vfs_badlock("interlock is not locked but should be", str, vp);
3285}
3286
3287void
3288assert_vi_unlocked(struct vnode *vp, const char *str)
3289{
3290
3291	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
3292		vfs_badlock("interlock is locked but should not be", str, vp);
3293}
3294
3295void
3296assert_vop_locked(struct vnode *vp, const char *str)
3297{
3298
3299	if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, NULL) == 0)
3300		vfs_badlock("is not locked but should be", str, vp);
3301}
3302
3303void
3304assert_vop_unlocked(struct vnode *vp, const char *str)
3305{
3306
3307	if (vp && !IGNORE_LOCK(vp) &&
3308	    VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE)
3309		vfs_badlock("is locked but should not be", str, vp);
3310}
3311
3312void
3313assert_vop_elocked(struct vnode *vp, const char *str)
3314{
3315
3316	if (vp && !IGNORE_LOCK(vp) &&
3317	    VOP_ISLOCKED(vp, curthread) != LK_EXCLUSIVE)
3318		vfs_badlock("is not exclusive locked but should be", str, vp);
3319}
3320
3321#if 0
3322void
3323assert_vop_elocked_other(struct vnode *vp, const char *str)
3324{
3325
3326	if (vp && !IGNORE_LOCK(vp) &&
3327	    VOP_ISLOCKED(vp, curthread) != LK_EXCLOTHER)
3328		vfs_badlock("is not exclusive locked by another thread",
3329		    str, vp);
3330}
3331
3332void
3333assert_vop_slocked(struct vnode *vp, const char *str)
3334{
3335
3336	if (vp && !IGNORE_LOCK(vp) &&
3337	    VOP_ISLOCKED(vp, curthread) != LK_SHARED)
3338		vfs_badlock("is not locked shared but should be", str, vp);
3339}
3340#endif /* 0 */
3341#endif /* DEBUG_VFS_LOCKS */
3342
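/*
 * Called before VOP_RENAME.  Under DEBUG_VFS_LOCKS, verify the locking
 * protocol for the vnodes involved; in all cases take hold references on
 * them so they cannot be recycled during the rename.  The holds are
 * dropped in vop_rename_post().
 */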
3343void
3344vop_rename_pre(void *ap)
3345{
3346	struct vop_rename_args *a = ap;
3347
3348#ifdef DEBUG_VFS_LOCKS
3349	if (a->a_tvp)
3350		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
3351	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
3352	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
3353	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
3354
3355	/* Check the source (from). */
3356	if (a->a_tdvp != a->a_fdvp)
3357		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
3358	if (a->a_tvp != a->a_fvp)
3359		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
3360
3361	/* Check the target. */
3362	if (a->a_tvp)
3363		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
3364	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
3365#endif
3366	if (a->a_tdvp != a->a_fdvp)
3367		vholdl(a->a_fdvp);
3368	if (a->a_tvp != a->a_fvp)
3369		vhold(a->a_fvp);
3370	vhold(a->a_tdvp);
3371	if (a->a_tvp)
3372		vhold(a->a_tvp);
3373}
3374
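/*
 * Called before VOP_STRATEGY.  Under DEBUG_VFS_LOCKS, complain if the
 * buffer being pushed is not locked; cluster buffers are exempt.
 */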
3375void
3376vop_strategy_pre(void *ap)
3377{
3378#ifdef DEBUG_VFS_LOCKS
3379	struct vop_strategy_args *a;
3380	struct buf *bp;
3381
3382	a = ap;
3383	bp = a->a_bp;
3384
3385	/*
3386	 * Cluster ops lock their component buffers but not the IO container.
3387	 */
3388	if ((bp->b_flags & B_CLUSTER) != 0)
3389		return;
3390
3391	if (BUF_REFCNT(bp) < 1) {
3392		if (vfs_badlock_print)
3393			printf(
3394			    "VOP_STRATEGY: bp is not locked but should be\n");
3395		if (vfs_badlock_ddb)
3396			kdb_enter("lock violation");
3397	}
3398#endif
3399}
3400
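/*
 * The vop_lookup, vop_lock and vop_unlock pre/post hooks below verify the
 * vnode interlock and vnode lock state around the corresponding VOPs when
 * DEBUG_VFS_LOCKS is configured and are no-ops otherwise.
 */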
3401void
3402vop_lookup_pre(void *ap)
3403{
3404#ifdef DEBUG_VFS_LOCKS
3405	struct vop_lookup_args *a;
3406	struct vnode *dvp;
3407
3408	a = ap;
3409	dvp = a->a_dvp;
3410	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
3411	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
3412#endif
3413}
3414
3415void
3416vop_lookup_post(void *ap, int rc)
3417{
3418#ifdef DEBUG_VFS_LOCKS
3419	struct vop_lookup_args *a;
3420	struct vnode *dvp;
3421	struct vnode *vp;
3422
3423	a = ap;
3424	dvp = a->a_dvp;
3425	vp = *(a->a_vpp);
3426
3427	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
3428	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
3429
3430	if (!rc)
3431		ASSERT_VOP_LOCKED(vp, "VOP_LOOKUP (child)");
3432#endif
3433}
3434
3435void
3436vop_lock_pre(void *ap)
3437{
3438#ifdef DEBUG_VFS_LOCKS
3439	struct vop_lock_args *a = ap;
3440
3441	if ((a->a_flags & LK_INTERLOCK) == 0)
3442		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
3443	else
3444		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
3445#endif
3446}
3447
3448void
3449vop_lock_post(void *ap, int rc)
3450{
3451#ifdef DEBUG_VFS_LOCKS
3452	struct vop_lock_args *a = ap;
3453
3454	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
3455	if (rc == 0)
3456		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
3457#endif
3458}
3459
3460void
3461vop_unlock_pre(void *ap)
3462{
3463#ifdef DEBUG_VFS_LOCKS
3464	struct vop_unlock_args *a = ap;
3465
3466	if (a->a_flags & LK_INTERLOCK)
3467		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
3468	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
3469#endif
3470}
3471
3472void
3473vop_unlock_post(void *ap, int rc)
3474{
3475#ifdef DEBUG_VFS_LOCKS
3476	struct vop_unlock_args *a = ap;
3477
3478	if (a->a_flags & LK_INTERLOCK)
3479		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
3480#endif
3481}
3482
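/*
 * The vop_*_post hooks below deliver the appropriate knote events
 * (NOTE_WRITE, NOTE_LINK, NOTE_DELETE, ...) once the corresponding
 * VOP has completed successfully.
 */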
3483void
3484vop_create_post(void *ap, int rc)
3485{
3486	struct vop_create_args *a = ap;
3487
3488	if (!rc)
3489		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
3490}
3491
3492void
3493vop_link_post(void *ap, int rc)
3494{
3495	struct vop_link_args *a = ap;
3496
3497	if (!rc) {
3498		VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
3499		VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
3500	}
3501}
3502
3503void
3504vop_mkdir_post(void *ap, int rc)
3505{
3506	struct vop_mkdir_args *a = ap;
3507
3508	if (!rc)
3509		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
3510}
3511
3512void
3513vop_mknod_post(void *ap, int rc)
3514{
3515	struct vop_mknod_args *a = ap;
3516
3517	if (!rc)
3518		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
3519}
3520
3521void
3522vop_remove_post(void *ap, int rc)
3523{
3524	struct vop_remove_args *a = ap;
3525
3526	if (!rc) {
3527		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
3528		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
3529	}
3530}
3531
3532void
3533vop_rename_post(void *ap, int rc)
3534{
3535	struct vop_rename_args *a = ap;
3536
3537	if (!rc) {
3538		VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
3539		VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
3540		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
3541		if (a->a_tvp)
3542			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
3543	}
3544	if (a->a_tdvp != a->a_fdvp)
3545		vdrop(a->a_fdvp);
3546	if (a->a_tvp != a->a_fvp)
3547		vdrop(a->a_fvp);
3548	vdrop(a->a_tdvp);
3549	if (a->a_tvp)
3550		vdrop(a->a_tvp);
3551}
3552
3553void
3554vop_rmdir_post(void *ap, int rc)
3555{
3556	struct vop_rmdir_args *a = ap;
3557
3558	if (!rc) {
3559		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
3560		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
3561	}
3562}
3563
3564void
3565vop_setattr_post(void *ap, int rc)
3566{
3567	struct vop_setattr_args *a = ap;
3568
3569	if (!rc)
3570		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
3571}
3572
3573void
3574vop_symlink_post(void *ap, int rc)
3575{
3576	struct vop_symlink_args *a = ap;
3577
3578	if (!rc)
3579		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
3580}
3581
3582static struct knlist fs_knlist;
3583
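/*
 * Initialize the knote list used to deliver filesystem-wide events.
 */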
3584static void
3585vfs_event_init(void *arg)
3586{
3587	knlist_init(&fs_knlist, NULL, NULL, NULL, NULL);
3588}
3589/* XXX - correct order? */
3590SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
3591
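/*
 * Post a filesystem event to every listener registered on fs_knlist.
 * Only the event itself is delivered; the fsid and data arguments are
 * currently unused.
 */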
3592void
3593vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data __unused)
3594{
3595
3596	KNOTE_UNLOCKED(&fs_knlist, event);
3597}
3598
3599static int	filt_fsattach(struct knote *kn);
3600static void	filt_fsdetach(struct knote *kn);
3601static int	filt_fsevent(struct knote *kn, long hint);
3602
3603struct filterops fs_filtops =
3604	{ 0, filt_fsattach, filt_fsdetach, filt_fsevent };
3605
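/*
 * Filter operations backing fs_filtops: knotes are attached to and
 * detached from fs_knlist, and filt_fsevent accumulates the hinted
 * event flags.
 */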
3606static int
3607filt_fsattach(struct knote *kn)
3608{
3609
3610	kn->kn_flags |= EV_CLEAR;
3611	knlist_add(&fs_knlist, kn, 0);
3612	return (0);
3613}
3614
3615static void
3616filt_fsdetach(struct knote *kn)
3617{
3618
3619	knlist_remove(&fs_knlist, kn, 0);
3620}
3621
3622static int
3623filt_fsevent(struct knote *kn, long hint)
3624{
3625
3626	kn->kn_fflags |= hint;
3627	return (kn->kn_fflags != 0);
3628}
3629
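/*
 * Handler for the vfs.ctl sysctl: locate the mount point named by the
 * supplied fsid, check that the request matches its filesystem type and
 * pass the operation on to VFS_SYSCTL().
 */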
3630static int
3631sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
3632{
3633	struct vfsidctl vc;
3634	int error;
3635	struct mount *mp;
3636
3637	error = SYSCTL_IN(req, &vc, sizeof(vc));
3638	if (error)
3639		return (error);
3640	if (vc.vc_vers != VFS_CTL_VERS1)
3641		return (EINVAL);
3642	mp = vfs_getvfs(&vc.vc_fsid);
3643	if (mp == NULL)
3644		return (ENOENT);
3645	/* ensure that a specific sysctl goes to the right filesystem. */
3646	if (strcmp(vc.vc_fstypename, "*") != 0 &&
3647	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
3648		return (EINVAL);
3649	}
3650	VCTLTOREQ(&vc, req);
3651	return (VFS_SYSCTL(mp, vc.vc_op, req));
3652}
3653
3654SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR,
3655        NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid");
3656
3657/*
3658 * Function to initialize a va_filerev field sensibly.
3659 * XXX: Wouldn't a random number make a lot more sense ??
3660 */
3661u_quad_t
3662init_va_filerev(void)
3663{
3664	struct bintime bt;
3665
3666	getbinuptime(&bt);
3667	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
3668}
3669
3670static int	filt_vfsread(struct knote *kn, long hint);
3671static int	filt_vfswrite(struct knote *kn, long hint);
3672static int	filt_vfsvnode(struct knote *kn, long hint);
3673static void	filt_vfsdetach(struct knote *kn);
3674static struct filterops vfsread_filtops =
3675	{ 1, NULL, filt_vfsdetach, filt_vfsread };
3676static struct filterops vfswrite_filtops =
3677	{ 1, NULL, filt_vfsdetach, filt_vfswrite };
3678static struct filterops vfsvnode_filtops =
3679	{ 1, NULL, filt_vfsdetach, filt_vfsvnode };
3680
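/*
 * knlist locking callbacks for the per-vnode knote list; the vnode lock
 * serializes access to the notes hanging off v_pollinfo.
 */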
3681static void
3682vfs_knllock(void *arg)
3683{
3684	struct vnode *vp = arg;
3685
3686	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
3687}
3688
3689static void
3690vfs_knlunlock(void *arg)
3691{
3692	struct vnode *vp = arg;
3693
3694	VOP_UNLOCK(vp, 0, curthread);
3695}
3696
3697static int
3698vfs_knllocked(void *arg)
3699{
3700	struct vnode *vp = arg;
3701
3702	return (VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE);
3703}
3704
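/*
 * Default VOP_KQFILTER implementation: pick the filter ops matching the
 * requested filter type and attach the knote to the vnode's pollinfo
 * knote list, allocating the pollinfo on first use.
 */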
3705int
3706vfs_kqfilter(struct vop_kqfilter_args *ap)
3707{
3708	struct vnode *vp = ap->a_vp;
3709	struct knote *kn = ap->a_kn;
3710	struct knlist *knl;
3711
3712	switch (kn->kn_filter) {
3713	case EVFILT_READ:
3714		kn->kn_fop = &vfsread_filtops;
3715		break;
3716	case EVFILT_WRITE:
3717		kn->kn_fop = &vfswrite_filtops;
3718		break;
3719	case EVFILT_VNODE:
3720		kn->kn_fop = &vfsvnode_filtops;
3721		break;
3722	default:
3723		return (1);
3724	}
3725
3726	kn->kn_hook = (caddr_t)vp;
3727
3728	if (vp->v_pollinfo == NULL)
3729		v_addpollinfo(vp);
3730	if (vp->v_pollinfo == NULL)
3731		return (ENOMEM);
3732	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
3733	knlist_add(knl, kn, 0);
3734
3735	return (0);
3736}
3737
3738/*
3739 * Detach knote from vnode
3740 */
3741static void
3742filt_vfsdetach(struct knote *kn)
3743{
3744	struct vnode *vp = (struct vnode *)kn->kn_hook;
3745
3746	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
3747	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
3748}
3749
3750/*ARGSUSED*/
3751static int
3752filt_vfsread(struct knote *kn, long hint)
3753{
3754	struct vnode *vp = (struct vnode *)kn->kn_hook;
3755	struct vattr va;
3756
3757	/*
3758	 * filesystem is gone, so set the EOF flag and schedule
3759	 * the knote for deletion.
3760	 */
3761	if (hint == NOTE_REVOKE) {
3762		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3763		return (1);
3764	}
3765
3766	if (VOP_GETATTR(vp, &va, curthread->td_ucred, curthread))
3767		return (0);
3768
3769	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
3770	return (kn->kn_data != 0);
3771}
3772
3773/*ARGSUSED*/
3774static int
3775filt_vfswrite(struct knote *kn, long hint)
3776{
3777	/*
3778	 * filesystem is gone, so set the EOF flag and schedule
3779	 * the knote for deletion.
3780	 */
3781	if (hint == NOTE_REVOKE)
3782		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3783
3784	kn->kn_data = 0;
3785	return (1);
3786}
3787
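/*
 * Decide whether an EVFILT_VNODE knote should fire: latch any hinted
 * events the caller asked for and report EOF when the vnode is revoked.
 */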
3788static int
3789filt_vfsvnode(struct knote *kn, long hint)
3790{
3791	if (kn->kn_sfflags & hint)
3792		kn->kn_fflags |= hint;
3793	if (hint == NOTE_REVOKE) {
3794		kn->kn_flags |= EV_EOF;
3795		return (1);
3796	}
3797	return (kn->kn_fflags != 0);
3798}
3799