vfs_subr.c revision 178243
1/*-
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
35 */
36
37/*
38 * External virtual filesystem routines
39 */
40
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD: head/sys/kern/vfs_subr.c 178243 2008-04-16 11:33:32Z kib $");
43
44#include "opt_ddb.h"
45#include "opt_mac.h"
46
47#include <sys/param.h>
48#include <sys/systm.h>
49#include <sys/bio.h>
50#include <sys/buf.h>
51#include <sys/conf.h>
52#include <sys/dirent.h>
53#include <sys/event.h>
54#include <sys/eventhandler.h>
55#include <sys/extattr.h>
56#include <sys/file.h>
57#include <sys/fcntl.h>
58#include <sys/jail.h>
59#include <sys/kdb.h>
60#include <sys/kernel.h>
61#include <sys/kthread.h>
62#include <sys/lockf.h>
63#include <sys/malloc.h>
64#include <sys/mount.h>
65#include <sys/namei.h>
66#include <sys/priv.h>
67#include <sys/reboot.h>
68#include <sys/sleepqueue.h>
69#include <sys/stat.h>
70#include <sys/sysctl.h>
71#include <sys/syslog.h>
72#include <sys/vmmeter.h>
73#include <sys/vnode.h>
74
75#include <machine/stdarg.h>
76
77#include <security/mac/mac_framework.h>
78
79#include <vm/vm.h>
80#include <vm/vm_object.h>
81#include <vm/vm_extern.h>
82#include <vm/pmap.h>
83#include <vm/vm_map.h>
84#include <vm/vm_page.h>
85#include <vm/vm_kern.h>
86#include <vm/uma.h>
87
88#ifdef DDB
89#include <ddb/ddb.h>
90#endif
91
92static MALLOC_DEFINE(M_NETADDR, "subr_export_host", "Export host address structure");
93
94static void	delmntque(struct vnode *vp);
95static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
96		    int slpflag, int slptimeo);
97static void	syncer_shutdown(void *arg, int howto);
98static int	vtryrecycle(struct vnode *vp);
99static void	vbusy(struct vnode *vp);
100static void	vinactive(struct vnode *, struct thread *);
101static void	v_incr_usecount(struct vnode *);
102static void	v_decr_usecount(struct vnode *);
103static void	v_decr_useonly(struct vnode *);
104static void	v_upgrade_usecount(struct vnode *);
105static void	vfree(struct vnode *);
106static void	vnlru_free(int);
107static void	vdestroy(struct vnode *);
108static void	vgonel(struct vnode *);
109static void	vfs_knllock(void *arg);
110static void	vfs_knlunlock(void *arg);
111static int	vfs_knllocked(void *arg);
112
113
114/*
115 * Enable Giant pushdown based on whether or not the vm is mpsafe in this
116 * build.  Without mpsafevm the buffer cache can not run Giant free.
117 */
118int mpsafe_vfs = 1;
119TUNABLE_INT("debug.mpsafevfs", &mpsafe_vfs);
120SYSCTL_INT(_debug, OID_AUTO, mpsafevfs, CTLFLAG_RD, &mpsafe_vfs, 0,
121    "MPSAFE VFS");
122
123/*
124 * Number of vnodes in existence.  Increased whenever getnewvnode()
125 * allocates a new vnode and decreased when vdestroy() is called on a
126 * VI_DOOMED vnode.
127 */
128static unsigned long	numvnodes;
129
130SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
131
132/*
133 * Conversion tables for conversion from vnode types to inode formats
134 * and back.
135 */
136enum vtype iftovt_tab[16] = {
137	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
138	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
139};
140int vttoif_tab[10] = {
141	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
142	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
143};
144
145/*
146 * List of vnodes that are ready for recycling.
147 */
148static TAILQ_HEAD(freelst, vnode) vnode_free_list;
149
150/*
151 * Free vnode target.  Free vnodes may simply be files which have been stat'd
152 * but not read.  This is somewhat common, and a small cache of such files
153 * should be kept to avoid recreation costs.
154 */
155static u_long wantfreevnodes;
156SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
157/* Number of vnodes in the free list. */
158static u_long freevnodes;
159SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
160
161/*
162 * Various variables used for debugging the new implementation of
163 * reassignbuf().
164 * XXX these are probably of (very) limited utility now.
165 */
166static int reassignbufcalls;
167SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
168
169/*
170 * Cache for the mount type id assigned to NFS.  This is used for
171 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
172 */
173int	nfs_mount_type = -1;
174
175/* To keep more than one thread at a time from running vfs_getnewfsid */
176static struct mtx mntid_mtx;
177
178/*
179 * Lock for any access to the following:
180 *	vnode_free_list
181 *	numvnodes
182 *	freevnodes
183 */
184static struct mtx vnode_free_list_mtx;
185
186/* Publicly exported FS */
187struct nfs_public nfs_pub;
188
189/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
190static uma_zone_t vnode_zone;
191static uma_zone_t vnodepoll_zone;
192
193/* Set to 1 to print out reclaim of active vnodes */
194int	prtactive;
195
196/*
197 * The workitem queue.
198 *
199 * It is useful to delay writes of file data and filesystem metadata
200 * for tens of seconds so that quickly created and deleted files need
201 * not waste disk bandwidth being created and removed. To realize this,
202 * we append vnodes to a "workitem" queue. When running with a soft
203 * updates implementation, most pending metadata dependencies should
204 * not wait for more than a few seconds. Thus, metadata on block devices
205 * is delayed only about half the time that file data is delayed.
206 * Similarly, directory updates are more critical, so they are delayed
207 * only about a third of the time that file data is delayed. Thus, there are
208 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
209 * one each second (driven off the filesystem syncer process). The
210 * syncer_delayno variable indicates the next queue that is to be processed.
211 * Items that need to be processed soon are placed in this queue:
212 *
213 *	syncer_workitem_pending[syncer_delayno]
214 *
215 * A delay of fifteen seconds is done by placing the request fifteen
216 * entries later in the queue:
217 *
218 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
219 *
220 */
221static int syncer_delayno;
222static long syncer_mask;
223LIST_HEAD(synclist, bufobj);
224static struct synclist *syncer_workitem_pending;
225/*
226 * The sync_mtx protects:
227 *	bo->bo_synclist
228 *	sync_vnode_count
229 *	syncer_delayno
230 *	syncer_state
231 *	syncer_workitem_pending
232 *	syncer_worklist_len
233 *	rushjob
234 */
235static struct mtx sync_mtx;
236
237#define SYNCER_MAXDELAY		32
238static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
239static int syncdelay = 30;		/* max time to delay syncing data */
240static int filedelay = 30;		/* time to delay syncing files */
241SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
242static int dirdelay = 29;		/* time to delay syncing directories */
243SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
244static int metadelay = 28;		/* time to delay syncing metadata */
245SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
246static int rushjob;		/* number of slots to run ASAP */
247static int stat_rush_requests;	/* number of times I/O speeded up */
248SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
249
250/*
251 * When shutting down the syncer, run it at four times normal speed.
252 */
253#define SYNCER_SHUTDOWN_SPEEDUP		4
254static int sync_vnode_count;
255static int syncer_worklist_len;
256static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
257    syncer_state;
258
259/*
260 * Number of vnodes we want to exist at any one time.  This is mostly used
261 * to size hash tables in vnode-related code.  It is normally not used in
262 * getnewvnode(), as wantfreevnodes is normally nonzero.
263 *
264 * XXX desiredvnodes is historical cruft and should not exist.
265 */
266int desiredvnodes;
267SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
268    &desiredvnodes, 0, "Maximum number of vnodes");
269SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
270    &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
271static int vnlru_nowhere;
272SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
273    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
274
275/*
276 * Macros to control when a vnode is freed and recycled.  All require
277 * the vnode interlock.
278 */
279#define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
280#define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
281#define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)
282
283
284/*
285 * Initialize the vnode management data structures.
286 */
287#ifndef	MAXVNODES_MAX
288#define	MAXVNODES_MAX	100000
289#endif
290static void
291vntblinit(void *dummy __unused)
292{
293
294	/*
295	 * Desiredvnodes is a function of the physical memory size and
296	 * the kernel's heap size.  Specifically, desiredvnodes scales
297	 * in proportion to the physical memory size until two fifths
298	 * of the kernel's heap size is consumed by vnodes and vm
299	 * objects.
300	 */
301	desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
302	    (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
303	if (desiredvnodes > MAXVNODES_MAX) {
304		if (bootverbose)
305			printf("Reducing kern.maxvnodes %d -> %d\n",
306			    desiredvnodes, MAXVNODES_MAX);
307		desiredvnodes = MAXVNODES_MAX;
308	}
309	wantfreevnodes = desiredvnodes / 4;
310	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
311	TAILQ_INIT(&vnode_free_list);
312	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
313	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
314	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
315	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
316	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
317	/*
318	 * Initialize the filesystem syncer.
319	 */
320	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
321		&syncer_mask);
322	syncer_maxdelay = syncer_mask + 1;
323	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
324}
325SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
326
327
328/*
329 * Mark a mount point as busy. Used to synchronize access and to delay
330 * unmounting. Interlock is not released on failure.
331 */
332int
333vfs_busy(struct mount *mp, int flags, struct mtx *interlkp,
334    struct thread *td)
335{
336	int lkflags;
337
338	MNT_ILOCK(mp);
339	MNT_REF(mp);
340	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
341		if (flags & LK_NOWAIT) {
342			MNT_REL(mp);
343			MNT_IUNLOCK(mp);
344			return (ENOENT);
345		}
346		if (interlkp)
347			mtx_unlock(interlkp);
348		mp->mnt_kern_flag |= MNTK_MWAIT;
349		/*
350		 * Since all busy locks are shared except the exclusive
351		 * lock granted when unmounting, the only place that a
352		 * wakeup needs to be done is at the release of the
353		 * exclusive lock at the end of dounmount.
354		 */
355		msleep(mp, MNT_MTX(mp), PVFS, "vfs_busy", 0);
356		MNT_REL(mp);
357		MNT_IUNLOCK(mp);
358		if (interlkp)
359			mtx_lock(interlkp);
360		return (ENOENT);
361	}
362	if (interlkp)
363		mtx_unlock(interlkp);
364	lkflags = LK_SHARED | LK_INTERLOCK;
365	if (lockmgr(&mp->mnt_lock, lkflags, MNT_MTX(mp)))
366		panic("vfs_busy: unexpected lock failure");
367	return (0);
368}
369
370/*
371 * Free a busy filesystem.
372 */
373void
374vfs_unbusy(struct mount *mp, struct thread *td)
375{
376
377	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL);
378	vfs_rel(mp);
379}
380
381/*
382 * Lookup a mount point by filesystem identifier.
383 */
384struct mount *
385vfs_getvfs(fsid_t *fsid)
386{
387	struct mount *mp;
388
389	mtx_lock(&mountlist_mtx);
390	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
391		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
392		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
393			vfs_ref(mp);
394			mtx_unlock(&mountlist_mtx);
395			return (mp);
396		}
397	}
398	mtx_unlock(&mountlist_mtx);
399	return ((struct mount *) 0);
400}
401
402/*
403 * Check if a user can access privileged mount options.
404 */
405int
406vfs_suser(struct mount *mp, struct thread *td)
407{
408	int error;
409
410	/*
411	 * If the thread is jailed, but this is not a jail-friendly file
412	 * system, deny immediately.
413	 */
414	if (jailed(td->td_ucred) && !(mp->mnt_vfc->vfc_flags & VFCF_JAIL))
415		return (EPERM);
416
417	/*
418	 * If the file system was mounted outside a jail and a jailed thread
419	 * tries to access it, deny immediately.
420	 */
421	if (!jailed(mp->mnt_cred) && jailed(td->td_ucred))
422		return (EPERM);
423
424	/*
425	 * If the file system was mounted inside a different jail than that of
426	 * the calling thread, deny immediately.
427	 */
428	if (jailed(mp->mnt_cred) && jailed(td->td_ucred) &&
429	    mp->mnt_cred->cr_prison != td->td_ucred->cr_prison) {
430		return (EPERM);
431	}
432
433	if ((mp->mnt_flag & MNT_USER) == 0 ||
434	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
435		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
436			return (error);
437	}
438	return (0);
439}
440
441/*
442 * Get a new unique fsid.  Try to make its val[0] unique, since this value
443 * will be used to create fake device numbers for stat().  Also try (but
444 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
445 * support 16-bit device numbers.  We end up with unique val[0]'s for the
446 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
447 *
448 * Keep in mind that several mounts may be running in parallel.  Starting
449 * the search one past where the previous search terminated is both a
450 * micro-optimization and a defense against returning the same fsid to
451 * different mounts.
452 */
453void
454vfs_getnewfsid(struct mount *mp)
455{
456	static u_int16_t mntid_base;
457	struct mount *nmp;
458	fsid_t tfsid;
459	int mtype;
460
461	mtx_lock(&mntid_mtx);
462	mtype = mp->mnt_vfc->vfc_typenum;
463	tfsid.val[1] = mtype;
464	mtype = (mtype & 0xFF) << 24;
465	for (;;) {
466		tfsid.val[0] = makedev(255,
467		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
468		mntid_base++;
469		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
470			break;
471		vfs_rel(nmp);
472	}
473	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
474	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
475	mtx_unlock(&mntid_mtx);
476}
477
478/*
479 * Knob to control the precision of file timestamps:
480 *
481 *   0 = seconds only; nanoseconds zeroed.
482 *   1 = seconds and nanoseconds, accurate within 1/HZ.
483 *   2 = seconds and nanoseconds, truncated to microseconds.
484 * >=3 = seconds and nanoseconds, maximum precision.
485 */
486enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
487
488static int timestamp_precision = TSP_SEC;
489SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
490    &timestamp_precision, 0, "");
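
/*
 * For example, "sysctl vfs.timestamp_precision=3" (TSP_NSEC) selects full
 * nanosecond resolution for file timestamps.
 */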
491
492/*
493 * Get a current timestamp.
494 */
495void
496vfs_timestamp(struct timespec *tsp)
497{
498	struct timeval tv;
499
500	switch (timestamp_precision) {
501	case TSP_SEC:
502		tsp->tv_sec = time_second;
503		tsp->tv_nsec = 0;
504		break;
505	case TSP_HZ:
506		getnanotime(tsp);
507		break;
508	case TSP_USEC:
509		microtime(&tv);
510		TIMEVAL_TO_TIMESPEC(&tv, tsp);
511		break;
512	case TSP_NSEC:
513	default:
514		nanotime(tsp);
515		break;
516	}
517}
518
519/*
520 * Set vnode attributes to VNOVAL
521 */
522void
523vattr_null(struct vattr *vap)
524{
525
526	vap->va_type = VNON;
527	vap->va_size = VNOVAL;
528	vap->va_bytes = VNOVAL;
529	vap->va_mode = VNOVAL;
530	vap->va_nlink = VNOVAL;
531	vap->va_uid = VNOVAL;
532	vap->va_gid = VNOVAL;
533	vap->va_fsid = VNOVAL;
534	vap->va_fileid = VNOVAL;
535	vap->va_blocksize = VNOVAL;
536	vap->va_rdev = VNOVAL;
537	vap->va_atime.tv_sec = VNOVAL;
538	vap->va_atime.tv_nsec = VNOVAL;
539	vap->va_mtime.tv_sec = VNOVAL;
540	vap->va_mtime.tv_nsec = VNOVAL;
541	vap->va_ctime.tv_sec = VNOVAL;
542	vap->va_ctime.tv_nsec = VNOVAL;
543	vap->va_birthtime.tv_sec = VNOVAL;
544	vap->va_birthtime.tv_nsec = VNOVAL;
545	vap->va_flags = VNOVAL;
546	vap->va_gen = VNOVAL;
547	vap->va_vaflags = 0;
548}
549
550/*
551 * This routine is called when we have too many vnodes.  It attempts
552 * to free <count> vnodes and will potentially free vnodes that still
553 * have VM backing store (VM backing store is typically the cause
554 * of a vnode blowout so we want to do this).  Therefore, this operation
555 * is not considered cheap.
556 *
557 * A number of conditions may prevent a vnode from being reclaimed:
558 * the buffer cache may have references on the vnode, a directory
559 * vnode may still have references due to the namei cache representing
560 * underlying files, or the vnode may be in active use.  It is not
561 * desirable to reuse such vnodes.  These conditions may cause the
562 * number of vnodes to reach some minimum value regardless of what
563 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
564 */
565static int
566vlrureclaim(struct mount *mp)
567{
568	struct thread *td;
569	struct vnode *vp;
570	int done;
571	int trigger;
572	int usevnodes;
573	int count;
574
575	/*
576	 * Calculate the trigger point; don't allow user
577	 * screwups to blow us up.  This prevents us from
578	 * recycling vnodes with lots of resident pages.  We
579	 * aren't trying to free memory, we are trying to
580	 * free vnodes.
581	 */
582	usevnodes = desiredvnodes;
583	if (usevnodes <= 0)
584		usevnodes = 1;
585	trigger = cnt.v_page_count * 2 / usevnodes;
586	done = 0;
587	td = curthread;
588	vn_start_write(NULL, &mp, V_WAIT);
589	MNT_ILOCK(mp);
590	count = mp->mnt_nvnodelistsize / 10 + 1;
591	while (count != 0) {
592		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
593		while (vp != NULL && vp->v_type == VMARKER)
594			vp = TAILQ_NEXT(vp, v_nmntvnodes);
595		if (vp == NULL)
596			break;
597		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
598		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
599		--count;
600		if (!VI_TRYLOCK(vp))
601			goto next_iter;
602		/*
603		 * If it has already been deconstructed, is still
604		 * referenced, or exceeds the trigger, skip it.
605		 */
606		if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) ||
607		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
608		    vp->v_object->resident_page_count > trigger)) {
609			VI_UNLOCK(vp);
610			goto next_iter;
611		}
612		MNT_IUNLOCK(mp);
613		vholdl(vp);
614		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
615			vdrop(vp);
616			goto next_iter_mntunlocked;
617		}
618		VI_LOCK(vp);
619		/*
620		 * v_usecount may have been bumped after VOP_LOCK() dropped
621		 * the vnode interlock and before it was locked again.
622		 *
623		 * It is not necessary to recheck VI_DOOMED because it can
624		 * only be set by another thread that holds both the vnode
625		 * lock and vnode interlock.  If another thread has the
626		 * vnode lock before we get to VOP_LOCK() and obtains the
627		 * vnode interlock after VOP_LOCK() drops the vnode
628		 * interlock, the other thread will be unable to drop the
629		 * vnode lock before our VOP_LOCK() call fails.
630		 */
631		if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) ||
632		    (vp->v_object != NULL &&
633		    vp->v_object->resident_page_count > trigger)) {
634			VOP_UNLOCK(vp, LK_INTERLOCK);
635			goto next_iter_mntunlocked;
636		}
637		KASSERT((vp->v_iflag & VI_DOOMED) == 0,
638		    ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
639		vgonel(vp);
640		VOP_UNLOCK(vp, 0);
641		vdropl(vp);
642		done++;
643next_iter_mntunlocked:
644		if ((count % 256) != 0)
645			goto relock_mnt;
646		goto yield;
647next_iter:
648		if ((count % 256) != 0)
649			continue;
650		MNT_IUNLOCK(mp);
651yield:
652		uio_yield();
653relock_mnt:
654		MNT_ILOCK(mp);
655	}
656	MNT_IUNLOCK(mp);
657	vn_finished_write(mp);
658	return done;
659}
660
661/*
662 * Attempt to keep the free list at wantfreevnodes length.
663 */
664static void
665vnlru_free(int count)
666{
667	struct vnode *vp;
668	int vfslocked;
669
670	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
671	for (; count > 0; count--) {
672		vp = TAILQ_FIRST(&vnode_free_list);
673		/*
674		 * The list can be modified while the free_list_mtx
675		 * has been dropped and vp could be NULL here.
676		 */
677		if (!vp)
678			break;
679		VNASSERT(vp->v_op != NULL, vp,
680		    ("vnlru_free: vnode already reclaimed."));
681		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
682		/*
683		 * Don't recycle if we can't get the interlock.
684		 */
685		if (!VI_TRYLOCK(vp)) {
686			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
687			continue;
688		}
689		VNASSERT(VCANRECYCLE(vp), vp,
690		    ("vp inconsistent on freelist"));
691		freevnodes--;
692		vp->v_iflag &= ~VI_FREE;
693		vholdl(vp);
694		mtx_unlock(&vnode_free_list_mtx);
695		VI_UNLOCK(vp);
696		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
697		vtryrecycle(vp);
698		VFS_UNLOCK_GIANT(vfslocked);
699		/*
700		 * If the recycle succeeded, this vdrop will actually free
701		 * the vnode.  If not, it will simply place it back on
702		 * the free list.
703		 */
704		vdrop(vp);
705		mtx_lock(&vnode_free_list_mtx);
706	}
707}
708/*
709 * Attempt to recycle vnodes in a context that is always safe to block.
710 * Calling vlrureclaim() from the bowels of filesystem code has some
711 * interesting deadlock problems.
712 */
713static struct proc *vnlruproc;
714static int vnlruproc_sig;
715
716static void
717vnlru_proc(void)
718{
719	struct mount *mp, *nmp;
720	int done;
721	struct proc *p = vnlruproc;
722	struct thread *td = curthread;
723
724	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
725	    SHUTDOWN_PRI_FIRST);
726
727	mtx_lock(&Giant);
728
729	for (;;) {
730		kproc_suspend_check(p);
731		mtx_lock(&vnode_free_list_mtx);
732		if (freevnodes > wantfreevnodes)
733			vnlru_free(freevnodes - wantfreevnodes);
734		if (numvnodes <= desiredvnodes * 9 / 10) {
735			vnlruproc_sig = 0;
736			wakeup(&vnlruproc_sig);
737			msleep(vnlruproc, &vnode_free_list_mtx,
738			    PVFS|PDROP, "vlruwt", hz);
739			continue;
740		}
741		mtx_unlock(&vnode_free_list_mtx);
742		done = 0;
743		mtx_lock(&mountlist_mtx);
744		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
745			int vfsunlocked;
746			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
747				nmp = TAILQ_NEXT(mp, mnt_list);
748				continue;
749			}
750			if (!VFS_NEEDSGIANT(mp)) {
751				mtx_unlock(&Giant);
752				vfsunlocked = 1;
753			} else
754				vfsunlocked = 0;
755			done += vlrureclaim(mp);
756			if (vfsunlocked)
757				mtx_lock(&Giant);
758			mtx_lock(&mountlist_mtx);
759			nmp = TAILQ_NEXT(mp, mnt_list);
760			vfs_unbusy(mp, td);
761		}
762		mtx_unlock(&mountlist_mtx);
763		if (done == 0) {
764			EVENTHANDLER_INVOKE(vfs_lowvnodes, desiredvnodes / 10);
765#if 0
766			/* These messages are temporary debugging aids */
767			if (vnlru_nowhere < 5)
768				printf("vnlru process getting nowhere..\n");
769			else if (vnlru_nowhere == 5)
770				printf("vnlru process messages stopped.\n");
771#endif
772			vnlru_nowhere++;
773			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
774		} else
775			uio_yield();
776	}
777}
778
779static struct kproc_desc vnlru_kp = {
780	"vnlru",
781	vnlru_proc,
782	&vnlruproc
783};
784SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
785    &vnlru_kp);
786
787/*
788 * Routines having to do with the management of the vnode table.
789 */
790
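/*
 * Free a vnode that is being destroyed: decrement numvnodes, assert that
 * the vnode is fully clean and unreferenced, release its MAC label and
 * poll info, destroy its locks and return it to the vnode zone.
 */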
791static void
792vdestroy(struct vnode *vp)
793{
794	struct bufobj *bo;
795
796	CTR1(KTR_VFS, "vdestroy vp %p", vp);
797	mtx_lock(&vnode_free_list_mtx);
798	numvnodes--;
799	mtx_unlock(&vnode_free_list_mtx);
800	bo = &vp->v_bufobj;
801	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
802	    ("cleaned vnode still on the free list."));
803	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
804	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
805	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
806	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
807	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
808	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
809	VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL"));
810	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
811	VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL"));
812	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
813	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
814	VI_UNLOCK(vp);
815#ifdef MAC
816	mac_vnode_destroy(vp);
817#endif
818	if (vp->v_pollinfo != NULL) {
819		knlist_destroy(&vp->v_pollinfo->vpi_selinfo.si_note);
820		mtx_destroy(&vp->v_pollinfo->vpi_lock);
821		uma_zfree(vnodepoll_zone, vp->v_pollinfo);
822	}
823#ifdef INVARIANTS
824	/* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */
825	vp->v_op = NULL;
826#endif
827	lockdestroy(vp->v_vnlock);
828	mtx_destroy(&vp->v_interlock);
829	mtx_destroy(BO_MTX(bo));
830	uma_zfree(vnode_zone, vp);
831}
832
833/*
834 * Try to recycle a freed vnode.  We abort if anyone picks up a reference
835 * before we actually vgone().  This function must be called with the vnode
836 * held to prevent the vnode from being returned to the free list midway
837 * through vgone().
838 */
839static int
840vtryrecycle(struct vnode *vp)
841{
842	struct mount *vnmp;
843
844	CTR1(KTR_VFS, "vtryrecycle: trying vp %p", vp);
845	VNASSERT(vp->v_holdcnt, vp,
846	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
847	/*
848	 * This vnode may be found and locked via some other list; if so, we
849	 * can't recycle it yet.
850	 */
851	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
852		return (EWOULDBLOCK);
853	/*
854	 * Don't recycle if its filesystem is being suspended.
855	 */
856	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
857		VOP_UNLOCK(vp, 0);
858		return (EBUSY);
859	}
860	/*
861	 * If we got this far, we need to acquire the interlock and see if
862	 * anyone picked up this vnode from another list.  If not, we will
863	 * mark it with DOOMED via vgonel() so that anyone who does find it
864	 * will skip over it.
865	 */
866	VI_LOCK(vp);
867	if (vp->v_usecount) {
868		VOP_UNLOCK(vp, LK_INTERLOCK);
869		vn_finished_write(vnmp);
870		return (EBUSY);
871	}
872	if ((vp->v_iflag & VI_DOOMED) == 0)
873		vgonel(vp);
874	VOP_UNLOCK(vp, LK_INTERLOCK);
875	vn_finished_write(vnmp);
876	CTR1(KTR_VFS, "vtryrecycle: recycled vp %p", vp);
877	return (0);
878}
879
880/*
881 * Allocate and initialize a new vnode, recycling free vnodes as needed to stay within limits.
882 */
883int
884getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
885    struct vnode **vpp)
886{
887	struct vnode *vp = NULL;
888	struct bufobj *bo;
889
890	mtx_lock(&vnode_free_list_mtx);
891	/*
892	 * Lend our context to reclaim vnodes if they've exceeded the max.
893	 */
894	if (freevnodes > wantfreevnodes)
895		vnlru_free(1);
896	/*
897	 * Wait for available vnodes.
898	 */
899	if (numvnodes > desiredvnodes) {
900		if (mp != NULL && (mp->mnt_kern_flag & MNTK_SUSPEND)) {
901			 * The file system is being suspended; we cannot risk a
902			 * deadlock here, so allocate a new vnode anyway.
903			 * deadlock here, so allocate new vnode anyway.
904			 */
905			if (freevnodes > wantfreevnodes)
906				vnlru_free(freevnodes - wantfreevnodes);
907			goto alloc;
908		}
909		if (vnlruproc_sig == 0) {
910			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
911			wakeup(vnlruproc);
912		}
913		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
914		    "vlruwk", hz);
915#if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
916		if (numvnodes > desiredvnodes) {
917			mtx_unlock(&vnode_free_list_mtx);
918			return (ENFILE);
919		}
920#endif
921	}
922alloc:
923	numvnodes++;
924	mtx_unlock(&vnode_free_list_mtx);
925	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
926	/*
927	 * Setup locks.
928	 */
929	vp->v_vnlock = &vp->v_lock;
930	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
931	/*
932	 * By default, don't allow shared locks unless filesystems
933	 * opt-in.
934	 */
935	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE);
936	/*
937	 * Initialize bufobj.
938	 */
939	bo = &vp->v_bufobj;
940	bo->__bo_vnode = vp;
941	mtx_init(BO_MTX(bo), "bufobj interlock", NULL, MTX_DEF);
942	bo->bo_ops = &buf_ops_bio;
943	bo->bo_private = vp;
944	TAILQ_INIT(&bo->bo_clean.bv_hd);
945	TAILQ_INIT(&bo->bo_dirty.bv_hd);
946	/*
947	 * Initialize namecache.
948	 */
949	LIST_INIT(&vp->v_cache_src);
950	TAILQ_INIT(&vp->v_cache_dst);
951	/*
952	 * Finalize various vnode identity bits.
953	 */
954	vp->v_type = VNON;
955	vp->v_tag = tag;
956	vp->v_op = vops;
957	v_incr_usecount(vp);
958	vp->v_data = 0;
959#ifdef MAC
960	mac_vnode_init(vp);
961	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
962		mac_vnode_associate_singlelabel(mp, vp);
963	else if (mp == NULL)
964		printf("NULL mp in getnewvnode()\n");
965#endif
966	if (mp != NULL) {
967		bo->bo_bsize = mp->mnt_stat.f_iosize;
968		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
969			vp->v_vflag |= VV_NOKNOTE;
970	}
971
972	CTR2(KTR_VFS, "getnewvnode: mp %p vp %p", mp, vp);
973	*vpp = vp;
974	return (0);
975}
976
977/*
978 * Delete from old mount point vnode list, if on one.
979 */
980static void
981delmntque(struct vnode *vp)
982{
983	struct mount *mp;
984
985	mp = vp->v_mount;
986	if (mp == NULL)
987		return;
988	MNT_ILOCK(mp);
989	vp->v_mount = NULL;
990	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
991		("bad mount point vnode list size"));
992	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
993	mp->mnt_nvnodelistsize--;
994	MNT_REL(mp);
995	MNT_IUNLOCK(mp);
996}
997
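/*
 * Default destructor for insmntque(): called when the vnode cannot be
 * inserted on the mount's vnode list.  Dissociates the filesystem data,
 * switches the vnode to dead_vnodeops and disposes of it.
 */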
998static void
999insmntque_stddtr(struct vnode *vp, void *dtr_arg)
1000{
1001
1002	vp->v_data = NULL;
1003	vp->v_op = &dead_vnodeops;
1004	/* XXX A non-MPSAFE filesystem may still call insmntque() with the
1005	   vnode unlocked. */
1006	if (!VOP_ISLOCKED(vp))
1007		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1008	vgone(vp);
1009	vput(vp);
1010}
1011
1012/*
1013 * Insert into list of vnodes for the new mount point, if available.
1014 */
1015int
1016insmntque1(struct vnode *vp, struct mount *mp,
1017	void (*dtr)(struct vnode *, void *), void *dtr_arg)
1018{
1019
1020	KASSERT(vp->v_mount == NULL,
1021		("insmntque: vnode already on per mount vnode list"));
1022	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
1023	MNT_ILOCK(mp);
1024	if ((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
1025	    mp->mnt_nvnodelistsize == 0) {
1026		MNT_IUNLOCK(mp);
1027		if (dtr != NULL)
1028			dtr(vp, dtr_arg);
1029		return (EBUSY);
1030	}
1031	vp->v_mount = mp;
1032	MNT_REF(mp);
1033	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1034	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
1035		("neg mount point vnode list size"));
1036	mp->mnt_nvnodelistsize++;
1037	MNT_IUNLOCK(mp);
1038	return (0);
1039}
1040
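/*
 * Convenience wrapper around insmntque1() that uses the standard
 * destructor to dispose of the vnode on failure.
 */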
1041int
1042insmntque(struct vnode *vp, struct mount *mp)
1043{
1044
1045	return (insmntque1(vp, mp, insmntque_stddtr, NULL));
1046}
1047
1048/*
1049 * Flush out and invalidate all buffers associated with a bufobj.
1050 * Called with the underlying object locked.
1051 */
1052int
1053bufobj_invalbuf(struct bufobj *bo, int flags, struct thread *td, int slpflag,
1054    int slptimeo)
1055{
1056	int error;
1057
1058	BO_LOCK(bo);
1059	if (flags & V_SAVE) {
1060		error = bufobj_wwait(bo, slpflag, slptimeo);
1061		if (error) {
1062			BO_UNLOCK(bo);
1063			return (error);
1064		}
1065		if (bo->bo_dirty.bv_cnt > 0) {
1066			BO_UNLOCK(bo);
1067			if ((error = BO_SYNC(bo, MNT_WAIT, td)) != 0)
1068				return (error);
1069			/*
1070			 * XXX We could save a lock/unlock if this was only
1071			 * enabled under INVARIANTS
1072			 */
1073			BO_LOCK(bo);
1074			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
1075				panic("vinvalbuf: dirty bufs");
1076		}
1077	}
1078	/*
1079	 * If you alter this loop please notice that interlock is dropped and
1080	 * reacquired in flushbuflist.  Special care is needed to ensure that
1081	 * no race conditions occur from this.
1082	 */
1083	do {
1084		error = flushbuflist(&bo->bo_clean,
1085		    flags, bo, slpflag, slptimeo);
1086		if (error == 0)
1087			error = flushbuflist(&bo->bo_dirty,
1088			    flags, bo, slpflag, slptimeo);
1089		if (error != 0 && error != EAGAIN) {
1090			BO_UNLOCK(bo);
1091			return (error);
1092		}
1093	} while (error != 0);
1094
1095	/*
1096	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
1097	 * have write I/O in-progress but if there is a VM object then the
1098	 * VM object can also have read-I/O in-progress.
1099	 */
1100	do {
1101		bufobj_wwait(bo, 0, 0);
1102		BO_UNLOCK(bo);
1103		if (bo->bo_object != NULL) {
1104			VM_OBJECT_LOCK(bo->bo_object);
1105			vm_object_pip_wait(bo->bo_object, "bovlbx");
1106			VM_OBJECT_UNLOCK(bo->bo_object);
1107		}
1108		BO_LOCK(bo);
1109	} while (bo->bo_numoutput > 0);
1110	BO_UNLOCK(bo);
1111
1112	/*
1113	 * Destroy the copy in the VM cache, too.
1114	 */
1115	if (bo->bo_object != NULL) {
1116		VM_OBJECT_LOCK(bo->bo_object);
1117		vm_object_page_remove(bo->bo_object, 0, 0,
1118			(flags & V_SAVE) ? TRUE : FALSE);
1119		VM_OBJECT_UNLOCK(bo->bo_object);
1120	}
1121
1122#ifdef INVARIANTS
1123	BO_LOCK(bo);
1124	if ((flags & (V_ALT | V_NORMAL)) == 0 &&
1125	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
1126		panic("vinvalbuf: flush failed");
1127	BO_UNLOCK(bo);
1128#endif
1129	return (0);
1130}
1131
1132/*
1133 * Flush out and invalidate all buffers associated with a vnode.
1134 * Called with the underlying object locked.
1135 */
1136int
1137vinvalbuf(struct vnode *vp, int flags, struct thread *td, int slpflag,
1138    int slptimeo)
1139{
1140
1141	CTR2(KTR_VFS, "vinvalbuf vp %p flags %d", vp, flags);
1142	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
1143	return (bufobj_invalbuf(&vp->v_bufobj, flags, td, slpflag, slptimeo));
1144}
1145
1146/*
1147 * Flush out buffers on the specified list.
1148 *
1149 */
1150static int
1151flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
1152    int slptimeo)
1153{
1154	struct buf *bp, *nbp;
1155	int retval, error;
1156	daddr_t lblkno;
1157	b_xflags_t xflags;
1158
1159	ASSERT_BO_LOCKED(bo);
1160
1161	retval = 0;
1162	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
1163		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1164		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1165			continue;
1166		}
1167		lblkno = 0;
1168		xflags = 0;
1169		if (nbp != NULL) {
1170			lblkno = nbp->b_lblkno;
1171			xflags = nbp->b_xflags &
1172				(BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN);
1173		}
1174		retval = EAGAIN;
1175		error = BUF_TIMELOCK(bp,
1176		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo),
1177		    "flushbuf", slpflag, slptimeo);
1178		if (error) {
1179			BO_LOCK(bo);
1180			return (error != ENOLCK ? error : EAGAIN);
1181		}
1182		KASSERT(bp->b_bufobj == bo,
1183		    ("bp %p wrong b_bufobj %p should be %p",
1184		    bp, bp->b_bufobj, bo));
1185		if (bp->b_bufobj != bo) {	/* XXX: necessary ? */
1186			BUF_UNLOCK(bp);
1187			BO_LOCK(bo);
1188			return (EAGAIN);
1189		}
1190		/*
1191		 * XXX Since there are no node locks for NFS, I
1192		 * believe there is a slight chance that a delayed
1193		 * write will occur while sleeping just above, so
1194		 * check for it.
1195		 */
1196		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1197		    (flags & V_SAVE)) {
1198			bremfree(bp);
1199			bp->b_flags |= B_ASYNC;
1200			bwrite(bp);
1201			BO_LOCK(bo);
1202			return (EAGAIN);	/* XXX: why not loop ? */
1203		}
1204		bremfree(bp);
1205		bp->b_flags |= (B_INVAL | B_RELBUF);
1206		bp->b_flags &= ~B_ASYNC;
1207		brelse(bp);
1208		BO_LOCK(bo);
1209		if (nbp != NULL &&
1210		    (nbp->b_bufobj != bo ||
1211		     nbp->b_lblkno != lblkno ||
1212		     (nbp->b_xflags &
1213		      (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags))
1214			break;			/* nbp invalid */
1215	}
1216	return (retval);
1217}
1218
1219/*
1220 * Truncate a file's buffers and pages to a specified length.  This
1221 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1222 * sync activity.
1223 */
1224int
1225vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td,
1226    off_t length, int blksize)
1227{
1228	struct buf *bp, *nbp;
1229	int anyfreed;
1230	int trunclbn;
1231	struct bufobj *bo;
1232
1233	CTR2(KTR_VFS, "vtruncbuf vp %p length %jd", vp, length);
1234	/*
1235	 * Round up to the *next* lbn.
1236	 */
1237	trunclbn = (length + blksize - 1) / blksize;
1238
1239	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1240restart:
1241	bo = &vp->v_bufobj;
1242	BO_LOCK(bo);
1243	anyfreed = 1;
1244	for (;anyfreed;) {
1245		anyfreed = 0;
1246		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
1247			if (bp->b_lblkno < trunclbn)
1248				continue;
1249			if (BUF_LOCK(bp,
1250			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1251			    BO_MTX(bo)) == ENOLCK)
1252				goto restart;
1253
1254			bremfree(bp);
1255			bp->b_flags |= (B_INVAL | B_RELBUF);
1256			bp->b_flags &= ~B_ASYNC;
1257			brelse(bp);
1258			anyfreed = 1;
1259
1260			if (nbp != NULL &&
1261			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1262			    (nbp->b_vp != vp) ||
1263			    (nbp->b_flags & B_DELWRI))) {
1264				goto restart;
1265			}
1266			BO_LOCK(bo);
1267		}
1268
1269		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1270			if (bp->b_lblkno < trunclbn)
1271				continue;
1272			if (BUF_LOCK(bp,
1273			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1274			    BO_MTX(bo)) == ENOLCK)
1275				goto restart;
1276			bremfree(bp);
1277			bp->b_flags |= (B_INVAL | B_RELBUF);
1278			bp->b_flags &= ~B_ASYNC;
1279			brelse(bp);
1280			anyfreed = 1;
1281			if (nbp != NULL &&
1282			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1283			    (nbp->b_vp != vp) ||
1284			    (nbp->b_flags & B_DELWRI) == 0)) {
1285				goto restart;
1286			}
1287			BO_LOCK(bo);
1288		}
1289	}
1290
1291	if (length > 0) {
1292restartsync:
1293		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1294			if (bp->b_lblkno > 0)
1295				continue;
1296			/*
1297			 * Since we hold the vnode lock this should only
1298			 * fail if we're racing with the buf daemon.
1299			 */
1300			if (BUF_LOCK(bp,
1301			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1302			    BO_MTX(bo)) == ENOLCK) {
1303				goto restart;
1304			}
1305			VNASSERT((bp->b_flags & B_DELWRI), vp,
1306			    ("buf(%p) on dirty queue without DELWRI", bp));
1307
1308			bremfree(bp);
1309			bawrite(bp);
1310			BO_LOCK(bo);
1311			goto restartsync;
1312		}
1313	}
1314
1315	bufobj_wwait(bo, 0, 0);
1316	BO_UNLOCK(bo);
1317	vnode_pager_setsize(vp, length);
1318
1319	return (0);
1320}
1321
1322/*
1323 * buf_splay() - splay tree core for the clean/dirty list of buffers in
1324 * 		 a vnode.
1325 *
1326 *	NOTE: We have to deal with the special case of a background bitmap
1327 *	buffer, a situation where two buffers will have the same logical
1328 *	block offset.  We want (1) only the foreground buffer to be accessed
1329 *	in a lookup and (2) must differentiate between the foreground and
1330 *	background buffer in the splay tree algorithm because the splay
1331 *	tree cannot normally handle multiple entities with the same 'index'.
1332 *	We accomplish this by adding differentiating flags to the splay tree's
1333 *	numerical domain.
1334 */
1335static
1336struct buf *
1337buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
1338{
1339	struct buf dummy;
1340	struct buf *lefttreemax, *righttreemin, *y;
1341
1342	if (root == NULL)
1343		return (NULL);
1344	lefttreemax = righttreemin = &dummy;
1345	for (;;) {
1346		if (lblkno < root->b_lblkno ||
1347		    (lblkno == root->b_lblkno &&
1348		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1349			if ((y = root->b_left) == NULL)
1350				break;
1351			if (lblkno < y->b_lblkno) {
1352				/* Rotate right. */
1353				root->b_left = y->b_right;
1354				y->b_right = root;
1355				root = y;
1356				if ((y = root->b_left) == NULL)
1357					break;
1358			}
1359			/* Link into the new root's right tree. */
1360			righttreemin->b_left = root;
1361			righttreemin = root;
1362		} else if (lblkno > root->b_lblkno ||
1363		    (lblkno == root->b_lblkno &&
1364		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
1365			if ((y = root->b_right) == NULL)
1366				break;
1367			if (lblkno > y->b_lblkno) {
1368				/* Rotate left. */
1369				root->b_right = y->b_left;
1370				y->b_left = root;
1371				root = y;
1372				if ((y = root->b_right) == NULL)
1373					break;
1374			}
1375			/* Link into the new root's left tree. */
1376			lefttreemax->b_right = root;
1377			lefttreemax = root;
1378		} else {
1379			break;
1380		}
1381		root = y;
1382	}
1383	/* Assemble the new root. */
1384	lefttreemax->b_right = root->b_left;
1385	righttreemin->b_left = root->b_right;
1386	root->b_left = dummy.b_right;
1387	root->b_right = dummy.b_left;
1388	return (root);
1389}
1390
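/*
 * Remove a buffer from the splay tree and tail queue of the clean or
 * dirty list of its bufobj and clear its BX_VNDIRTY/BX_VNCLEAN flag.
 */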
1391static void
1392buf_vlist_remove(struct buf *bp)
1393{
1394	struct buf *root;
1395	struct bufv *bv;
1396
1397	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1398	ASSERT_BO_LOCKED(bp->b_bufobj);
1399	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
1400	    (BX_VNDIRTY|BX_VNCLEAN),
1401	    ("buf_vlist_remove: Buf %p is on two lists", bp));
1402	if (bp->b_xflags & BX_VNDIRTY)
1403		bv = &bp->b_bufobj->bo_dirty;
1404	else
1405		bv = &bp->b_bufobj->bo_clean;
1406	if (bp != bv->bv_root) {
1407		root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
1408		KASSERT(root == bp, ("splay lookup failed in remove"));
1409	}
1410	if (bp->b_left == NULL) {
1411		root = bp->b_right;
1412	} else {
1413		root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
1414		root->b_right = bp->b_right;
1415	}
1416	bv->bv_root = root;
1417	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
1418	bv->bv_cnt--;
1419	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1420}
1421
1422/*
1423 * Add the buffer to the sorted clean or dirty block list using a
1424 * splay tree algorithm.
1425 *
1426 * NOTE: xflags is passed as a constant, optimizing this inline function!
1427 */
1428static void
1429buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
1430{
1431	struct buf *root;
1432	struct bufv *bv;
1433
1434	ASSERT_BO_LOCKED(bo);
1435	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1436	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
1437	bp->b_xflags |= xflags;
1438	if (xflags & BX_VNDIRTY)
1439		bv = &bo->bo_dirty;
1440	else
1441		bv = &bo->bo_clean;
1442
1443	root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
1444	if (root == NULL) {
1445		bp->b_left = NULL;
1446		bp->b_right = NULL;
1447		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
1448	} else if (bp->b_lblkno < root->b_lblkno ||
1449	    (bp->b_lblkno == root->b_lblkno &&
1450	    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1451		bp->b_left = root->b_left;
1452		bp->b_right = root;
1453		root->b_left = NULL;
1454		TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
1455	} else {
1456		bp->b_right = root->b_right;
1457		bp->b_left = root;
1458		root->b_right = NULL;
1459		TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs);
1460	}
1461	bv->bv_cnt++;
1462	bv->bv_root = bp;
1463}
1464
1465/*
1466 * Lookup a buffer using the splay tree.  Note that we specifically avoid
1467 * shadow buffers used in background bitmap writes.
1468 *
1469 * This code isn't quite as efficient as it could be because we are maintaining
1470 * two sorted lists and do not know which list the block resides in.
1471 *
1472 * During a "make buildworld" the desired buffer is found at one of
1473 * the roots more than 60% of the time.  Thus, checking both roots
1474 * before performing either splay eliminates unnecessary splays on the
1475 * first tree splayed.
1476 */
1477struct buf *
1478gbincore(struct bufobj *bo, daddr_t lblkno)
1479{
1480	struct buf *bp;
1481
1482	ASSERT_BO_LOCKED(bo);
1483	if ((bp = bo->bo_clean.bv_root) != NULL &&
1484	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1485		return (bp);
1486	if ((bp = bo->bo_dirty.bv_root) != NULL &&
1487	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1488		return (bp);
1489	if ((bp = bo->bo_clean.bv_root) != NULL) {
1490		bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp);
1491		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1492			return (bp);
1493	}
1494	if ((bp = bo->bo_dirty.bv_root) != NULL) {
1495		bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp);
1496		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1497			return (bp);
1498	}
1499	return (NULL);
1500}
1501
1502/*
1503 * Associate a buffer with a vnode.
1504 */
1505void
1506bgetvp(struct vnode *vp, struct buf *bp)
1507{
1508	struct bufobj *bo;
1509
1510	bo = &vp->v_bufobj;
1511	ASSERT_BO_LOCKED(bo);
1512	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
1513
1514	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
1515	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
1516	    ("bgetvp: bp already attached! %p", bp));
1517
1518	vhold(vp);
1519	if (VFS_NEEDSGIANT(vp->v_mount) || bo->bo_flag & BO_NEEDSGIANT)
1520		bp->b_flags |= B_NEEDSGIANT;
1521	bp->b_vp = vp;
1522	bp->b_bufobj = bo;
1523	/*
1524	 * Insert onto list for new vnode.
1525	 */
1526	buf_vlist_add(bp, bo, BX_VNCLEAN);
1527}
1528
1529/*
1530 * Disassociate a buffer from a vnode.
1531 */
1532void
1533brelvp(struct buf *bp)
1534{
1535	struct bufobj *bo;
1536	struct vnode *vp;
1537
1538	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1539	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1540
1541	/*
1542	 * Delete from old vnode list, if on one.
1543	 */
1544	vp = bp->b_vp;		/* XXX */
1545	bo = bp->b_bufobj;
1546	BO_LOCK(bo);
1547	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1548		buf_vlist_remove(bp);
1549	else
1550		panic("brelvp: Buffer %p not on queue.", bp);
1551	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1552		bo->bo_flag &= ~BO_ONWORKLST;
1553		mtx_lock(&sync_mtx);
1554		LIST_REMOVE(bo, bo_synclist);
1555		syncer_worklist_len--;
1556		mtx_unlock(&sync_mtx);
1557	}
1558	bp->b_flags &= ~B_NEEDSGIANT;
1559	bp->b_vp = NULL;
1560	bp->b_bufobj = NULL;
1561	BO_UNLOCK(bo);
1562	vdrop(vp);
1563}
1564
1565/*
1566 * Add an item to the syncer work queue.
1567 */
1568static void
1569vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
1570{
1571	int slot;
1572
1573	ASSERT_BO_LOCKED(bo);
1574
1575	mtx_lock(&sync_mtx);
1576	if (bo->bo_flag & BO_ONWORKLST)
1577		LIST_REMOVE(bo, bo_synclist);
1578	else {
1579		bo->bo_flag |= BO_ONWORKLST;
1580		syncer_worklist_len++;
1581	}
1582
1583	if (delay > syncer_maxdelay - 2)
1584		delay = syncer_maxdelay - 2;
1585	slot = (syncer_delayno + delay) & syncer_mask;
1586
1587	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
1588	mtx_unlock(&sync_mtx);
1589}
1590
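/*
 * Sysctl handler reporting the number of non-syncer vnodes currently on
 * the syncer worklist (vfs.worklist_len).
 */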
1591static int
1592sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
1593{
1594	int error, len;
1595
1596	mtx_lock(&sync_mtx);
1597	len = syncer_worklist_len - sync_vnode_count;
1598	mtx_unlock(&sync_mtx);
1599	error = SYSCTL_OUT(req, &len, sizeof(len));
1600	return (error);
1601}
1602
1603SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
1604    sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
1605
1606static struct proc *updateproc;
1607static void sched_sync(void);
1608static struct kproc_desc up_kp = {
1609	"syncer",
1610	sched_sync,
1611	&updateproc
1612};
1613SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
1614
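/*
 * Sync the first bufobj on the given worklist slot.  Returns zero when
 * there is nothing more to do for this entry and nonzero when the caller
 * should move the bufobj to the next worklist slot and retry later.
 */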
1615static int
1616sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
1617{
1618	struct vnode *vp;
1619	struct mount *mp;
1620	int vfslocked;
1621
1622	vfslocked = 0;
1623restart:
1624	*bo = LIST_FIRST(slp);
1625	if (*bo == NULL) {
1626		VFS_UNLOCK_GIANT(vfslocked);
1627		return (0);
1628	}
1629	vp = (*bo)->__bo_vnode;	/* XXX */
1630	if (VFS_NEEDSGIANT(vp->v_mount)) {
1631		if (!vfslocked) {
1632			vfslocked = 1;
1633			if (mtx_trylock(&Giant) == 0) {
1634				mtx_unlock(&sync_mtx);
1635				mtx_lock(&Giant);
1636				mtx_lock(&sync_mtx);
1637				goto restart;
1638			}
1639		}
1640	} else {
1641		VFS_UNLOCK_GIANT(vfslocked);
1642		vfslocked = 0;
1643	}
1644	if (VOP_ISLOCKED(vp) != 0) {
1645		VFS_UNLOCK_GIANT(vfslocked);
1646		return (1);
1647	}
1648	if (VI_TRYLOCK(vp) == 0) {
1649		VFS_UNLOCK_GIANT(vfslocked);
1650		return (1);
1651	}
1652	/*
1653	 * We use vhold in case the vnode does not
1654	 * successfully sync.  vhold prevents the vnode from
1655	 * going away when we unlock the sync_mtx so that
1656	 * we can acquire the vnode interlock.
1657	 */
1658	vholdl(vp);
1659	mtx_unlock(&sync_mtx);
1660	VI_UNLOCK(vp);
1661	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1662		vdrop(vp);
1663		VFS_UNLOCK_GIANT(vfslocked);
1664		mtx_lock(&sync_mtx);
1665		return (*bo == LIST_FIRST(slp));
1666	}
1667	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1668	(void) VOP_FSYNC(vp, MNT_LAZY, td);
1669	VOP_UNLOCK(vp, 0);
1670	vn_finished_write(mp);
1671	BO_LOCK(*bo);
1672	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
1673		/*
1674		 * Put us back on the worklist.  The worklist
1675		 * routine will remove us from our current
1676		 * position and then add us back in at a later
1677		 * position.
1678		 */
1679		vn_syncer_add_to_worklist(*bo, syncdelay);
1680	}
1681	BO_UNLOCK(*bo);
1682	vdrop(vp);
1683	VFS_UNLOCK_GIANT(vfslocked);
1684	mtx_lock(&sync_mtx);
1685	return (0);
1686}
1687
1688/*
1689 * System filesystem synchronizer daemon.
1690 */
1691static void
1692sched_sync(void)
1693{
1694	struct synclist *next;
1695	struct synclist *slp;
1696	struct bufobj *bo;
1697	long starttime;
1698	struct thread *td = curthread;
1699	static int dummychan;
1700	int last_work_seen;
1701	int net_worklist_len;
1702	int syncer_final_iter;
1703	int first_printf;
1704	int error;
1705
1706	last_work_seen = 0;
1707	syncer_final_iter = 0;
1708	first_printf = 1;
1709	syncer_state = SYNCER_RUNNING;
1710	starttime = time_uptime;
1711	td->td_pflags |= TDP_NORUNNINGBUF;
1712
1713	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
1714	    SHUTDOWN_PRI_LAST);
1715
1716	mtx_lock(&sync_mtx);
1717	for (;;) {
1718		if (syncer_state == SYNCER_FINAL_DELAY &&
1719		    syncer_final_iter == 0) {
1720			mtx_unlock(&sync_mtx);
1721			kproc_suspend_check(td->td_proc);
1722			mtx_lock(&sync_mtx);
1723		}
1724		net_worklist_len = syncer_worklist_len - sync_vnode_count;
1725		if (syncer_state != SYNCER_RUNNING &&
1726		    starttime != time_uptime) {
1727			if (first_printf) {
1728				printf("\nSyncing disks, vnodes remaining...");
1729				first_printf = 0;
1730			}
1731			printf("%d ", net_worklist_len);
1732		}
1733		starttime = time_uptime;
1734
1735		/*
1736		 * Push files whose dirty time has expired.  Be careful
1737		 * of interrupt race on slp queue.
1738		 *
1739		 * Skip over empty worklist slots when shutting down.
1740		 */
1741		do {
1742			slp = &syncer_workitem_pending[syncer_delayno];
1743			syncer_delayno += 1;
1744			if (syncer_delayno == syncer_maxdelay)
1745				syncer_delayno = 0;
1746			next = &syncer_workitem_pending[syncer_delayno];
1747			/*
1748			 * If the worklist has wrapped since it
1749			 * was emptied of all but syncer vnodes,
1750			 * switch to the FINAL_DELAY state and run
1751			 * for one more second.
1752			 */
1753			if (syncer_state == SYNCER_SHUTTING_DOWN &&
1754			    net_worklist_len == 0 &&
1755			    last_work_seen == syncer_delayno) {
1756				syncer_state = SYNCER_FINAL_DELAY;
1757				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
1758			}
1759		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
1760		    syncer_worklist_len > 0);
1761
1762		/*
1763		 * Keep track of the last time there was anything
1764		 * on the worklist other than syncer vnodes.
1765		 * Return to the SHUTTING_DOWN state if any
1766		 * new work appears.
1767		 */
1768		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
1769			last_work_seen = syncer_delayno;
1770		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
1771			syncer_state = SYNCER_SHUTTING_DOWN;
1772		while (!LIST_EMPTY(slp)) {
1773			error = sync_vnode(slp, &bo, td);
1774			if (error == 1) {
1775				LIST_REMOVE(bo, bo_synclist);
1776				LIST_INSERT_HEAD(next, bo, bo_synclist);
1777				continue;
1778			}
1779		}
1780		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
1781			syncer_final_iter--;
1782		/*
1783		 * The variable rushjob allows the kernel to speed up the
1784		 * processing of the filesystem syncer process. A rushjob
1785		 * value of N tells the filesystem syncer to process the next
1786		 * N seconds worth of work on its queue ASAP. Currently rushjob
1787		 * is used by the soft update code to speed up the filesystem
1788		 * syncer process when the incore state is getting so far
1789		 * ahead of the disk that the kernel memory pool is being
1790		 * threatened with exhaustion.
1791		 */
1792		if (rushjob > 0) {
1793			rushjob -= 1;
1794			continue;
1795		}
1796		/*
1797		 * Just sleep for a short period of time between
1798		 * iterations when shutting down to allow some I/O
1799		 * to happen.
1800		 *
1801		 * If it has taken us less than a second to process the
1802		 * current work, then wait. Otherwise start right over
1803		 * again. We can still lose time if any single round
1804		 * takes more than two seconds, but it does not really
1805		 * matter as we are just trying to generally pace the
1806		 * filesystem activity.
1807		 */
1808		if (syncer_state != SYNCER_RUNNING)
1809			msleep(&dummychan, &sync_mtx, PPAUSE, "syncfnl",
1810			    hz / SYNCER_SHUTDOWN_SPEEDUP);
1811		else if (time_uptime == starttime)
1812			msleep(&lbolt, &sync_mtx, PPAUSE, "syncer", 0);
1813	}
1814}
1815
1816/*
1817 * Request the syncer daemon to speed up its work.
1818 * We never push it to speed up more than half of its
1819 * normal turn time, otherwise it could take over the cpu.
1820 */
1821int
1822speedup_syncer(void)
1823{
1824	struct thread *td;
1825	int ret = 0;
1826
1827	td = FIRST_THREAD_IN_PROC(updateproc);
1828	mtx_lock(&sync_mtx);
1829	if (rushjob < syncdelay / 2) {
1830		rushjob += 1;
1831		stat_rush_requests += 1;
1832		ret = 1;
1833	}
1834	mtx_unlock(&sync_mtx);
1835	sleepq_remove(td, &lbolt);
1836	return (ret);
1837}
1838
1839/*
1840 * Tell the syncer to speed up its work and run through its work
1841 * list several times, then tell it to shut down.
1842 */
1843static void
1844syncer_shutdown(void *arg, int howto)
1845{
1846	struct thread *td;
1847
1848	if (howto & RB_NOSYNC)
1849		return;
1850	td = FIRST_THREAD_IN_PROC(updateproc);
1851	mtx_lock(&sync_mtx);
1852	syncer_state = SYNCER_SHUTTING_DOWN;
1853	rushjob = 0;
1854	mtx_unlock(&sync_mtx);
1855	sleepq_remove(td, &lbolt);
1856	kproc_shutdown(arg, howto);
1857}
1858
1859/*
1860 * Reassign a buffer from one vnode to another.
1861 * Used to assign file specific control information
1862 * (indirect blocks) to the vnode to which they belong.
1863 */
1864void
1865reassignbuf(struct buf *bp)
1866{
1867	struct vnode *vp;
1868	struct bufobj *bo;
1869	int delay;
1870#ifdef INVARIANTS
1871	struct bufv *bv;
1872#endif
1873
1874	vp = bp->b_vp;
1875	bo = bp->b_bufobj;
1876	++reassignbufcalls;
1877
1878	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
1879	    bp, bp->b_vp, bp->b_flags);
1880	/*
1881	 * B_PAGING flagged buffers cannot be reassigned because their vp
1882	 * is not fully linked in.
1883	 */
1884	if (bp->b_flags & B_PAGING)
1885		panic("cannot reassign paging buffer");
1886
1887	/*
1888	 * Delete from old vnode list, if on one.
1889	 */
1890	BO_LOCK(bo);
1891	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1892		buf_vlist_remove(bp);
1893	else
1894		panic("reassignbuf: Buffer %p not on queue.", bp);
1895	/*
1896	 * If dirty, put on list of dirty buffers; otherwise insert onto list
1897	 * of clean buffers.
1898	 */
1899	if (bp->b_flags & B_DELWRI) {
1900		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
1901			switch (vp->v_type) {
1902			case VDIR:
1903				delay = dirdelay;
1904				break;
1905			case VCHR:
1906				delay = metadelay;
1907				break;
1908			default:
1909				delay = filedelay;
1910			}
1911			vn_syncer_add_to_worklist(bo, delay);
1912		}
1913		buf_vlist_add(bp, bo, BX_VNDIRTY);
1914	} else {
1915		buf_vlist_add(bp, bo, BX_VNCLEAN);
1916
1917		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1918			mtx_lock(&sync_mtx);
1919			LIST_REMOVE(bo, bo_synclist);
1920			syncer_worklist_len--;
1921			mtx_unlock(&sync_mtx);
1922			bo->bo_flag &= ~BO_ONWORKLST;
1923		}
1924	}
1925#ifdef INVARIANTS
1926	bv = &bo->bo_clean;
1927	bp = TAILQ_FIRST(&bv->bv_hd);
1928	KASSERT(bp == NULL || bp->b_bufobj == bo,
1929	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
1930	bp = TAILQ_LAST(&bv->bv_hd, buflists);
1931	KASSERT(bp == NULL || bp->b_bufobj == bo,
1932	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
1933	bv = &bo->bo_dirty;
1934	bp = TAILQ_FIRST(&bv->bv_hd);
1935	KASSERT(bp == NULL || bp->b_bufobj == bo,
1936	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
1937	bp = TAILQ_LAST(&bv->bv_hd, buflists);
1938	KASSERT(bp == NULL || bp->b_bufobj == bo,
1939	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
1940#endif
1941	BO_UNLOCK(bo);
1942}
1943
1944/*
1945 * Increment the use and hold counts on the vnode, taking care to reference
1946 * the driver's usecount if this is a chardev.  The vholdl() will remove
1947 * the vnode from the free list if it is presently free.  Requires the
1948 * vnode interlock and returns with it held.
1949 */
1950static void
1951v_incr_usecount(struct vnode *vp)
1952{
1953
1954	CTR3(KTR_VFS, "v_incr_usecount: vp %p holdcnt %d usecount %d\n",
1955	    vp, vp->v_holdcnt, vp->v_usecount);
1956	vp->v_usecount++;
1957	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
1958		dev_lock();
1959		vp->v_rdev->si_usecount++;
1960		dev_unlock();
1961	}
1962	vholdl(vp);
1963}
1964
1965/*
1966 * Turn a holdcnt into a use+holdcnt such that only one call to
1967 * v_decr_usecount is needed.
1968 */
1969static void
1970v_upgrade_usecount(struct vnode *vp)
1971{
1972
1973	CTR3(KTR_VFS, "v_upgrade_usecount: vp %p holdcnt %d usecount %d\n",
1974	    vp, vp->v_holdcnt, vp->v_usecount);
1975	vp->v_usecount++;
1976	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
1977		dev_lock();
1978		vp->v_rdev->si_usecount++;
1979		dev_unlock();
1980	}
1981}
1982
1983/*
1984 * Decrement the vnode use and hold count along with the driver's usecount
1985 * if this is a chardev.  The vdropl() below releases the vnode interlock
1986 * as it may free the vnode.
1987 */
1988static void
1989v_decr_usecount(struct vnode *vp)
1990{
1991
1992	CTR3(KTR_VFS, "v_decr_usecount: vp %p holdcnt %d usecount %d\n",
1993	    vp, vp->v_holdcnt, vp->v_usecount);
1994	ASSERT_VI_LOCKED(vp, __FUNCTION__);
1995	VNASSERT(vp->v_usecount > 0, vp,
1996	    ("v_decr_usecount: negative usecount"));
1997	vp->v_usecount--;
1998	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
1999		dev_lock();
2000		vp->v_rdev->si_usecount--;
2001		dev_unlock();
2002	}
2003	vdropl(vp);
2004}
2005
2006/*
2007 * Decrement only the use count and driver use count.  This is intended to
2008 * be paired with a follow-on vdropl() to release the remaining hold count.
2009 * In this way we may vgone() a vnode with a 0 usecount without risk of
2010 * having it end up on a free list because the hold count is kept above 0.
2011 */
2012static void
2013v_decr_useonly(struct vnode *vp)
2014{
2015
2016	CTR3(KTR_VFS, "v_decr_useonly: vp %p holdcnt %d usecount %d\n",
2017	    vp, vp->v_holdcnt, vp->v_usecount);
2018	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2019	VNASSERT(vp->v_usecount > 0, vp,
2020	    ("v_decr_useonly: negative usecount"));
2021	vp->v_usecount--;
2022	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2023		dev_lock();
2024		vp->v_rdev->si_usecount--;
2025		dev_unlock();
2026	}
2027}
2028
2029/*
2030 * Grab a particular vnode from the free list, increment its
2031 * reference count and lock it.  VI_DOOMED is set if the vnode
2032 * is being destroyed.  Only callers who specify LK_RETRY will
2033 * see doomed vnodes.  If inactive processing was delayed in
2034 * vput, try to do it here.
2035 */
2036int
2037vget(struct vnode *vp, int flags, struct thread *td)
2038{
2039	int error;
2040
2041	error = 0;
2042	VFS_ASSERT_GIANT(vp->v_mount);
2043	if ((flags & LK_INTERLOCK) == 0)
2044		VI_LOCK(vp);
2045	vholdl(vp);
2046	if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) {
2047		vdrop(vp);
2048		return (error);
2049	}
2050	if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
2051		panic("vget: vn_lock failed to return ENOENT\n");
2052	VI_LOCK(vp);
2053	/* Upgrade our holdcnt to a usecount. */
2054	v_upgrade_usecount(vp);
2055	/*
2056	 * We don't guarantee that any particular close will
2057	 * trigger inactive processing so just make a best effort
2058	 * here at preventing a reference to a removed file.  If
2059	 * we don't succeed no harm is done.
2060	 */
2061	if (vp->v_iflag & VI_OWEINACT) {
2062		if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
2063		    (flags & LK_NOWAIT) == 0)
2064			vinactive(vp, td);
2065		vp->v_iflag &= ~VI_OWEINACT;
2066	}
2067	VI_UNLOCK(vp);
2068	return (0);
2069}
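
/*
 * Illustrative sketch: the usual consumer pattern pairs vget() with
 * vput().  The wrapper below is hypothetical; without LK_RETRY a doomed
 * vnode is reported as an error rather than handed back locked.
 */
static int
example_use_vnode(struct vnode *vp, struct thread *td)
{
	int error;

	error = vget(vp, LK_EXCLUSIVE, td);
	if (error != 0)
		return (error);
	/* ... operate on the locked, referenced vnode ... */
	vput(vp);		/* drops the lock and the use reference */
	return (0);
}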
2070
2071/*
2072 * Increase the reference count of a vnode.
2073 */
2074void
2075vref(struct vnode *vp)
2076{
2077
2078	VI_LOCK(vp);
2079	v_incr_usecount(vp);
2080	VI_UNLOCK(vp);
2081}
2082
2083/*
2084 * Return reference count of a vnode.
2085 *
2086 * The results of this call are only guaranteed when some mechanism other
2087 * than the VI lock is used to stop other processes from gaining references
2088 * to the vnode.  This may be the case if the caller holds the only reference.
2089 * This is also useful when stale data is acceptable as race conditions may
2090 * be accounted for by some other means.
2091 */
2092int
2093vrefcnt(struct vnode *vp)
2094{
2095	int usecnt;
2096
2097	VI_LOCK(vp);
2098	usecnt = vp->v_usecount;
2099	VI_UNLOCK(vp);
2100
2101	return (usecnt);
2102}
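
/*
 * Illustrative sketch: per the comment above, the snapshot is only
 * meaningful when the caller already prevents new references, e.g. when
 * it believes it holds the sole use reference.  The helper name is
 * hypothetical.
 */
static int
example_is_sole_user(struct vnode *vp)
{

	return (vrefcnt(vp) == 1);
}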
2103
2104
2105/*
2106 * Vnode put/release.
2107 * If count drops to zero, call inactive routine and return to freelist.
2108 */
2109void
2110vrele(struct vnode *vp)
2111{
2112	struct thread *td = curthread;	/* XXX */
2113
2114	KASSERT(vp != NULL, ("vrele: null vp"));
2115	VFS_ASSERT_GIANT(vp->v_mount);
2116
2117	VI_LOCK(vp);
2118
2119	/* Skip this v_writecount check if we're going to panic below. */
2120	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
2121	    ("vrele: missed vn_close"));
2122
2123	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2124	    vp->v_usecount == 1)) {
2125		v_decr_usecount(vp);
2126		return;
2127	}
2128	if (vp->v_usecount != 1) {
2129#ifdef DIAGNOSTIC
2130		vprint("vrele: negative ref count", vp);
2131#endif
2132		VI_UNLOCK(vp);
2133		panic("vrele: negative ref cnt");
2134	}
2135	/*
2136	 * We want to hold the vnode until the inactive finishes to
2137	 * prevent vgone() races.  We drop the use count here and the
2138	 * hold count below when we're done.
2139	 */
2140	v_decr_useonly(vp);
2141	/*
2142	 * We must call VOP_INACTIVE with the node locked. Mark
2143	 * as VI_DOINGINACT to avoid recursion.
2144	 */
2145	vp->v_iflag |= VI_OWEINACT;
2146	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0) {
2147		VI_LOCK(vp);
2148		if (vp->v_usecount > 0)
2149			vp->v_iflag &= ~VI_OWEINACT;
2150		if (vp->v_iflag & VI_OWEINACT)
2151			vinactive(vp, td);
2152		VOP_UNLOCK(vp, 0);
2153	} else {
2154		VI_LOCK(vp);
2155		if (vp->v_usecount > 0)
2156			vp->v_iflag &= ~VI_OWEINACT;
2157	}
2158	vdropl(vp);
2159}
2160
2161/*
2162 * Release an already locked vnode.  This gives the same effect as
2163 * unlock+vrele(), but takes less time and avoids releasing and
2164 * re-acquiring the lock (as vrele() acquires the lock internally).
2165 */
2166void
2167vput(struct vnode *vp)
2168{
2169	struct thread *td = curthread;	/* XXX */
2170	int error;
2171
2172	KASSERT(vp != NULL, ("vput: null vp"));
2173	ASSERT_VOP_LOCKED(vp, "vput");
2174	VFS_ASSERT_GIANT(vp->v_mount);
2175	VI_LOCK(vp);
2176	/* Skip this v_writecount check if we're going to panic below. */
2177	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
2178	    ("vput: missed vn_close"));
2179	error = 0;
2180
2181	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2182	    vp->v_usecount == 1)) {
2183		VOP_UNLOCK(vp, 0);
2184		v_decr_usecount(vp);
2185		return;
2186	}
2187
2188	if (vp->v_usecount != 1) {
2189#ifdef DIAGNOSTIC
2190		vprint("vput: negative ref count", vp);
2191#endif
2192		panic("vput: negative ref cnt");
2193	}
2194	/*
2195	 * We want to hold the vnode until the inactive finishes to
2196	 * prevent vgone() races.  We drop the use count here and the
2197	 * hold count below when we're done.
2198	 */
2199	v_decr_useonly(vp);
2200	vp->v_iflag |= VI_OWEINACT;
2201	if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2202		error = VOP_LOCK(vp, LK_UPGRADE|LK_INTERLOCK|LK_NOWAIT);
2203		VI_LOCK(vp);
2204		if (error) {
2205			if (vp->v_usecount > 0)
2206				vp->v_iflag &= ~VI_OWEINACT;
2207			goto done;
2208		}
2209	}
2210	if (vp->v_usecount > 0)
2211		vp->v_iflag &= ~VI_OWEINACT;
2212	if (vp->v_iflag & VI_OWEINACT)
2213		vinactive(vp, td);
2214	VOP_UNLOCK(vp, 0);
2215done:
2216	vdropl(vp);
2217}
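
/*
 * Illustrative sketch: for a vnode that is locked and holds a use
 * reference, the single call and the two-step sequence noted below are
 * equivalent; vput() simply fuses them.  The wrapper is hypothetical.
 */
static void
example_release_locked_vnode(struct vnode *vp)
{

	vput(vp);
	/*
	 * ... which stands in for:
	 *	VOP_UNLOCK(vp, 0);
	 *	vrele(vp);
	 */
}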
2218
2219/*
2220 * Somebody doesn't want the vnode recycled.
2221 */
2222void
2223vhold(struct vnode *vp)
2224{
2225
2226	VI_LOCK(vp);
2227	vholdl(vp);
2228	VI_UNLOCK(vp);
2229}
2230
2231void
2232vholdl(struct vnode *vp)
2233{
2234
2235	vp->v_holdcnt++;
2236	if (VSHOULDBUSY(vp))
2237		vbusy(vp);
2238}
2239
2240/*
2241 * Note that there is one fewer holder of this vnode.  vdrop() is the
2242 * opposite of vhold().
2243 */
2244void
2245vdrop(struct vnode *vp)
2246{
2247
2248	VI_LOCK(vp);
2249	vdropl(vp);
2250}
2251
2252/*
2253 * Drop the hold count of the vnode.  If this is the last reference to
2254 * the vnode, we free it if it has been vgone'd; otherwise it is
2255 * placed on the free list.
2256 */
2257void
2258vdropl(struct vnode *vp)
2259{
2260
2261	ASSERT_VI_LOCKED(vp, "vdropl");
2262	if (vp->v_holdcnt <= 0)
2263		panic("vdrop: holdcnt %d", vp->v_holdcnt);
2264	vp->v_holdcnt--;
2265	if (vp->v_holdcnt == 0) {
2266		if (vp->v_iflag & VI_DOOMED) {
2267			vdestroy(vp);
2268			return;
2269		} else
2270			vfree(vp);
2271	}
2272	VI_UNLOCK(vp);
2273}
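
/*
 * Illustrative sketch: a hold keeps the vnode from being destroyed or
 * recycled while it is examined, without taking a use reference or the
 * vnode lock.  The wrapper is hypothetical.
 */
static void
example_pin_vnode(struct vnode *vp)
{

	vhold(vp);
	/* ... the vnode memory is guaranteed to remain valid here ... */
	vdrop(vp);
}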
2274
2275/*
2276 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
2277 * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
2278 * OWEINACT tracks whether a vnode missed a call to inactive due to a
2279 * failed lock upgrade.
2280 */
2281static void
2282vinactive(struct vnode *vp, struct thread *td)
2283{
2284
2285	ASSERT_VOP_LOCKED(vp, "vinactive");
2286	ASSERT_VI_LOCKED(vp, "vinactive");
2287	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
2288	    ("vinactive: recursed on VI_DOINGINACT"));
2289	vp->v_iflag |= VI_DOINGINACT;
2290	vp->v_iflag &= ~VI_OWEINACT;
2291	VI_UNLOCK(vp);
2292	VOP_INACTIVE(vp, td);
2293	VI_LOCK(vp);
2294	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
2295	    ("vinactive: lost VI_DOINGINACT"));
2296	vp->v_iflag &= ~VI_DOINGINACT;
2297}
2298
2299/*
2300 * Remove any vnodes in the vnode table belonging to mount point mp.
2301 *
2302 * If FORCECLOSE is not specified, there should not be any active ones,
2303 * return error if any are found (nb: this is a user error, not a
2304 * system error). If FORCECLOSE is specified, detach any active vnodes
2305 * that are found.
2306 *
2307 * If WRITECLOSE is set, only flush out regular file vnodes open for
2308 * writing.
2309 *
2310 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2311 *
2312 * `rootrefs' specifies the base reference count for the root vnode
2313 * of this filesystem. The root vnode is considered busy if its
2314 * v_usecount exceeds this value. On a successful return, vflush()
2315 * will call vrele() on the root vnode exactly rootrefs times.
2316 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2317 * be zero.
2318 */
2319#ifdef DIAGNOSTIC
2320static int busyprt = 0;		/* print out busy vnodes */
2321SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
2322#endif
2323
2324int
2325vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
2326{
2327	struct vnode *vp, *mvp, *rootvp = NULL;
2328	struct vattr vattr;
2329	int busy = 0, error;
2330
2331	CTR1(KTR_VFS, "vflush: mp %p", mp);
2332	if (rootrefs > 0) {
2333		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2334		    ("vflush: bad args"));
2335		/*
2336		 * Get the filesystem root vnode. We can vput() it
2337		 * immediately, since with rootrefs > 0, it won't go away.
2338		 */
2339		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp, td)) != 0)
2340			return (error);
2341		vput(rootvp);
2342
2343	}
2344	MNT_ILOCK(mp);
2345loop:
2346	MNT_VNODE_FOREACH(vp, mp, mvp) {
2347
2348		VI_LOCK(vp);
2349		vholdl(vp);
2350		MNT_IUNLOCK(mp);
2351		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
2352		if (error) {
2353			vdrop(vp);
2354			MNT_ILOCK(mp);
2355			MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
2356			goto loop;
2357		}
2358		/*
2359		 * Skip over vnodes marked VV_SYSTEM.
2360		 */
2361		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2362			VOP_UNLOCK(vp, 0);
2363			vdrop(vp);
2364			MNT_ILOCK(mp);
2365			continue;
2366		}
2367		/*
2368		 * If WRITECLOSE is set, flush out unlinked but still open
2369		 * files (even if open only for reading) and regular file
2370		 * vnodes open for writing.
2371		 */
2372		if (flags & WRITECLOSE) {
2373			error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
2374			VI_LOCK(vp);
2375
2376			if ((vp->v_type == VNON ||
2377			    (error == 0 && vattr.va_nlink > 0)) &&
2378			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
2379				VOP_UNLOCK(vp, 0);
2380				vdropl(vp);
2381				MNT_ILOCK(mp);
2382				continue;
2383			}
2384		} else
2385			VI_LOCK(vp);
2386		/*
2387		 * With v_usecount == 0, all we need to do is clear out the
2388		 * vnode data structures and we are done.
2389		 *
2390		 * If FORCECLOSE is set, forcibly close the vnode.
2391		 */
2392		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
2393			VNASSERT(vp->v_usecount == 0 ||
2394			    (vp->v_type != VCHR && vp->v_type != VBLK), vp,
2395			    ("device VNODE %p is FORCECLOSED", vp));
2396			vgonel(vp);
2397		} else {
2398			busy++;
2399#ifdef DIAGNOSTIC
2400			if (busyprt)
2401				vprint("vflush: busy vnode", vp);
2402#endif
2403		}
2404		VOP_UNLOCK(vp, 0);
2405		vdropl(vp);
2406		MNT_ILOCK(mp);
2407	}
2408	MNT_IUNLOCK(mp);
2409	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2410		/*
2411		 * If just the root vnode is busy, and if its refcount
2412		 * is equal to `rootrefs', then go ahead and kill it.
2413		 */
2414		VI_LOCK(rootvp);
2415		KASSERT(busy > 0, ("vflush: not busy"));
2416		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
2417		    ("vflush: usecount %d < rootrefs %d",
2418		     rootvp->v_usecount, rootrefs));
2419		if (busy == 1 && rootvp->v_usecount == rootrefs) {
2420			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
2421			vgone(rootvp);
2422			VOP_UNLOCK(rootvp, 0);
2423			busy = 0;
2424		} else
2425			VI_UNLOCK(rootvp);
2426	}
2427	if (busy)
2428		return (EBUSY);
2429	for (; rootrefs > 0; rootrefs--)
2430		vrele(rootvp);
2431	return (0);
2432}
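
/*
 * Illustrative sketch: how a hypothetical filesystem's unmount path might
 * drive vflush().  A rootrefs of 1 asks vflush() to vrele() the root
 * vnode once on success, matching the single reference such a filesystem
 * would keep on its root; the wrapper itself is not part of any real
 * filesystem.
 */
static int
example_unmount_flush(struct mount *mp, int mntflags, struct thread *td)
{
	int flags;

	flags = (mntflags & MNT_FORCE) ? FORCECLOSE : 0;
	return (vflush(mp, 1, flags, td));
}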
2433
2434/*
2435 * Recycle an unused vnode to the front of the free list.
2436 */
2437int
2438vrecycle(struct vnode *vp, struct thread *td)
2439{
2440	int recycled;
2441
2442	ASSERT_VOP_LOCKED(vp, "vrecycle");
2443	recycled = 0;
2444	VI_LOCK(vp);
2445	if (vp->v_usecount == 0) {
2446		recycled = 1;
2447		vgonel(vp);
2448	}
2449	VI_UNLOCK(vp);
2450	return (recycled);
2451}
2452
2453/*
2454 * Eliminate all activity associated with a vnode
2455 * in preparation for reuse.
2456 */
2457void
2458vgone(struct vnode *vp)
2459{
2460	VI_LOCK(vp);
2461	vgonel(vp);
2462	VI_UNLOCK(vp);
2463}
2464
2465/*
2466 * vgone, with the vp interlock held.
2467 */
2468void
2469vgonel(struct vnode *vp)
2470{
2471	struct thread *td;
2472	int oweinact;
2473	int active;
2474	struct mount *mp;
2475
2476	CTR1(KTR_VFS, "vgonel: vp %p", vp);
2477	ASSERT_VOP_LOCKED(vp, "vgonel");
2478	ASSERT_VI_LOCKED(vp, "vgonel");
2479	VNASSERT(vp->v_holdcnt, vp,
2480	    ("vgonel: vp %p has no reference.", vp));
2481	td = curthread;
2482
2483	/*
2484	 * Don't vgonel if we're already doomed.
2485	 */
2486	if (vp->v_iflag & VI_DOOMED)
2487		return;
2488	vp->v_iflag |= VI_DOOMED;
2489	/*
2490	 * Check to see if the vnode is in use.  If so, we have to call
2491	 * VOP_CLOSE() and VOP_INACTIVE().
2492	 */
2493	active = vp->v_usecount;
2494	oweinact = (vp->v_iflag & VI_OWEINACT);
2495	VI_UNLOCK(vp);
2496	/*
2497	 * Clean out any buffers associated with the vnode.
2498	 * If the flush fails, just toss the buffers.
2499	 */
2500	mp = NULL;
2501	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
2502		(void) vn_start_secondary_write(vp, &mp, V_WAIT);
2503	if (vinvalbuf(vp, V_SAVE, td, 0, 0) != 0)
2504		vinvalbuf(vp, 0, td, 0, 0);
2505
2506	/*
2507	 * If purging an active vnode, it must be closed and
2508	 * deactivated before being reclaimed.
2509	 */
2510	if (active)
2511		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2512	if (oweinact || active) {
2513		VI_LOCK(vp);
2514		if ((vp->v_iflag & VI_DOINGINACT) == 0)
2515			vinactive(vp, td);
2516		VI_UNLOCK(vp);
2517	}
2518	/*
2519	 * Reclaim the vnode.
2520	 */
2521	if (VOP_RECLAIM(vp, td))
2522		panic("vgone: cannot reclaim");
2523	if (mp != NULL)
2524		vn_finished_secondary_write(mp);
2525	VNASSERT(vp->v_object == NULL, vp,
2526	    ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
2527	/*
2528	 * Clear the advisory locks and wake up waiting threads.
2529	 */
2530	lf_purgelocks(vp, &(vp->v_lockf));
2531	/*
2532	 * Delete from old mount point vnode list.
2533	 */
2534	delmntque(vp);
2535	cache_purge(vp);
2536	/*
2537	 * Done with purge, reset to the standard lock and invalidate
2538	 * the vnode.
2539	 */
2540	VI_LOCK(vp);
2541	vp->v_vnlock = &vp->v_lock;
2542	vp->v_op = &dead_vnodeops;
2543	vp->v_tag = "none";
2544	vp->v_type = VBAD;
2545}
2546
2547/*
2548 * Calculate the total number of references to a special device.
2549 */
2550int
2551vcount(struct vnode *vp)
2552{
2553	int count;
2554
2555	dev_lock();
2556	count = vp->v_rdev->si_usecount;
2557	dev_unlock();
2558	return (count);
2559}
2560
2561/*
2562 * Same as above, but using the struct cdev * as the argument.
2563 */
2564int
2565count_dev(struct cdev *dev)
2566{
2567	int count;
2568
2569	dev_lock();
2570	count = dev->si_usecount;
2571	dev_unlock();
2572	return(count);
2573}
2574
2575/*
2576 * Print out a description of a vnode.
2577 */
2578static char *typename[] =
2579{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
2580 "VMARKER"};
2581
2582void
2583vn_printf(struct vnode *vp, const char *fmt, ...)
2584{
2585	va_list ap;
2586	char buf[256], buf2[16];
2587	u_long flags;
2588
2589	va_start(ap, fmt);
2590	vprintf(fmt, ap);
2591	va_end(ap);
2592	printf("%p: ", (void *)vp);
2593	printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
2594	printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
2595	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
2596	buf[0] = '\0';
2597	buf[1] = '\0';
2598	if (vp->v_vflag & VV_ROOT)
2599		strlcat(buf, "|VV_ROOT", sizeof(buf));
2600	if (vp->v_vflag & VV_ISTTY)
2601		strlcat(buf, "|VV_ISTTY", sizeof(buf));
2602	if (vp->v_vflag & VV_NOSYNC)
2603		strlcat(buf, "|VV_NOSYNC", sizeof(buf));
2604	if (vp->v_vflag & VV_CACHEDLABEL)
2605		strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
2606	if (vp->v_vflag & VV_TEXT)
2607		strlcat(buf, "|VV_TEXT", sizeof(buf));
2608	if (vp->v_vflag & VV_COPYONWRITE)
2609		strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
2610	if (vp->v_vflag & VV_SYSTEM)
2611		strlcat(buf, "|VV_SYSTEM", sizeof(buf));
2612	if (vp->v_vflag & VV_PROCDEP)
2613		strlcat(buf, "|VV_PROCDEP", sizeof(buf));
2614	if (vp->v_vflag & VV_NOKNOTE)
2615		strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
2616	if (vp->v_vflag & VV_DELETED)
2617		strlcat(buf, "|VV_DELETED", sizeof(buf));
2618	if (vp->v_vflag & VV_MD)
2619		strlcat(buf, "|VV_MD", sizeof(buf));
2620	flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC |
2621	    VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
2622	    VV_NOKNOTE | VV_DELETED | VV_MD);
2623	if (flags != 0) {
2624		snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
2625		strlcat(buf, buf2, sizeof(buf));
2626	}
2627	if (vp->v_iflag & VI_MOUNT)
2628		strlcat(buf, "|VI_MOUNT", sizeof(buf));
2629	if (vp->v_iflag & VI_AGE)
2630		strlcat(buf, "|VI_AGE", sizeof(buf));
2631	if (vp->v_iflag & VI_DOOMED)
2632		strlcat(buf, "|VI_DOOMED", sizeof(buf));
2633	if (vp->v_iflag & VI_FREE)
2634		strlcat(buf, "|VI_FREE", sizeof(buf));
2635	if (vp->v_iflag & VI_OBJDIRTY)
2636		strlcat(buf, "|VI_OBJDIRTY", sizeof(buf));
2637	if (vp->v_iflag & VI_DOINGINACT)
2638		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
2639	if (vp->v_iflag & VI_OWEINACT)
2640		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
2641	flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
2642	    VI_OBJDIRTY | VI_DOINGINACT | VI_OWEINACT);
2643	if (flags != 0) {
2644		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
2645		strlcat(buf, buf2, sizeof(buf));
2646	}
2647	printf("    flags (%s)\n", buf + 1);
2648	if (mtx_owned(VI_MTX(vp)))
2649		printf(" VI_LOCKed");
2650	if (vp->v_object != NULL)
2651		printf("    v_object %p ref %d pages %d\n",
2652		    vp->v_object, vp->v_object->ref_count,
2653		    vp->v_object->resident_page_count);
2654	printf("    ");
2655	lockmgr_printinfo(vp->v_vnlock);
2656	printf("\n");
2657	if (vp->v_data != NULL)
2658		VOP_PRINT(vp);
2659}
2660
2661#ifdef DDB
2662/*
2663 * List all of the locked vnodes in the system.
2664 * Called when debugging the kernel.
2665 */
2666DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
2667{
2668	struct mount *mp, *nmp;
2669	struct vnode *vp;
2670
2671	/*
2672	 * Note: because this is DDB, we can't obey the locking semantics
2673	 * for these structures, which means we could catch an inconsistent
2674	 * state and dereference a nasty pointer.  Not much to be done
2675	 * about that.
2676	 */
2677	db_printf("Locked vnodes\n");
2678	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2679		nmp = TAILQ_NEXT(mp, mnt_list);
2680		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2681			if (vp->v_type != VMARKER &&
2682			    VOP_ISLOCKED(vp))
2683				vprint("", vp);
2684		}
2685		nmp = TAILQ_NEXT(mp, mnt_list);
2686	}
2687}
2688
2689/*
2690 * Show details about the given vnode.
2691 */
2692DB_SHOW_COMMAND(vnode, db_show_vnode)
2693{
2694	struct vnode *vp;
2695
2696	if (!have_addr)
2697		return;
2698	vp = (struct vnode *)addr;
2699	vn_printf(vp, "vnode ");
2700}
2701#endif	/* DDB */
2702
2703/*
2704 * Fill in a struct xvfsconf based on a struct vfsconf.
2705 */
2706static void
2707vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
2708{
2709
2710	strcpy(xvfsp->vfc_name, vfsp->vfc_name);
2711	xvfsp->vfc_typenum = vfsp->vfc_typenum;
2712	xvfsp->vfc_refcount = vfsp->vfc_refcount;
2713	xvfsp->vfc_flags = vfsp->vfc_flags;
2714	/*
2715	 * These are unused in userland, but we keep them
2716	 * to avoid breaking binary compatibility.
2717	 */
2718	xvfsp->vfc_vfsops = NULL;
2719	xvfsp->vfc_next = NULL;
2720}
2721
2722/*
2723 * Top level filesystem related information gathering.
2724 */
2725static int
2726sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
2727{
2728	struct vfsconf *vfsp;
2729	struct xvfsconf xvfsp;
2730	int error;
2731
2732	error = 0;
2733	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
2734		bzero(&xvfsp, sizeof(xvfsp));
2735		vfsconf2x(vfsp, &xvfsp);
2736		error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp);
2737		if (error)
2738			break;
2739	}
2740	return (error);
2741}
2742
2743SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist,
2744    "S,xvfsconf", "List of all configured filesystems");
2745
2746#ifndef BURN_BRIDGES
2747static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
2748
2749static int
2750vfs_sysctl(SYSCTL_HANDLER_ARGS)
2751{
2752	int *name = (int *)arg1 - 1;	/* XXX */
2753	u_int namelen = arg2 + 1;	/* XXX */
2754	struct vfsconf *vfsp;
2755	struct xvfsconf xvfsp;
2756
2757	printf("WARNING: userland calling deprecated sysctl, "
2758	    "please rebuild world\n");
2759
2760#if 1 || defined(COMPAT_PRELITE2)
2761	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2762	if (namelen == 1)
2763		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2764#endif
2765
2766	switch (name[1]) {
2767	case VFS_MAXTYPENUM:
2768		if (namelen != 2)
2769			return (ENOTDIR);
2770		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2771	case VFS_CONF:
2772		if (namelen != 3)
2773			return (ENOTDIR);	/* overloaded */
2774		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
2775			if (vfsp->vfc_typenum == name[2])
2776				break;
2777		if (vfsp == NULL)
2778			return (EOPNOTSUPP);
2779		bzero(&xvfsp, sizeof(xvfsp));
2780		vfsconf2x(vfsp, &xvfsp);
2781		return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
2782	}
2783	return (EOPNOTSUPP);
2784}
2785
2786static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP,
2787	vfs_sysctl, "Generic filesystem");
2788
2789#if 1 || defined(COMPAT_PRELITE2)
2790
2791static int
2792sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
2793{
2794	int error;
2795	struct vfsconf *vfsp;
2796	struct ovfsconf ovfs;
2797
2798	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
2799		bzero(&ovfs, sizeof(ovfs));
2800		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
2801		strcpy(ovfs.vfc_name, vfsp->vfc_name);
2802		ovfs.vfc_index = vfsp->vfc_typenum;
2803		ovfs.vfc_refcount = vfsp->vfc_refcount;
2804		ovfs.vfc_flags = vfsp->vfc_flags;
2805		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2806		if (error)
2807			return error;
2808	}
2809	return 0;
2810}
2811
2812#endif /* 1 || COMPAT_PRELITE2 */
2813#endif /* !BURN_BRIDGES */
2814
2815#define KINFO_VNODESLOP		10
2816#ifdef notyet
2817/*
2818 * Dump vnode list (via sysctl).
2819 */
2820/* ARGSUSED */
2821static int
2822sysctl_vnode(SYSCTL_HANDLER_ARGS)
2823{
2824	struct xvnode *xvn;
2825	struct thread *td = req->td;
2826	struct mount *mp;
2827	struct vnode *vp;
2828	int error, len, n;
2829
2830	/*
2831	 * Stale numvnodes access is not fatal here.
2832	 */
2833	req->lock = 0;
2834	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
2835	if (!req->oldptr)
2836		/* Make an estimate */
2837		return (SYSCTL_OUT(req, 0, len));
2838
2839	error = sysctl_wire_old_buffer(req, 0);
2840	if (error != 0)
2841		return (error);
2842	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
2843	n = 0;
2844	mtx_lock(&mountlist_mtx);
2845	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2846		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
2847			continue;
2848		MNT_ILOCK(mp);
2849		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2850			if (n == len)
2851				break;
2852			vref(vp);
2853			xvn[n].xv_size = sizeof *xvn;
2854			xvn[n].xv_vnode = vp;
2855			xvn[n].xv_id = 0;	/* XXX compat */
2856#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
2857			XV_COPY(usecount);
2858			XV_COPY(writecount);
2859			XV_COPY(holdcnt);
2860			XV_COPY(mount);
2861			XV_COPY(numoutput);
2862			XV_COPY(type);
2863#undef XV_COPY
2864			xvn[n].xv_flag = vp->v_vflag;
2865
2866			switch (vp->v_type) {
2867			case VREG:
2868			case VDIR:
2869			case VLNK:
2870				break;
2871			case VBLK:
2872			case VCHR:
2873				if (vp->v_rdev == NULL) {
2874					vrele(vp);
2875					continue;
2876				}
2877				xvn[n].xv_dev = dev2udev(vp->v_rdev);
2878				break;
2879			case VSOCK:
2880				xvn[n].xv_socket = vp->v_socket;
2881				break;
2882			case VFIFO:
2883				xvn[n].xv_fifo = vp->v_fifoinfo;
2884				break;
2885			case VNON:
2886			case VBAD:
2887			default:
2888				/* shouldn't happen? */
2889				vrele(vp);
2890				continue;
2891			}
2892			vrele(vp);
2893			++n;
2894		}
2895		MNT_IUNLOCK(mp);
2896		mtx_lock(&mountlist_mtx);
2897		vfs_unbusy(mp, td);
2898		if (n == len)
2899			break;
2900	}
2901	mtx_unlock(&mountlist_mtx);
2902
2903	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
2904	free(xvn, M_TEMP);
2905	return (error);
2906}
2907
2908SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
2909	0, 0, sysctl_vnode, "S,xvnode", "");
2910#endif
2911
2912/*
2913 * Unmount all filesystems. The list is traversed in reverse order
2914 * of mounting to avoid dependencies.
2915 */
2916void
2917vfs_unmountall(void)
2918{
2919	struct mount *mp;
2920	struct thread *td;
2921	int error;
2922
2923	KASSERT(curthread != NULL, ("vfs_unmountall: NULL curthread"));
2924	td = curthread;
2925	/*
2926	 * Since this only runs when rebooting, it is not interlocked.
2927	 */
2928	while(!TAILQ_EMPTY(&mountlist)) {
2929		mp = TAILQ_LAST(&mountlist, mntlist);
2930		error = dounmount(mp, MNT_FORCE, td);
2931		if (error) {
2932			TAILQ_REMOVE(&mountlist, mp, mnt_list);
2933			/*
2934			 * XXX: Due to the way in which we mount the root
2935			 * file system off of devfs, devfs will generate a
2936			 * "busy" warning when we try to unmount it before
2937			 * the root.  Don't print a warning as a result in
2938			 * order to avoid false positive errors that may
2939			 * cause needless upset.
2940			 */
2941			if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) {
2942				printf("unmount of %s failed (",
2943				    mp->mnt_stat.f_mntonname);
2944				if (error == EBUSY)
2945					printf("BUSY)\n");
2946				else
2947					printf("%d)\n", error);
2948			}
2949		} else {
2950			/* The unmount has removed mp from the mountlist */
2951		}
2952	}
2953}
2954
2955/*
2956 * Perform msync on all vnodes under a mount point.
2957 * The mount point must be locked.
2958 */
2959void
2960vfs_msync(struct mount *mp, int flags)
2961{
2962	struct vnode *vp, *mvp;
2963	struct vm_object *obj;
2964
2965	MNT_ILOCK(mp);
2966	MNT_VNODE_FOREACH(vp, mp, mvp) {
2967		VI_LOCK(vp);
2968		if ((vp->v_iflag & VI_OBJDIRTY) &&
2969		    (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
2970			MNT_IUNLOCK(mp);
2971			if (!vget(vp,
2972			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
2973			    curthread)) {
2974				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
2975					vput(vp);
2976					MNT_ILOCK(mp);
2977					continue;
2978				}
2979
2980				obj = vp->v_object;
2981				if (obj != NULL) {
2982					VM_OBJECT_LOCK(obj);
2983					vm_object_page_clean(obj, 0, 0,
2984					    flags == MNT_WAIT ?
2985					    OBJPC_SYNC : OBJPC_NOSYNC);
2986					VM_OBJECT_UNLOCK(obj);
2987				}
2988				vput(vp);
2989			}
2990			MNT_ILOCK(mp);
2991		} else
2992			VI_UNLOCK(vp);
2993	}
2994	MNT_IUNLOCK(mp);
2995}
2996
2997/*
2998 * Mark a vnode as free, putting it up for recycling.
2999 */
3000static void
3001vfree(struct vnode *vp)
3002{
3003
3004	CTR1(KTR_VFS, "vfree vp %p", vp);
3005	ASSERT_VI_LOCKED(vp, "vfree");
3006	mtx_lock(&vnode_free_list_mtx);
3007	VNASSERT(vp->v_op != NULL, vp, ("vfree: vnode already reclaimed."));
3008	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free"));
3009	VNASSERT(VSHOULDFREE(vp), vp, ("vfree: freeing when we shouldn't"));
3010	VNASSERT((vp->v_iflag & VI_DOOMED) == 0, vp,
3011	    ("vfree: Freeing doomed vnode"));
3012	if (vp->v_iflag & VI_AGE) {
3013		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
3014	} else {
3015		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
3016	}
3017	freevnodes++;
3018	vp->v_iflag &= ~VI_AGE;
3019	vp->v_iflag |= VI_FREE;
3020	mtx_unlock(&vnode_free_list_mtx);
3021}
3022
3023/*
3024 * Opposite of vfree() - mark a vnode as in use.
3025 */
3026static void
3027vbusy(struct vnode *vp)
3028{
3029	CTR1(KTR_VFS, "vbusy vp %p", vp);
3030	ASSERT_VI_LOCKED(vp, "vbusy");
3031	VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free"));
3032	VNASSERT(vp->v_op != NULL, vp, ("vbusy: vnode already reclaimed."));
3033
3034	mtx_lock(&vnode_free_list_mtx);
3035	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
3036	freevnodes--;
3037	vp->v_iflag &= ~(VI_FREE|VI_AGE);
3038	mtx_unlock(&vnode_free_list_mtx);
3039}
3040
3041/*
3042 * Initialize per-vnode helper structure to hold poll-related state.
3043 */
3044void
3045v_addpollinfo(struct vnode *vp)
3046{
3047	struct vpollinfo *vi;
3048
3049	vi = uma_zalloc(vnodepoll_zone, M_WAITOK);
3050	if (vp->v_pollinfo != NULL) {
3051		uma_zfree(vnodepoll_zone, vi);
3052		return;
3053	}
3054	vp->v_pollinfo = vi;
3055	mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
3056	knlist_init(&vp->v_pollinfo->vpi_selinfo.si_note, vp, vfs_knllock,
3057	    vfs_knlunlock, vfs_knllocked);
3058}
3059
3060/*
3061 * Record a process's interest in events which might happen to
3062 * a vnode.  Because poll uses the historic select-style interface
3063 * internally, this routine serves as both the ``check for any
3064 * pending events'' and the ``record my interest in future events''
3065 * functions.  (These are done together, while the lock is held,
3066 * to avoid race conditions.)
3067 */
3068int
3069vn_pollrecord(struct vnode *vp, struct thread *td, int events)
3070{
3071
3072	if (vp->v_pollinfo == NULL)
3073		v_addpollinfo(vp);
3074	mtx_lock(&vp->v_pollinfo->vpi_lock);
3075	if (vp->v_pollinfo->vpi_revents & events) {
3076		/*
3077		 * This leaves events we are not interested
3078		 * in available for the other process which
3079		 * presumably had requested them
3080		 * (otherwise they would never have been
3081		 * recorded).
3082		 */
3083		events &= vp->v_pollinfo->vpi_revents;
3084		vp->v_pollinfo->vpi_revents &= ~events;
3085
3086		mtx_unlock(&vp->v_pollinfo->vpi_lock);
3087		return events;
3088	}
3089	vp->v_pollinfo->vpi_events |= events;
3090	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3091	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3092	return 0;
3093}
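
/*
 * Illustrative sketch: a filesystem poll routine can simply defer to
 * vn_pollrecord(), which either reports the events already posted or
 * records the caller's interest for later.  The wrapper is hypothetical.
 */
static int
example_fs_poll(struct vnode *vp, int events, struct thread *td)
{

	return (vn_pollrecord(vp, td, events));
}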
3094
3095/*
3096 * Routine to create and manage a filesystem syncer vnode.
3097 */
3098#define sync_close ((int (*)(struct  vop_close_args *))nullop)
3099static int	sync_fsync(struct  vop_fsync_args *);
3100static int	sync_inactive(struct  vop_inactive_args *);
3101static int	sync_reclaim(struct  vop_reclaim_args *);
3102
3103static struct vop_vector sync_vnodeops = {
3104	.vop_bypass =	VOP_EOPNOTSUPP,
3105	.vop_close =	sync_close,		/* close */
3106	.vop_fsync =	sync_fsync,		/* fsync */
3107	.vop_inactive =	sync_inactive,	/* inactive */
3108	.vop_reclaim =	sync_reclaim,	/* reclaim */
3109	.vop_lock1 =	vop_stdlock,	/* lock */
3110	.vop_unlock =	vop_stdunlock,	/* unlock */
3111	.vop_islocked =	vop_stdislocked,	/* islocked */
3112};
3113
3114/*
3115 * Create a new filesystem syncer vnode for the specified mount point.
3116 */
3117int
3118vfs_allocate_syncvnode(struct mount *mp)
3119{
3120	struct vnode *vp;
3121	struct bufobj *bo;
3122	static long start, incr, next;
3123	int error;
3124
3125	/* Allocate a new vnode */
3126	if ((error = getnewvnode("syncer", mp, &sync_vnodeops, &vp)) != 0) {
3127		mp->mnt_syncer = NULL;
3128		return (error);
3129	}
3130	vp->v_type = VNON;
3131	error = insmntque(vp, mp);
3132	if (error != 0)
3133		panic("vfs_allocate_syncvnode: insmntque failed");
3134	/*
3135	 * Place the vnode onto the syncer worklist. We attempt to
3136	 * scatter them about on the list so that they will go off
3137	 * at evenly distributed times even if all the filesystems
3138	 * are mounted at once.
3139	 */
3140	next += incr;
3141	if (next == 0 || next > syncer_maxdelay) {
3142		start /= 2;
3143		incr /= 2;
3144		if (start == 0) {
3145			start = syncer_maxdelay / 2;
3146			incr = syncer_maxdelay;
3147		}
3148		next = start;
3149	}
3150	bo = &vp->v_bufobj;
3151	BO_LOCK(bo);
3152	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
3153	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
3154	mtx_lock(&sync_mtx);
3155	sync_vnode_count++;
3156	mtx_unlock(&sync_mtx);
3157	BO_UNLOCK(bo);
3158	mp->mnt_syncer = vp;
3159	return (0);
3160}
3161
3162/*
3163 * Do a lazy sync of the filesystem.
3164 */
3165static int
3166sync_fsync(struct vop_fsync_args *ap)
3167{
3168	struct vnode *syncvp = ap->a_vp;
3169	struct mount *mp = syncvp->v_mount;
3170	struct thread *td = ap->a_td;
3171	int error;
3172	struct bufobj *bo;
3173
3174	/*
3175	 * We only need to do something if this is a lazy evaluation.
3176	 */
3177	if (ap->a_waitfor != MNT_LAZY)
3178		return (0);
3179
3180	/*
3181	 * Move ourselves to the back of the sync list.
3182	 */
3183	bo = &syncvp->v_bufobj;
3184	BO_LOCK(bo);
3185	vn_syncer_add_to_worklist(bo, syncdelay);
3186	BO_UNLOCK(bo);
3187
3188	/*
3189	 * Walk the list of vnodes pushing all that are dirty and
3190	 * not already on the sync list.
3191	 */
3192	mtx_lock(&mountlist_mtx);
3193	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
3194		mtx_unlock(&mountlist_mtx);
3195		return (0);
3196	}
3197	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3198		vfs_unbusy(mp, td);
3199		return (0);
3200	}
3201	MNT_ILOCK(mp);
3202	mp->mnt_noasync++;
3203	mp->mnt_kern_flag &= ~MNTK_ASYNC;
3204	MNT_IUNLOCK(mp);
3205	vfs_msync(mp, MNT_NOWAIT);
3206	error = VFS_SYNC(mp, MNT_LAZY, td);
3207	MNT_ILOCK(mp);
3208	mp->mnt_noasync--;
3209	if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
3210		mp->mnt_kern_flag |= MNTK_ASYNC;
3211	MNT_IUNLOCK(mp);
3212	vn_finished_write(mp);
3213	vfs_unbusy(mp, td);
3214	return (error);
3215}
3216
3217/*
3218 * The syncer vnode is no longer referenced.
3219 */
3220static int
3221sync_inactive(struct vop_inactive_args *ap)
3222{
3223
3224	vgone(ap->a_vp);
3225	return (0);
3226}
3227
3228/*
3229 * The syncer vnode is no longer needed and is being decommissioned.
3230 *
3231 * Modifications to the worklist must be protected by sync_mtx.
3232 */
3233static int
3234sync_reclaim(struct vop_reclaim_args *ap)
3235{
3236	struct vnode *vp = ap->a_vp;
3237	struct bufobj *bo;
3238
3239	bo = &vp->v_bufobj;
3240	BO_LOCK(bo);
3241	vp->v_mount->mnt_syncer = NULL;
3242	if (bo->bo_flag & BO_ONWORKLST) {
3243		mtx_lock(&sync_mtx);
3244		LIST_REMOVE(bo, bo_synclist);
3245		syncer_worklist_len--;
3246		sync_vnode_count--;
3247		mtx_unlock(&sync_mtx);
3248		bo->bo_flag &= ~BO_ONWORKLST;
3249	}
3250	BO_UNLOCK(bo);
3251
3252	return (0);
3253}
3254
3255/*
3256 * Check whether the vnode represents a disk device.
3257 */
3258int
3259vn_isdisk(struct vnode *vp, int *errp)
3260{
3261	int error;
3262
3263	error = 0;
3264	dev_lock();
3265	if (vp->v_type != VCHR)
3266		error = ENOTBLK;
3267	else if (vp->v_rdev == NULL)
3268		error = ENXIO;
3269	else if (vp->v_rdev->si_devsw == NULL)
3270		error = ENXIO;
3271	else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
3272		error = ENOTBLK;
3273	dev_unlock();
3274	if (errp != NULL)
3275		*errp = error;
3276	return (error == 0);
3277}
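
/*
 * Illustrative sketch: callers typically combine the boolean result with
 * the errno passed back by reference.  The wrapper is hypothetical.
 */
static int
example_require_disk(struct vnode *vp)
{
	int error;

	if (!vn_isdisk(vp, &error))
		return (error);		/* ENOTBLK or ENXIO, as set above */
	return (0);
}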
3278
3279/*
3280 * Common filesystem object access control check routine.  Accepts a
3281 * vnode's type, "mode", uid and gid, requested access mode, credentials,
3282 * and optional call-by-reference privused argument allowing vaccess()
3283 * to indicate to the caller whether privilege was used to satisfy the
3284 * request (obsoleted).  Returns 0 on success, or an errno on failure.
3285 *
3286 * The ifdef'd CAPABILITIES version is here for reference, but is not
3287 * actually used.
3288 */
3289int
3290vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
3291    mode_t acc_mode, struct ucred *cred, int *privused)
3292{
3293	mode_t dac_granted;
3294	mode_t priv_granted;
3295
3296	/*
3297	 * Look for a normal, non-privileged way to access the file/directory
3298	 * as requested.  If it exists, go with that.
3299	 */
3300
3301	if (privused != NULL)
3302		*privused = 0;
3303
3304	dac_granted = 0;
3305
3306	/* Check the owner. */
3307	if (cred->cr_uid == file_uid) {
3308		dac_granted |= VADMIN;
3309		if (file_mode & S_IXUSR)
3310			dac_granted |= VEXEC;
3311		if (file_mode & S_IRUSR)
3312			dac_granted |= VREAD;
3313		if (file_mode & S_IWUSR)
3314			dac_granted |= (VWRITE | VAPPEND);
3315
3316		if ((acc_mode & dac_granted) == acc_mode)
3317			return (0);
3318
3319		goto privcheck;
3320	}
3321
3322	/* Otherwise, check the groups (first match) */
3323	if (groupmember(file_gid, cred)) {
3324		if (file_mode & S_IXGRP)
3325			dac_granted |= VEXEC;
3326		if (file_mode & S_IRGRP)
3327			dac_granted |= VREAD;
3328		if (file_mode & S_IWGRP)
3329			dac_granted |= (VWRITE | VAPPEND);
3330
3331		if ((acc_mode & dac_granted) == acc_mode)
3332			return (0);
3333
3334		goto privcheck;
3335	}
3336
3337	/* Otherwise, check everyone else. */
3338	if (file_mode & S_IXOTH)
3339		dac_granted |= VEXEC;
3340	if (file_mode & S_IROTH)
3341		dac_granted |= VREAD;
3342	if (file_mode & S_IWOTH)
3343		dac_granted |= (VWRITE | VAPPEND);
3344	if ((acc_mode & dac_granted) == acc_mode)
3345		return (0);
3346
3347privcheck:
3348	/*
3349	 * Build a privilege mask to determine if the set of privileges
3350	 * satisfies the requirements when combined with the granted mask
3351	 * from above.  For each privilege, if the privilege is required,
3352	 * bitwise or the request type onto the priv_granted mask.
3353	 */
3354	priv_granted = 0;
3355
3356	if (type == VDIR) {
3357		/*
3358		 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
3359		 * requests, instead of PRIV_VFS_EXEC.
3360		 */
3361		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3362		    !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
3363			priv_granted |= VEXEC;
3364	} else {
3365		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3366		    !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
3367			priv_granted |= VEXEC;
3368	}
3369
3370	if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
3371	    !priv_check_cred(cred, PRIV_VFS_READ, 0))
3372		priv_granted |= VREAD;
3373
3374	if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3375	    !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
3376		priv_granted |= (VWRITE | VAPPEND);
3377
3378	if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3379	    !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
3380		priv_granted |= VADMIN;
3381
3382	if ((acc_mode & (priv_granted | dac_granted)) == acc_mode) {
3383		/* XXX audit: privilege used */
3384		if (privused != NULL)
3385			*privused = 1;
3386		return (0);
3387	}
3388
3389	return ((acc_mode & VADMIN) ? EPERM : EACCES);
3390}
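
/*
 * Illustrative sketch: a hypothetical VOP_ACCESS implementation handing
 * its private inode's identity and mode bits to the common check.  The
 * example_inode layout and both names are invented; only the vaccess()
 * call itself reflects the routine above.
 */
struct example_inode {
	mode_t	i_mode;
	uid_t	i_uid;
	gid_t	i_gid;
};

static int
example_fs_access(struct vnode *vp, mode_t acc_mode, struct ucred *cred)
{
	struct example_inode *ip = vp->v_data;

	return (vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
	    acc_mode, cred, NULL));
}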
3391
3392/*
3393 * Credential check based on process requesting service, and per-attribute
3394 * permissions.
3395 */
3396int
3397extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
3398    struct thread *td, int access)
3399{
3400
3401	/*
3402	 * Kernel-invoked requests always succeed.
3403	 */
3404	if (cred == NOCRED)
3405		return (0);
3406
3407	/*
3408	 * Do not allow privileged processes in jail to directly manipulate
3409	 * system attributes.
3410	 */
3411	switch (attrnamespace) {
3412	case EXTATTR_NAMESPACE_SYSTEM:
3413		/* Potentially should be: return (EPERM); */
3414		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
3415	case EXTATTR_NAMESPACE_USER:
3416		return (VOP_ACCESS(vp, access, cred, td));
3417	default:
3418		return (EPERM);
3419	}
3420}
3421
3422#ifdef DEBUG_VFS_LOCKS
3423/*
3424 * This only exists to suppress warnings from unlocked specfs accesses.  It is
3425 * no longer ok to have an unlocked VFS.
3426 */
3427#define	IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD)
3428
3429int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
3430SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "");
3431
3432int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
3433SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "");
3434
3435int vfs_badlock_print = 1;	/* Print lock violations. */
3436SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "");
3437
3438#ifdef KDB
3439int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
3440SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "");
3441#endif
3442
3443static void
3444vfs_badlock(const char *msg, const char *str, struct vnode *vp)
3445{
3446
3447#ifdef KDB
3448	if (vfs_badlock_backtrace)
3449		kdb_backtrace();
3450#endif
3451	if (vfs_badlock_print)
3452		printf("%s: %p %s\n", str, (void *)vp, msg);
3453	if (vfs_badlock_ddb)
3454		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
3455}
3456
3457void
3458assert_vi_locked(struct vnode *vp, const char *str)
3459{
3460
3461	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
3462		vfs_badlock("interlock is not locked but should be", str, vp);
3463}
3464
3465void
3466assert_vi_unlocked(struct vnode *vp, const char *str)
3467{
3468
3469	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
3470		vfs_badlock("interlock is locked but should not be", str, vp);
3471}
3472
3473void
3474assert_vop_locked(struct vnode *vp, const char *str)
3475{
3476
3477	if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == 0)
3478		vfs_badlock("is not locked but should be", str, vp);
3479}
3480
3481void
3482assert_vop_unlocked(struct vnode *vp, const char *str)
3483{
3484
3485	if (vp && !IGNORE_LOCK(vp) &&
3486	    VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
3487		vfs_badlock("is locked but should not be", str, vp);
3488}
3489
3490void
3491assert_vop_elocked(struct vnode *vp, const char *str)
3492{
3493
3494	if (vp && !IGNORE_LOCK(vp) &&
3495	    VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
3496		vfs_badlock("is not exclusive locked but should be", str, vp);
3497}
3498
3499#if 0
3500void
3501assert_vop_elocked_other(struct vnode *vp, const char *str)
3502{
3503
3504	if (vp && !IGNORE_LOCK(vp) &&
3505	    VOP_ISLOCKED(vp) != LK_EXCLOTHER)
3506		vfs_badlock("is not exclusive locked by another thread",
3507		    str, vp);
3508}
3509
3510void
3511assert_vop_slocked(struct vnode *vp, const char *str)
3512{
3513
3514	if (vp && !IGNORE_LOCK(vp) &&
3515	    VOP_ISLOCKED(vp) != LK_SHARED)
3516		vfs_badlock("is not locked shared but should be", str, vp);
3517}
3518#endif /* 0 */
3519#endif /* DEBUG_VFS_LOCKS */
3520
3521void
3522vop_rename_pre(void *ap)
3523{
3524	struct vop_rename_args *a = ap;
3525
3526#ifdef DEBUG_VFS_LOCKS
3527	if (a->a_tvp)
3528		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
3529	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
3530	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
3531	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
3532
3533	/* Check the source (from). */
3534	if (a->a_tdvp != a->a_fdvp && a->a_tvp != a->a_fdvp)
3535		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
3536	if (a->a_tvp != a->a_fvp)
3537		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
3538
3539	/* Check the target. */
3540	if (a->a_tvp)
3541		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
3542	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
3543#endif
3544	if (a->a_tdvp != a->a_fdvp)
3545		vhold(a->a_fdvp);
3546	if (a->a_tvp != a->a_fvp)
3547		vhold(a->a_fvp);
3548	vhold(a->a_tdvp);
3549	if (a->a_tvp)
3550		vhold(a->a_tvp);
3551}
3552
3553void
3554vop_strategy_pre(void *ap)
3555{
3556#ifdef DEBUG_VFS_LOCKS
3557	struct vop_strategy_args *a;
3558	struct buf *bp;
3559
3560	a = ap;
3561	bp = a->a_bp;
3562
3563	/*
3564	 * Cluster ops lock their component buffers but not the IO container.
3565	 */
3566	if ((bp->b_flags & B_CLUSTER) != 0)
3567		return;
3568
3569	if (!BUF_ISLOCKED(bp)) {
3570		if (vfs_badlock_print)
3571			printf(
3572			    "VOP_STRATEGY: bp is not locked but should be\n");
3573		if (vfs_badlock_ddb)
3574			kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
3575	}
3576#endif
3577}
3578
3579void
3580vop_lookup_pre(void *ap)
3581{
3582#ifdef DEBUG_VFS_LOCKS
3583	struct vop_lookup_args *a;
3584	struct vnode *dvp;
3585
3586	a = ap;
3587	dvp = a->a_dvp;
3588	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
3589	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
3590#endif
3591}
3592
3593void
3594vop_lookup_post(void *ap, int rc)
3595{
3596#ifdef DEBUG_VFS_LOCKS
3597	struct vop_lookup_args *a;
3598	struct vnode *dvp;
3599	struct vnode *vp;
3600
3601	a = ap;
3602	dvp = a->a_dvp;
3603	vp = *(a->a_vpp);
3604
3605	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
3606	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
3607
3608	if (!rc)
3609		ASSERT_VOP_LOCKED(vp, "VOP_LOOKUP (child)");
3610#endif
3611}
3612
3613void
3614vop_lock_pre(void *ap)
3615{
3616#ifdef DEBUG_VFS_LOCKS
3617	struct vop_lock1_args *a = ap;
3618
3619	if ((a->a_flags & LK_INTERLOCK) == 0)
3620		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
3621	else
3622		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
3623#endif
3624}
3625
3626void
3627vop_lock_post(void *ap, int rc)
3628{
3629#ifdef DEBUG_VFS_LOCKS
3630	struct vop_lock1_args *a = ap;
3631
3632	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
3633	if (rc == 0)
3634		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
3635#endif
3636}
3637
3638void
3639vop_unlock_pre(void *ap)
3640{
3641#ifdef DEBUG_VFS_LOCKS
3642	struct vop_unlock_args *a = ap;
3643
3644	if (a->a_flags & LK_INTERLOCK)
3645		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
3646	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
3647#endif
3648}
3649
3650void
3651vop_unlock_post(void *ap, int rc)
3652{
3653#ifdef DEBUG_VFS_LOCKS
3654	struct vop_unlock_args *a = ap;
3655
3656	if (a->a_flags & LK_INTERLOCK)
3657		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
3658#endif
3659}
3660
3661void
3662vop_create_post(void *ap, int rc)
3663{
3664	struct vop_create_args *a = ap;
3665
3666	if (!rc)
3667		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
3668}
3669
3670void
3671vop_link_post(void *ap, int rc)
3672{
3673	struct vop_link_args *a = ap;
3674
3675	if (!rc) {
3676		VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
3677		VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
3678	}
3679}
3680
3681void
3682vop_mkdir_post(void *ap, int rc)
3683{
3684	struct vop_mkdir_args *a = ap;
3685
3686	if (!rc)
3687		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
3688}
3689
3690void
3691vop_mknod_post(void *ap, int rc)
3692{
3693	struct vop_mknod_args *a = ap;
3694
3695	if (!rc)
3696		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
3697}
3698
3699void
3700vop_remove_post(void *ap, int rc)
3701{
3702	struct vop_remove_args *a = ap;
3703
3704	if (!rc) {
3705		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
3706		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
3707	}
3708}
3709
3710void
3711vop_rename_post(void *ap, int rc)
3712{
3713	struct vop_rename_args *a = ap;
3714
3715	if (!rc) {
3716		VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
3717		VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
3718		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
3719		if (a->a_tvp)
3720			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
3721	}
3722	if (a->a_tdvp != a->a_fdvp)
3723		vdrop(a->a_fdvp);
3724	if (a->a_tvp != a->a_fvp)
3725		vdrop(a->a_fvp);
3726	vdrop(a->a_tdvp);
3727	if (a->a_tvp)
3728		vdrop(a->a_tvp);
3729}
3730
3731void
3732vop_rmdir_post(void *ap, int rc)
3733{
3734	struct vop_rmdir_args *a = ap;
3735
3736	if (!rc) {
3737		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
3738		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
3739	}
3740}
3741
3742void
3743vop_setattr_post(void *ap, int rc)
3744{
3745	struct vop_setattr_args *a = ap;
3746
3747	if (!rc)
3748		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
3749}
3750
3751void
3752vop_symlink_post(void *ap, int rc)
3753{
3754	struct vop_symlink_args *a = ap;
3755
3756	if (!rc)
3757		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
3758}
3759
3760static struct knlist fs_knlist;
3761
3762static void
3763vfs_event_init(void *arg)
3764{
3765	knlist_init(&fs_knlist, NULL, NULL, NULL, NULL);
3766}
3767/* XXX - correct order? */
3768SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
3769
3770void
3771vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data __unused)
3772{
3773
3774	KNOTE_UNLOCKED(&fs_knlist, event);
3775}
3776
3777static int	filt_fsattach(struct knote *kn);
3778static void	filt_fsdetach(struct knote *kn);
3779static int	filt_fsevent(struct knote *kn, long hint);
3780
3781struct filterops fs_filtops =
3782	{ 0, filt_fsattach, filt_fsdetach, filt_fsevent };
3783
3784static int
3785filt_fsattach(struct knote *kn)
3786{
3787
3788	kn->kn_flags |= EV_CLEAR;
3789	knlist_add(&fs_knlist, kn, 0);
3790	return (0);
3791}
3792
3793static void
3794filt_fsdetach(struct knote *kn)
3795{
3796
3797	knlist_remove(&fs_knlist, kn, 0);
3798}
3799
3800static int
3801filt_fsevent(struct knote *kn, long hint)
3802{
3803
3804	kn->kn_fflags |= hint;
3805	return (kn->kn_fflags != 0);
3806}
3807
3808static int
3809sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
3810{
3811	struct vfsidctl vc;
3812	int error;
3813	struct mount *mp;
3814
3815	error = SYSCTL_IN(req, &vc, sizeof(vc));
3816	if (error)
3817		return (error);
3818	if (vc.vc_vers != VFS_CTL_VERS1)
3819		return (EINVAL);
3820	mp = vfs_getvfs(&vc.vc_fsid);
3821	if (mp == NULL)
3822		return (ENOENT);
3823	/* ensure that a specific sysctl goes to the right filesystem. */
3824	if (strcmp(vc.vc_fstypename, "*") != 0 &&
3825	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
3826		vfs_rel(mp);
3827		return (EINVAL);
3828	}
3829	VCTLTOREQ(&vc, req);
3830	error = VFS_SYSCTL(mp, vc.vc_op, req);
3831	vfs_rel(mp);
3832	return (error);
3833}
3834
3835SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "",
3836    "Sysctl by fsid");
3837
3838/*
3839 * Function to initialize a va_filerev field sensibly.
3840 * XXX: Wouldn't a random number make a lot more sense ??
3841 */
3842u_quad_t
3843init_va_filerev(void)
3844{
3845	struct bintime bt;
3846
3847	getbinuptime(&bt);
3848	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
3849}
3850
3851static int	filt_vfsread(struct knote *kn, long hint);
3852static int	filt_vfswrite(struct knote *kn, long hint);
3853static int	filt_vfsvnode(struct knote *kn, long hint);
3854static void	filt_vfsdetach(struct knote *kn);
3855static struct filterops vfsread_filtops =
3856	{ 1, NULL, filt_vfsdetach, filt_vfsread };
3857static struct filterops vfswrite_filtops =
3858	{ 1, NULL, filt_vfsdetach, filt_vfswrite };
3859static struct filterops vfsvnode_filtops =
3860	{ 1, NULL, filt_vfsdetach, filt_vfsvnode };
3861
3862static void
3863vfs_knllock(void *arg)
3864{
3865	struct vnode *vp = arg;
3866
3867	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3868}
3869
3870static void
3871vfs_knlunlock(void *arg)
3872{
3873	struct vnode *vp = arg;
3874
3875	VOP_UNLOCK(vp, 0);
3876}
3877
3878static int
3879vfs_knllocked(void *arg)
3880{
3881	struct vnode *vp = arg;
3882
3883	return (VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
3884}
3885
3886int
3887vfs_kqfilter(struct vop_kqfilter_args *ap)
3888{
3889	struct vnode *vp = ap->a_vp;
3890	struct knote *kn = ap->a_kn;
3891	struct knlist *knl;
3892
3893	switch (kn->kn_filter) {
3894	case EVFILT_READ:
3895		kn->kn_fop = &vfsread_filtops;
3896		break;
3897	case EVFILT_WRITE:
3898		kn->kn_fop = &vfswrite_filtops;
3899		break;
3900	case EVFILT_VNODE:
3901		kn->kn_fop = &vfsvnode_filtops;
3902		break;
3903	default:
3904		return (EINVAL);
3905	}
3906
3907	kn->kn_hook = (caddr_t)vp;
3908
3909	if (vp->v_pollinfo == NULL)
3910		v_addpollinfo(vp);
3911	if (vp->v_pollinfo == NULL)
3912		return (ENOMEM);
3913	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
3914	knlist_add(knl, kn, 0);
3915
3916	return (0);
3917}
3918
3919/*
3920 * Detach knote from vnode
3921 */
3922static void
3923filt_vfsdetach(struct knote *kn)
3924{
3925	struct vnode *vp = (struct vnode *)kn->kn_hook;
3926
3927	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
3928	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
3929}
3930
3931/*ARGSUSED*/
3932static int
3933filt_vfsread(struct knote *kn, long hint)
3934{
3935	struct vnode *vp = (struct vnode *)kn->kn_hook;
3936	struct vattr va;
3937
3938	/*
3939	 * filesystem is gone, so set the EOF flag and schedule
3940	 * the knote for deletion.
3941	 */
3942	if (hint == NOTE_REVOKE) {
3943		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3944		return (1);
3945	}
3946
3947	if (VOP_GETATTR(vp, &va, curthread->td_ucred, curthread))
3948		return (0);
3949
3950	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
3951	return (kn->kn_data != 0);
3952}
3953
3954/*ARGSUSED*/
3955static int
3956filt_vfswrite(struct knote *kn, long hint)
3957{
3958	/*
3959	 * filesystem is gone, so set the EOF flag and schedule
3960	 * the knote for deletion.
3961	 */
3962	if (hint == NOTE_REVOKE)
3963		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3964
3965	kn->kn_data = 0;
3966	return (1);
3967}
3968
3969static int
3970filt_vfsvnode(struct knote *kn, long hint)
3971{
3972	if (kn->kn_sfflags & hint)
3973		kn->kn_fflags |= hint;
3974	if (hint == NOTE_REVOKE) {
3975		kn->kn_flags |= EV_EOF;
3976		return (1);
3977	}
3978	return (kn->kn_fflags != 0);
3979}
3980
3981int
3982vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
3983{
3984	int error;
3985
3986	if (dp->d_reclen > ap->a_uio->uio_resid)
3987		return (ENAMETOOLONG);
3988	error = uiomove(dp, dp->d_reclen, ap->a_uio);
3989	if (error) {
3990		if (ap->a_ncookies != NULL) {
3991			if (ap->a_cookies != NULL)
3992				free(ap->a_cookies, M_TEMP);
3993			ap->a_cookies = NULL;
3994			*ap->a_ncookies = 0;
3995		}
3996		return (error);
3997	}
3998	if (ap->a_ncookies == NULL)
3999		return (0);
4000
4001	KASSERT(ap->a_cookies,
4002	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
4003
4004	*ap->a_cookies = realloc(*ap->a_cookies,
4005	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
4006	(*ap->a_cookies)[*ap->a_ncookies] = off;
4007	return (0);
4008}
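
/*
 * Illustrative sketch: a readdir loop in a hypothetical filesystem emits
 * one prepared entry at a time and treats ENAMETOOLONG as "no room left
 * in the user buffer" rather than a hard error, stopping the loop
 * instead.  The wrapper and its stop flag are invented for the example.
 */
static int
example_emit_dirent(struct vop_readdir_args *ap, struct dirent *dp,
    off_t off, int *stop)
{
	int error;

	*stop = 0;
	error = vfs_read_dirent(ap, dp, off);
	if (error == ENAMETOOLONG) {
		*stop = 1;		/* uio exhausted; not a failure */
		error = 0;
	}
	return (error);
}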
4009
4010/*
4011 * Mark for update the access time of the file if the filesystem
4012 * supports VA_MARK_ATIME.  This functionality is used by execve
4013 * and mmap, so we want to avoid the synchronous I/O implied by
4014 * directly setting va_atime for the sake of efficiency.
4015 */
4016void
4017vfs_mark_atime(struct vnode *vp, struct thread *td)
4018{
4019	struct vattr atimeattr;
4020
4021	if ((vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) {
4022		VATTR_NULL(&atimeattr);
4023		atimeattr.va_vaflags |= VA_MARK_ATIME;
4024		(void)VOP_SETATTR(vp, &atimeattr, td->td_ucred, td);
4025	}
4026}
4027