vfs_subr.c revision 131695
1/*
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
35 */
36
37/*
38 * External virtual filesystem routines
39 */
40
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD: head/sys/kern/vfs_subr.c 131695 2004-07-06 09:37:43Z alfred $");
43
44#include "opt_ddb.h"
45#include "opt_mac.h"
46
47#include <sys/param.h>
48#include <sys/systm.h>
49#include <sys/bio.h>
50#include <sys/buf.h>
51#include <sys/conf.h>
52#include <sys/event.h>
53#include <sys/eventhandler.h>
54#include <sys/extattr.h>
55#include <sys/fcntl.h>
56#include <sys/kernel.h>
57#include <sys/kthread.h>
58#include <sys/mac.h>
59#include <sys/malloc.h>
60#include <sys/mount.h>
61#include <sys/namei.h>
62#include <sys/sleepqueue.h>
63#include <sys/stat.h>
64#include <sys/sysctl.h>
65#include <sys/syslog.h>
66#include <sys/vmmeter.h>
67#include <sys/vnode.h>
68
69#include <vm/vm.h>
70#include <vm/vm_object.h>
71#include <vm/vm_extern.h>
72#include <vm/pmap.h>
73#include <vm/vm_map.h>
74#include <vm/vm_page.h>
75#include <vm/vm_kern.h>
76#include <vm/uma.h>
77
78static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
79
80static void	addalias(struct vnode *vp, struct cdev *nvp_rdev);
81static void	delmntque(struct vnode *vp);
82static void	insmntque(struct vnode *vp, struct mount *mp);
83static void	vclean(struct vnode *vp, int flags, struct thread *td);
84static void	vlruvp(struct vnode *vp);
85static int	flushbuflist(struct buf *blist, int flags, struct vnode *vp,
86		    int slpflag, int slptimeo, int *errorp);
87static void	syncer_shutdown(void *arg, int howto);
88static int	vtryrecycle(struct vnode *vp);
89static void	vx_lock(struct vnode *vp);
90static void	vx_unlock(struct vnode *vp);
91static void	vgonechrl(struct vnode *vp, struct thread *td);
92
93
94/*
95 * Number of vnodes in existence.  Increased whenever getnewvnode()
96 * allocates a new vnode, never decreased.
97 */
98static unsigned long	numvnodes;
99
100SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
101
102/*
103 * Tables for converting from vnode types to inode formats
104 * and back.
105 */
106enum vtype iftovt_tab[16] = {
107	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
108	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
109};
110int vttoif_tab[9] = {
111	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
112	S_IFSOCK, S_IFIFO, S_IFMT,
113};
114
115/*
116 * List of vnodes that are ready for recycling.
117 */
118static TAILQ_HEAD(freelst, vnode) vnode_free_list;
119
120/*
121 * Minimum number of free vnodes.  If there are fewer free vnodes than this,
122 * getnewvnode() will return a newly allocated vnode.
123 */
124static u_long wantfreevnodes = 25;
125SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
126/* Number of vnodes in the free list. */
127static u_long freevnodes;
128SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
129
130/*
131 * Various variables used for debugging the new implementation of
132 * reassignbuf().
133 * XXX these are probably of (very) limited utility now.
134 */
135static int reassignbufcalls;
136SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
137static int nameileafonly;
138SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");
139
140/*
141 * Cache for the mount type id assigned to NFS.  This is used for
142 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
143 */
144int	nfs_mount_type = -1;
145
146/* To keep more than one thread at a time from running vfs_getnewfsid */
147static struct mtx mntid_mtx;
148
149/*
150 * Lock for any access to the following:
151 *	vnode_free_list
152 *	numvnodes
153 *	freevnodes
154 */
155static struct mtx vnode_free_list_mtx;
156
157/*
158 * For any iteration/modification of dev->si_hlist (linked through
159 * v_specnext)
160 */
161static struct mtx spechash_mtx;
162
163/* Publicly exported FS */
164struct nfs_public nfs_pub;
165
166/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
167static uma_zone_t vnode_zone;
168static uma_zone_t vnodepoll_zone;
169
170/* Set to 1 to print out reclaim of active vnodes */
171int	prtactive;
172
173/*
174 * The workitem queue.
175 *
176 * It is useful to delay writes of file data and filesystem metadata
177 * for tens of seconds so that quickly created and deleted files need
178 * not waste disk bandwidth being created and removed. To realize this,
179 * we append vnodes to a "workitem" queue. When running with a soft
180 * updates implementation, most pending metadata dependencies should
181 * not wait for more than a few seconds. Thus, metadata on mounted block
182 * devices is delayed only about half the time that file data is delayed.
183 * Similarly, directory updates are more critical, so they are only delayed
184 * about a third of the time that file data is delayed. Thus, there are
185 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
186 * one each second (driven off the filesystem syncer process). The
187 * syncer_delayno variable indicates the next queue that is to be processed.
188 * Items that need to be processed soon are placed in this queue:
189 *
190 *	syncer_workitem_pending[syncer_delayno]
191 *
192 * A delay of fifteen seconds is done by placing the request fifteen
193 * entries later in the queue:
194 *
195 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
196 *
197 */
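/*
 * A worked example of the slot arithmetic: with SYNCER_MAXDELAY of 32 the
 * table allocated in vntblinit() ends up with 32 entries and syncer_mask
 * is 31, so if syncer_delayno currently sits at 20, a vnode queued with a
 * delay of 15 seconds lands in slot (20 + 15) & 31 == 3, which the
 * once-per-second syncer loop reaches roughly 15 seconds later.
 */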
198static int syncer_delayno;
199static long syncer_mask;
200LIST_HEAD(synclist, vnode);
201static struct synclist *syncer_workitem_pending;
202/*
203 * The sync_mtx protects:
204 *	vp->v_synclist
205 *	sync_vnode_count
206 *	syncer_delayno
207 *	syncer_state
208 *	syncer_workitem_pending
209 *	syncer_worklist_len
210 *	rushjob
211 */
212static struct mtx sync_mtx;
213
214#define SYNCER_MAXDELAY		32
215static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
216static int syncdelay = 30;		/* max time to delay syncing data */
217static int filedelay = 30;		/* time to delay syncing files */
218SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
219static int dirdelay = 29;		/* time to delay syncing directories */
220SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
221static int metadelay = 28;		/* time to delay syncing metadata */
222SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
223static int rushjob;		/* number of slots to run ASAP */
224static int stat_rush_requests;	/* number of times I/O sped up */
225SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
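/*
 * The delays above are runtime tunables.  For example, an administrator
 * wanting file data pushed out more aggressively could lower the file
 * delay with something like:
 *
 *	sysctl kern.filedelay=10
 *
 * kern.dirdelay and kern.metadelay can be adjusted the same way.
 */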
226
227/*
228 * When shutting down the syncer, run it at four times normal speed.
229 */
230#define SYNCER_SHUTDOWN_SPEEDUP		4
231static int sync_vnode_count;
232static int syncer_worklist_len;
233static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
234    syncer_state;
235
236/*
237 * Number of vnodes we want to exist at any one time.  This is mostly used
238 * to size hash tables in vnode-related code.  It is normally not used in
239 * getnewvnode(), as wantfreevnodes is normally nonzero.
240 *
241 * XXX desiredvnodes is historical cruft and should not exist.
242 */
243int desiredvnodes;
244SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
245    &desiredvnodes, 0, "Maximum number of vnodes");
246static int minvnodes;
247SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
248    &minvnodes, 0, "Minimum number of vnodes");
249static int vnlru_nowhere;
250SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
251    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
252
253/* Hook for calling soft updates. */
254int (*softdep_process_worklist_hook)(struct mount *);
255
256/*
257 * Initialize the vnode management data structures.
258 */
259static void
260vntblinit(void *dummy __unused)
261{
262
263	/*
264	 * Desiredvnodes is a function of the physical memory size and
265	 * the kernel's heap size.  Specifically, desiredvnodes scales
266	 * in proportion to the physical memory size until two fifths
267	 * of the kernel's heap size is consumed by vnodes and vm
268	 * objects.
269	 */
270	desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
271	    (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
272	minvnodes = desiredvnodes / 4;
273	mtx_init(&mountlist_mtx, "mountlist", NULL, MTX_DEF);
274	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
275	mtx_init(&spechash_mtx, "spechash", NULL, MTX_DEF);
276	TAILQ_INIT(&vnode_free_list);
277	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
278	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
279	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
280	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
281	      NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
282	/*
283	 * Initialize the filesystem syncer.
284	 */
285	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
286		&syncer_mask);
287	syncer_maxdelay = syncer_mask + 1;
288	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
289}
290SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)
291
292
293/*
294 * Mark a mount point as busy. Used to synchronize access and to delay
295 * unmounting. Interlock is not released on failure.
296 */
297int
298vfs_busy(mp, flags, interlkp, td)
299	struct mount *mp;
300	int flags;
301	struct mtx *interlkp;
302	struct thread *td;
303{
304	int lkflags;
305
306	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
307		if (flags & LK_NOWAIT)
308			return (ENOENT);
309		mp->mnt_kern_flag |= MNTK_MWAIT;
310		/*
311		 * Since all busy locks are shared except the exclusive
312		 * lock granted when unmounting, the only place that a
313		 * wakeup needs to be done is at the release of the
314		 * exclusive lock at the end of dounmount.
315		 */
316		msleep(mp, interlkp, PVFS, "vfs_busy", 0);
317		return (ENOENT);
318	}
319	lkflags = LK_SHARED | LK_NOPAUSE;
320	if (interlkp)
321		lkflags |= LK_INTERLOCK;
322	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td))
323		panic("vfs_busy: unexpected lock failure");
324	return (0);
325}
326
327/*
328 * Free a busy filesystem.
329 */
330void
331vfs_unbusy(mp, td)
332	struct mount *mp;
333	struct thread *td;
334{
335
336	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
337}
338
339/*
340 * Lookup a mount point by filesystem identifier.
341 */
342struct mount *
343vfs_getvfs(fsid)
344	fsid_t *fsid;
345{
346	register struct mount *mp;
347
348	mtx_lock(&mountlist_mtx);
349	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
350		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
351		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
352			mtx_unlock(&mountlist_mtx);
353			return (mp);
354		}
355	}
356	mtx_unlock(&mountlist_mtx);
357	return ((struct mount *) 0);
358}
359
360/*
361 * Check if a user can access privileged mount options.
362 */
363int
364vfs_suser(struct mount *mp, struct thread *td)
365{
366	int error;
367
368	if ((mp->mnt_flag & MNT_USER) == 0 ||
369	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
370		if ((error = suser(td)) != 0)
371			return (error);
372	}
373	return (0);
374}
375
376/*
377 * Get a new unique fsid.  Try to make its val[0] unique, since this value
378 * will be used to create fake device numbers for stat().  Also try (but
379 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
380 * support 16-bit device numbers.  We end up with unique val[0]'s for the
381 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
382 *
383 * Keep in mind that several mounts may be running in parallel.  Starting
384 * the search one past where the previous search terminated is both a
385 * micro-optimization and a defense against returning the same fsid to
386 * different mounts.
387 */
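/*
 * Concretely, with a filesystem type number of 5 and an mntid_base of
 * 0x1234, the loop below hands makedev() a minor number of 0x05120034:
 * the low byte of the type sits in bits 24-31, the high byte of
 * mntid_base in bits 16-23 and its low byte in bits 0-7, with bits 8-15
 * left clear.  val[1] simply holds the raw type number.
 */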
388void
389vfs_getnewfsid(mp)
390	struct mount *mp;
391{
392	static u_int16_t mntid_base;
393	fsid_t tfsid;
394	int mtype;
395
396	mtx_lock(&mntid_mtx);
397	mtype = mp->mnt_vfc->vfc_typenum;
398	tfsid.val[1] = mtype;
399	mtype = (mtype & 0xFF) << 24;
400	for (;;) {
401		tfsid.val[0] = makedev(255,
402		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
403		mntid_base++;
404		if (vfs_getvfs(&tfsid) == NULL)
405			break;
406	}
407	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
408	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
409	mtx_unlock(&mntid_mtx);
410}
411
412/*
413 * Knob to control the precision of file timestamps:
414 *
415 *   0 = seconds only; nanoseconds zeroed.
416 *   1 = seconds and nanoseconds, accurate within 1/HZ.
417 *   2 = seconds and nanoseconds, truncated to microseconds.
418 * >=3 = seconds and nanoseconds, maximum precision.
419 */
420enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
421
422static int timestamp_precision = TSP_SEC;
423SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
424    &timestamp_precision, 0, "");
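/*
 * For example, "sysctl vfs.timestamp_precision=3" selects full nanosecond
 * resolution (TSP_NSEC), while the default of 0 (TSP_SEC) records whole
 * seconds and zeroes tv_nsec, which is the cheapest of the options above.
 */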
425
426/*
427 * Get a current timestamp.
428 */
429void
430vfs_timestamp(tsp)
431	struct timespec *tsp;
432{
433	struct timeval tv;
434
435	switch (timestamp_precision) {
436	case TSP_SEC:
437		tsp->tv_sec = time_second;
438		tsp->tv_nsec = 0;
439		break;
440	case TSP_HZ:
441		getnanotime(tsp);
442		break;
443	case TSP_USEC:
444		microtime(&tv);
445		TIMEVAL_TO_TIMESPEC(&tv, tsp);
446		break;
447	case TSP_NSEC:
448	default:
449		nanotime(tsp);
450		break;
451	}
452}
453
454/*
455 * Set vnode attributes to VNOVAL
456 */
457void
458vattr_null(vap)
459	register struct vattr *vap;
460{
461
462	vap->va_type = VNON;
463	vap->va_size = VNOVAL;
464	vap->va_bytes = VNOVAL;
465	vap->va_mode = VNOVAL;
466	vap->va_nlink = VNOVAL;
467	vap->va_uid = VNOVAL;
468	vap->va_gid = VNOVAL;
469	vap->va_fsid = VNOVAL;
470	vap->va_fileid = VNOVAL;
471	vap->va_blocksize = VNOVAL;
472	vap->va_rdev = VNOVAL;
473	vap->va_atime.tv_sec = VNOVAL;
474	vap->va_atime.tv_nsec = VNOVAL;
475	vap->va_mtime.tv_sec = VNOVAL;
476	vap->va_mtime.tv_nsec = VNOVAL;
477	vap->va_ctime.tv_sec = VNOVAL;
478	vap->va_ctime.tv_nsec = VNOVAL;
479	vap->va_birthtime.tv_sec = VNOVAL;
480	vap->va_birthtime.tv_nsec = VNOVAL;
481	vap->va_flags = VNOVAL;
482	vap->va_gen = VNOVAL;
483	vap->va_vaflags = 0;
484}
485
486/*
487 * This routine is called when we have too many vnodes.  It attempts
488 * to free <count> vnodes and will potentially free vnodes that still
489 * have VM backing store (VM backing store is typically the cause
490 * of a vnode blowout so we want to do this).  Therefore, this operation
491 * is not considered cheap.
492 *
493 * A number of conditions may prevent a vnode from being reclaimed.
494 * The buffer cache may have references on the vnode, a directory
495 * vnode may still have references due to the namei cache representing
496 * underlying files, or the vnode may be in active use.  It is not
497 * desirable to reuse such vnodes.  These conditions may cause the
498 * number of vnodes to reach some minimum value regardless of what
499 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
500 */
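/*
 * Before tuning kern.maxvnodes it can be useful to compare it against the
 * counters exported above, e.g.:
 *
 *	sysctl vfs.numvnodes vfs.freevnodes kern.maxvnodes
 */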
501static int
502vlrureclaim(struct mount *mp)
503{
504	struct vnode *vp;
505	int done;
506	int trigger;
507	int usevnodes;
508	int count;
509
510	/*
511	 * Calculate the trigger point, don't allow user
512	 * screwups to blow us up.   This prevents us from
513	 * recycling vnodes with lots of resident pages.  We
514	 * aren't trying to free memory, we are trying to
515	 * free vnodes.
516	 */
517	usevnodes = desiredvnodes;
518	if (usevnodes <= 0)
519		usevnodes = 1;
520	trigger = cnt.v_page_count * 2 / usevnodes;
521
522	done = 0;
523	MNT_ILOCK(mp);
524	count = mp->mnt_nvnodelistsize / 10 + 1;
525	while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
526		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
527		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
528
529		if (vp->v_type != VNON &&
530		    vp->v_type != VBAD &&
531		    VI_TRYLOCK(vp)) {
532			if (VMIGHTFREE(vp) &&           /* critical path opt */
533			    (vp->v_object == NULL ||
534			    vp->v_object->resident_page_count < trigger)) {
535				MNT_IUNLOCK(mp);
536				vgonel(vp, curthread);
537				done++;
538				MNT_ILOCK(mp);
539			} else
540				VI_UNLOCK(vp);
541		}
542		--count;
543	}
544	MNT_IUNLOCK(mp);
545	return done;
546}
547
548/*
549 * Attempt to recycle vnodes in a context that is always safe to block.
550 * Calling vlrureclaim() from the bowels of filesystem code has some
551 * interesting deadlock problems.
552 */
553static struct proc *vnlruproc;
554static int vnlruproc_sig;
555
556static void
557vnlru_proc(void)
558{
559	struct mount *mp, *nmp;
560	int done;
561	struct proc *p = vnlruproc;
562	struct thread *td = FIRST_THREAD_IN_PROC(p);
563
564	mtx_lock(&Giant);
565
566	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
567	    SHUTDOWN_PRI_FIRST);
568
569	for (;;) {
570		kthread_suspend_check(p);
571		mtx_lock(&vnode_free_list_mtx);
572		if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
573			mtx_unlock(&vnode_free_list_mtx);
574			vnlruproc_sig = 0;
575			wakeup(&vnlruproc_sig);
576			tsleep(vnlruproc, PVFS, "vlruwt", hz);
577			continue;
578		}
579		mtx_unlock(&vnode_free_list_mtx);
580		done = 0;
581		mtx_lock(&mountlist_mtx);
582		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
583			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
584				nmp = TAILQ_NEXT(mp, mnt_list);
585				continue;
586			}
587			done += vlrureclaim(mp);
588			mtx_lock(&mountlist_mtx);
589			nmp = TAILQ_NEXT(mp, mnt_list);
590			vfs_unbusy(mp, td);
591		}
592		mtx_unlock(&mountlist_mtx);
593		if (done == 0) {
594#if 0
595			/* These messages are temporary debugging aids */
596			if (vnlru_nowhere < 5)
597				printf("vnlru process getting nowhere..\n");
598			else if (vnlru_nowhere == 5)
599				printf("vnlru process messages stopped.\n");
600#endif
601			vnlru_nowhere++;
602			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
603		}
604	}
605}
606
607static struct kproc_desc vnlru_kp = {
608	"vnlru",
609	vnlru_proc,
610	&vnlruproc
611};
612SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
613
614
615/*
616 * Routines having to do with the management of the vnode table.
617 */
618
619/*
620 * Check to see if a free vnode can be recycled. If it can,
621 * recycle it and return it with the vnode interlock held.
622 */
623static int
624vtryrecycle(struct vnode *vp)
625{
626	struct thread *td = curthread;
627	vm_object_t object;
628	struct mount *vnmp;
629	int error;
630
631	/* Don't recycle if we can't get the interlock */
632	if (!VI_TRYLOCK(vp))
633		return (EWOULDBLOCK);
634	/*
635	 * This vnode may be found and locked via some other list; if so, we
636	 * can't recycle it yet.
637	 */
638	if (vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
639		return (EWOULDBLOCK);
640	/*
641	 * Don't recycle if its filesystem is being suspended.
642	 */
643	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
644		VOP_UNLOCK(vp, 0, td);
645		return (EBUSY);
646	}
647
648	/*
649	 * Don't recycle if we still have cached pages.
650	 */
651	if (VOP_GETVOBJECT(vp, &object) == 0) {
652		VM_OBJECT_LOCK(object);
653		if (object->resident_page_count ||
654		    object->ref_count) {
655			VM_OBJECT_UNLOCK(object);
656			error = EBUSY;
657			goto done;
658		}
659		VM_OBJECT_UNLOCK(object);
660	}
661	if (LIST_FIRST(&vp->v_cache_src)) {
662		/*
663		 * note: nameileafonly sysctl is temporary,
664		 * for debugging only, and will eventually be
665		 * removed.
666		 */
667		if (nameileafonly > 0) {
668			/*
669			 * Do not reuse namei-cached directory
670			 * vnodes that have cached
671			 * subdirectories.
672			 */
673			if (cache_leaf_test(vp) < 0) {
674				error = EISDIR;
675				goto done;
676			}
677		} else if (nameileafonly < 0 ||
678			    vmiodirenable == 0) {
679			/*
680			 * Do not reuse namei-cached directory
681			 * vnodes if nameileafonly is -1 or
682			 * if VMIO backing for directories is
683			 * turned off (otherwise we reuse them
684			 * too quickly).
685			 */
686			error = EBUSY;
687			goto done;
688		}
689	}
690	/*
691	 * If we got this far, we need to acquire the interlock and see if
692	 * anyone picked up this vnode from another list.  If not, we will
693	 * mark it with XLOCK via vgonel() so that anyone who does find it
694	 * will skip over it.
695	 */
696	VI_LOCK(vp);
697	if (VSHOULDBUSY(vp) && (vp->v_iflag & VI_XLOCK) == 0) {
698		VI_UNLOCK(vp);
699		error = EBUSY;
700		goto done;
701	}
702	mtx_lock(&vnode_free_list_mtx);
703	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
704	vp->v_iflag &= ~VI_FREE;
705	mtx_unlock(&vnode_free_list_mtx);
706	vp->v_iflag |= VI_DOOMED;
707	if (vp->v_type != VBAD) {
708		VOP_UNLOCK(vp, 0, td);
709		vgonel(vp, td);
710		VI_LOCK(vp);
711	} else
712		VOP_UNLOCK(vp, 0, td);
713	vn_finished_write(vnmp);
714	return (0);
715done:
716	VOP_UNLOCK(vp, 0, td);
717	vn_finished_write(vnmp);
718	return (error);
719}
720
721/*
722 * Return the next vnode from the free list.
723 */
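/*
 * A typical caller is a filesystem's vnode allocation path.  Its general
 * shape, with "myfs", myfs_vnodeop_p and "ip" as stand-in names, is:
 *
 *	error = getnewvnode("myfs", mp, myfs_vnodeop_p, &vp);
 *	if (error)
 *		return (error);
 *	vp->v_data = ip;
 *	vp->v_type = VREG;
 *
 * followed by filesystem specific initialization and locking of the vnode.
 */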
724int
725getnewvnode(tag, mp, vops, vpp)
726	const char *tag;
727	struct mount *mp;
728	vop_t **vops;
729	struct vnode **vpp;
730{
731	struct vnode *vp = NULL;
732	struct vpollinfo *pollinfo = NULL;
733
734	mtx_lock(&vnode_free_list_mtx);
735
736	/*
737	 * Try to reuse vnodes if we hit the max.  This situation only
738	 * occurs in certain large-memory (2G+) configurations.  We cannot
739	 * attempt to directly reclaim vnodes due to nasty recursion
740	 * problems.
741	 */
742	while (numvnodes - freevnodes > desiredvnodes) {
743		if (vnlruproc_sig == 0) {
744			vnlruproc_sig = 1;      /* avoid unnecessary wakeups */
745			wakeup(vnlruproc);
746		}
747		mtx_unlock(&vnode_free_list_mtx);
748		tsleep(&vnlruproc_sig, PVFS, "vlruwk", hz);
749		mtx_lock(&vnode_free_list_mtx);
750	}
751
752	/*
753	 * Attempt to reuse a vnode already on the free list, allocating
754	 * a new vnode if we can't find one or if we have not reached a
755	 * good minimum for LRU performance.
756	 */
757
758	if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
759		int error;
760		int count;
761
762		for (count = 0; count < freevnodes; count++) {
763			vp = TAILQ_FIRST(&vnode_free_list);
764
765			KASSERT(vp->v_usecount == 0 &&
766			    (vp->v_iflag & VI_DOINGINACT) == 0,
767			    ("getnewvnode: free vnode isn't"));
768
769			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
770			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
771			mtx_unlock(&vnode_free_list_mtx);
772			error = vtryrecycle(vp);
773			mtx_lock(&vnode_free_list_mtx);
774			if (error == 0)
775				break;
776			vp = NULL;
777		}
778	}
779	if (vp) {
780		freevnodes--;
781		mtx_unlock(&vnode_free_list_mtx);
782
783#ifdef INVARIANTS
784		{
785			if (vp->v_data)
786				panic("cleaned vnode isn't");
787			if (vp->v_numoutput)
788				panic("Clean vnode has pending I/O's");
789			if (vp->v_writecount != 0)
790				panic("Non-zero write count");
791		}
792#endif
793		if ((pollinfo = vp->v_pollinfo) != NULL) {
794			/*
795			 * To avoid lock order reversals, the call to
796			 * uma_zfree() must be delayed until the vnode
797			 * interlock is released.
798			 */
799			vp->v_pollinfo = NULL;
800		}
801#ifdef MAC
802		mac_destroy_vnode(vp);
803#endif
804		vp->v_iflag = 0;
805		vp->v_vflag = 0;
806		vp->v_lastw = 0;
807		vp->v_lasta = 0;
808		vp->v_cstart = 0;
809		vp->v_clen = 0;
810		vp->v_socket = 0;
811		lockdestroy(vp->v_vnlock);
812		lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOPAUSE);
813		KASSERT(vp->v_cleanbufcnt == 0, ("cleanbufcnt not 0"));
814		KASSERT(vp->v_cleanblkroot == NULL, ("cleanblkroot not NULL"));
815		KASSERT(vp->v_dirtybufcnt == 0, ("dirtybufcnt not 0"));
816		KASSERT(vp->v_dirtyblkroot == NULL, ("dirtyblkroot not NULL"));
817	} else {
818		numvnodes++;
819		mtx_unlock(&vnode_free_list_mtx);
820
821		vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
822		mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
823		VI_LOCK(vp);
824		vp->v_dd = vp;
825		vp->v_vnlock = &vp->v_lock;
826		lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOPAUSE);
827		cache_purge(vp);		/* Sets up v_id. */
828		LIST_INIT(&vp->v_cache_src);
829		TAILQ_INIT(&vp->v_cache_dst);
830	}
831
832	TAILQ_INIT(&vp->v_cleanblkhd);
833	TAILQ_INIT(&vp->v_dirtyblkhd);
834	vp->v_type = VNON;
835	vp->v_tag = tag;
836	vp->v_op = vops;
837	*vpp = vp;
838	vp->v_usecount = 1;
839	vp->v_data = 0;
840	vp->v_cachedid = -1;
841	VI_UNLOCK(vp);
842	if (pollinfo != NULL) {
843		mtx_destroy(&pollinfo->vpi_lock);
844		uma_zfree(vnodepoll_zone, pollinfo);
845	}
846#ifdef MAC
847	mac_init_vnode(vp);
848	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
849		mac_associate_vnode_singlelabel(mp, vp);
850#endif
851	delmntque(vp);
852	if (mp != NULL) {
853		insmntque(vp, mp);
854		vp->v_bsize = mp->mnt_stat.f_iosize;
855	}
856
857	return (0);
858}
859
860/*
861 * Delete from old mount point vnode list, if on one.
862 */
863static void
864delmntque(struct vnode *vp)
865{
866	struct mount *mp;
867
868	if (vp->v_mount == NULL)
869		return;
870	mp = vp->v_mount;
871	MNT_ILOCK(mp);
872	vp->v_mount = NULL;
873	KASSERT(mp->mnt_nvnodelistsize > 0,
874		("bad mount point vnode list size"));
875	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
876	mp->mnt_nvnodelistsize--;
877	MNT_IUNLOCK(mp);
878}
879
880/*
881 * Insert into list of vnodes for the new mount point, if available.
882 */
883static void
884insmntque(struct vnode *vp, struct mount *mp)
885{
886
887	vp->v_mount = mp;
888	KASSERT(mp != NULL, ("Don't call insmntque(foo, NULL)"));
889	MNT_ILOCK(vp->v_mount);
890	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
891	mp->mnt_nvnodelistsize++;
892	MNT_IUNLOCK(vp->v_mount);
893}
894
895/*
896 * Update outstanding I/O count and do wakeup if requested.
897 */
898void
899vwakeup(bp)
900	register struct buf *bp;
901{
902	register struct vnode *vp;
903
904	bp->b_flags &= ~B_WRITEINPROG;
905	if ((vp = bp->b_vp)) {
906		VI_LOCK(vp);
907		vp->v_numoutput--;
908		if (vp->v_numoutput < 0)
909			panic("vwakeup: neg numoutput");
910		if ((vp->v_numoutput == 0) && (vp->v_iflag & VI_BWAIT)) {
911			vp->v_iflag &= ~VI_BWAIT;
912			wakeup(&vp->v_numoutput);
913		}
914		VI_UNLOCK(vp);
915	}
916}
917
918/*
919 * Flush out and invalidate all buffers associated with a vnode.
920 * Called with the underlying object locked.
921 */
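/*
 * The V_SAVE flag asks for dirty buffers to be written out (via VOP_FSYNC)
 * before everything is invalidated; without it pending data is simply
 * discarded.  V_NORMAL and V_ALT restrict the flush to regular or
 * alternate-data (BX_ALTDATA) buffers respectively.
 */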
922int
923vinvalbuf(vp, flags, cred, td, slpflag, slptimeo)
924	struct vnode *vp;
925	int flags;
926	struct ucred *cred;
927	struct thread *td;
928	int slpflag, slptimeo;
929{
930	struct buf *blist;
931	int error;
932	vm_object_t object;
933
934	GIANT_REQUIRED;
935
936	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
937
938	VI_LOCK(vp);
939	if (flags & V_SAVE) {
940		while (vp->v_numoutput) {
941			vp->v_iflag |= VI_BWAIT;
942			error = msleep(&vp->v_numoutput, VI_MTX(vp),
943			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
944			if (error) {
945				VI_UNLOCK(vp);
946				return (error);
947			}
948		}
949		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
950			VI_UNLOCK(vp);
951			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, td)) != 0)
952				return (error);
953			/*
954			 * XXX We could save a lock/unlock if this was only
955			 * enabled under INVARIANTS
956			 */
957			VI_LOCK(vp);
958			if (vp->v_numoutput > 0 ||
959			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
960				panic("vinvalbuf: dirty bufs");
961		}
962	}
963	/*
964	 * If you alter this loop please notice that interlock is dropped and
965	 * reacquired in flushbuflist.  Special care is needed to ensure that
966	 * no race conditions occur from this.
967	 */
968	for (error = 0;;) {
969		if ((blist = TAILQ_FIRST(&vp->v_cleanblkhd)) != 0 &&
970		    flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
971			if (error)
972				break;
973			continue;
974		}
975		if ((blist = TAILQ_FIRST(&vp->v_dirtyblkhd)) != 0 &&
976		    flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
977			if (error)
978				break;
979			continue;
980		}
981		break;
982	}
983	if (error) {
984		VI_UNLOCK(vp);
985		return (error);
986	}
987
988	/*
989	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
990	 * have write I/O in-progress but if there is a VM object then the
991	 * VM object can also have read-I/O in-progress.
992	 */
993	do {
994		while (vp->v_numoutput > 0) {
995			vp->v_iflag |= VI_BWAIT;
996			msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vnvlbv", 0);
997		}
998		VI_UNLOCK(vp);
999		if (VOP_GETVOBJECT(vp, &object) == 0) {
1000			VM_OBJECT_LOCK(object);
1001			vm_object_pip_wait(object, "vnvlbx");
1002			VM_OBJECT_UNLOCK(object);
1003		}
1004		VI_LOCK(vp);
1005	} while (vp->v_numoutput > 0);
1006	VI_UNLOCK(vp);
1007
1008	/*
1009	 * Destroy the copy in the VM cache, too.
1010	 */
1011	if (VOP_GETVOBJECT(vp, &object) == 0) {
1012		VM_OBJECT_LOCK(object);
1013		vm_object_page_remove(object, 0, 0,
1014			(flags & V_SAVE) ? TRUE : FALSE);
1015		VM_OBJECT_UNLOCK(object);
1016	}
1017
1018#ifdef INVARIANTS
1019	VI_LOCK(vp);
1020	if ((flags & (V_ALT | V_NORMAL)) == 0 &&
1021	    (!TAILQ_EMPTY(&vp->v_dirtyblkhd) ||
1022	     !TAILQ_EMPTY(&vp->v_cleanblkhd)))
1023		panic("vinvalbuf: flush failed");
1024	VI_UNLOCK(vp);
1025#endif
1026	return (0);
1027}
1028
1029/*
1030 * Flush out buffers on the specified list.
1031 *
1032 */
1033static int
1034flushbuflist(blist, flags, vp, slpflag, slptimeo, errorp)
1035	struct buf *blist;
1036	int flags;
1037	struct vnode *vp;
1038	int slpflag, slptimeo;
1039	int *errorp;
1040{
1041	struct buf *bp, *nbp;
1042	int found, error;
1043
1044	ASSERT_VI_LOCKED(vp, "flushbuflist");
1045
1046	for (found = 0, bp = blist; bp; bp = nbp) {
1047		nbp = TAILQ_NEXT(bp, b_vnbufs);
1048		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1049		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1050			continue;
1051		}
1052		found += 1;
1053		error = BUF_TIMELOCK(bp,
1054		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp),
1055		    "flushbuf", slpflag, slptimeo);
1056		if (error) {
1057			if (error != ENOLCK)
1058				*errorp = error;
1059			goto done;
1060		}
1061		/*
1062		 * XXX Since there are no node locks for NFS, I
1063		 * believe there is a slight chance that a delayed
1064		 * write will occur while sleeping just above, so
1065		 * check for it.  Note that vfs_bio_awrite expects
1066		 * buffers to reside on a queue, while bwrite and
1067		 * brelse do not.
1068		 */
1069		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1070			(flags & V_SAVE)) {
1071
1072			if (bp->b_vp == vp) {
1073				if (bp->b_flags & B_CLUSTEROK) {
1074					vfs_bio_awrite(bp);
1075				} else {
1076					bremfree(bp);
1077					bp->b_flags |= B_ASYNC;
1078					bwrite(bp);
1079				}
1080			} else {
1081				bremfree(bp);
1082				(void) bwrite(bp);
1083			}
1084			goto done;
1085		}
1086		bremfree(bp);
1087		bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
1088		bp->b_flags &= ~B_ASYNC;
1089		brelse(bp);
1090		VI_LOCK(vp);
1091	}
1092	return (found);
1093done:
1094	VI_LOCK(vp);
1095	return (found);
1096}
1097
1098/*
1099 * Truncate a file's buffer and pages to a specified length.  This
1100 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1101 * sync activity.
1102 */
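/*
 * A filesystem truncating a file to "length" bytes would typically call
 * this as vtruncbuf(vp, cred, td, length, fs_bsize), where fs_bsize stands
 * for the filesystem block size.  Buffers lying wholly beyond the new end
 * of file are discarded and the VM object is resized through
 * vnode_pager_setsize() below.
 */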
1103int
1104vtruncbuf(vp, cred, td, length, blksize)
1105	register struct vnode *vp;
1106	struct ucred *cred;
1107	struct thread *td;
1108	off_t length;
1109	int blksize;
1110{
1111	register struct buf *bp;
1112	struct buf *nbp;
1113	int anyfreed;
1114	int trunclbn;
1115
1116	/*
1117	 * Round up to the *next* lbn.
1118	 */
1119	trunclbn = (length + blksize - 1) / blksize;
1120
1121	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1122restart:
1123	VI_LOCK(vp);
1124	anyfreed = 1;
1125	for (;anyfreed;) {
1126		anyfreed = 0;
1127		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
1128			nbp = TAILQ_NEXT(bp, b_vnbufs);
1129			if (bp->b_lblkno >= trunclbn) {
1130				if (BUF_LOCK(bp,
1131				    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1132				    VI_MTX(vp)) == ENOLCK)
1133					goto restart;
1134
1135				bremfree(bp);
1136				bp->b_flags |= (B_INVAL | B_RELBUF);
1137				bp->b_flags &= ~B_ASYNC;
1138				brelse(bp);
1139				anyfreed = 1;
1140
1141				if (nbp &&
1142				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1143				    (nbp->b_vp != vp) ||
1144				    (nbp->b_flags & B_DELWRI))) {
1145					goto restart;
1146				}
1147				VI_LOCK(vp);
1148			}
1149		}
1150
1151		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1152			nbp = TAILQ_NEXT(bp, b_vnbufs);
1153			if (bp->b_lblkno >= trunclbn) {
1154				if (BUF_LOCK(bp,
1155				    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1156				    VI_MTX(vp)) == ENOLCK)
1157					goto restart;
1158				bremfree(bp);
1159				bp->b_flags |= (B_INVAL | B_RELBUF);
1160				bp->b_flags &= ~B_ASYNC;
1161				brelse(bp);
1162				anyfreed = 1;
1163				if (nbp &&
1164				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1165				    (nbp->b_vp != vp) ||
1166				    (nbp->b_flags & B_DELWRI) == 0)) {
1167					goto restart;
1168				}
1169				VI_LOCK(vp);
1170			}
1171		}
1172	}
1173
1174	if (length > 0) {
1175restartsync:
1176		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1177			nbp = TAILQ_NEXT(bp, b_vnbufs);
1178			if (bp->b_lblkno > 0)
1179				continue;
1180			/*
1181			 * Since we hold the vnode lock this should only
1182			 * fail if we're racing with the buf daemon.
1183			 */
1184			if (BUF_LOCK(bp,
1185			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1186			    VI_MTX(vp)) == ENOLCK) {
1187				goto restart;
1188			}
1189			KASSERT((bp->b_flags & B_DELWRI),
1190			    ("buf(%p) on dirty queue without DELWRI", bp));
1191
1192			bremfree(bp);
1193			bawrite(bp);
1194			VI_LOCK(vp);
1195			goto restartsync;
1196		}
1197	}
1198
1199	while (vp->v_numoutput > 0) {
1200		vp->v_iflag |= VI_BWAIT;
1201		msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vbtrunc", 0);
1202	}
1203	VI_UNLOCK(vp);
1204	vnode_pager_setsize(vp, length);
1205
1206	return (0);
1207}
1208
1209/*
1210 * buf_splay() - splay tree core for the clean/dirty list of buffers in
1211 * 		 a vnode.
1212 *
1213 *	NOTE: We have to deal with the special case of a background bitmap
1214 *	buffer, a situation where two buffers will have the same logical
1215 *	block offset.  We want (1) only the foreground buffer to be accessed
1216 *	in a lookup and (2) must differentiate between the foreground and
1217 *	background buffer in the splay tree algorithm because the splay
1218 *	tree cannot normally handle multiple entities with the same 'index'.
1219 *	We accomplish this by adding differentiating flags to the splay tree's
1220 *	numerical domain.
1221 */
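/*
 * In other words, the effective splay key is the pair
 * (b_lblkno, BX_BKGRDMARKER bit) compared lexicographically, so a
 * foreground buffer (marker clear) always sorts just before its background
 * shadow at the same logical block, and lookups done with an xflags of 0
 * land on the foreground buffer.
 */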
1222static
1223struct buf *
1224buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
1225{
1226	struct buf dummy;
1227	struct buf *lefttreemax, *righttreemin, *y;
1228
1229	if (root == NULL)
1230		return (NULL);
1231	lefttreemax = righttreemin = &dummy;
1232	for (;;) {
1233		if (lblkno < root->b_lblkno ||
1234		    (lblkno == root->b_lblkno &&
1235		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1236			if ((y = root->b_left) == NULL)
1237				break;
1238			if (lblkno < y->b_lblkno) {
1239				/* Rotate right. */
1240				root->b_left = y->b_right;
1241				y->b_right = root;
1242				root = y;
1243				if ((y = root->b_left) == NULL)
1244					break;
1245			}
1246			/* Link into the new root's right tree. */
1247			righttreemin->b_left = root;
1248			righttreemin = root;
1249		} else if (lblkno > root->b_lblkno ||
1250		    (lblkno == root->b_lblkno &&
1251		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
1252			if ((y = root->b_right) == NULL)
1253				break;
1254			if (lblkno > y->b_lblkno) {
1255				/* Rotate left. */
1256				root->b_right = y->b_left;
1257				y->b_left = root;
1258				root = y;
1259				if ((y = root->b_right) == NULL)
1260					break;
1261			}
1262			/* Link into the new root's left tree. */
1263			lefttreemax->b_right = root;
1264			lefttreemax = root;
1265		} else {
1266			break;
1267		}
1268		root = y;
1269	}
1270	/* Assemble the new root. */
1271	lefttreemax->b_right = root->b_left;
1272	righttreemin->b_left = root->b_right;
1273	root->b_left = dummy.b_right;
1274	root->b_right = dummy.b_left;
1275	return (root);
1276}
1277
1278static
1279void
1280buf_vlist_remove(struct buf *bp)
1281{
1282	struct vnode *vp = bp->b_vp;
1283	struct buf *root;
1284
1285	ASSERT_VI_LOCKED(vp, "buf_vlist_remove");
1286	if (bp->b_xflags & BX_VNDIRTY) {
1287		if (bp != vp->v_dirtyblkroot) {
1288			root = buf_splay(bp->b_lblkno, bp->b_xflags,
1289			    vp->v_dirtyblkroot);
1290			KASSERT(root == bp,
1291			    ("splay lookup failed during dirty remove"));
1292		}
1293		if (bp->b_left == NULL) {
1294			root = bp->b_right;
1295		} else {
1296			root = buf_splay(bp->b_lblkno, bp->b_xflags,
1297			    bp->b_left);
1298			root->b_right = bp->b_right;
1299		}
1300		vp->v_dirtyblkroot = root;
1301		TAILQ_REMOVE(&vp->v_dirtyblkhd, bp, b_vnbufs);
1302		vp->v_dirtybufcnt--;
1303	} else {
1304		/* KASSERT(bp->b_xflags & BX_VNCLEAN, ("bp wasn't clean")); */
1305		if (bp != vp->v_cleanblkroot) {
1306			root = buf_splay(bp->b_lblkno, bp->b_xflags,
1307			    vp->v_cleanblkroot);
1308			KASSERT(root == bp,
1309			    ("splay lookup failed during clean remove"));
1310		}
1311		if (bp->b_left == NULL) {
1312			root = bp->b_right;
1313		} else {
1314			root = buf_splay(bp->b_lblkno, bp->b_xflags,
1315			    bp->b_left);
1316			root->b_right = bp->b_right;
1317		}
1318		vp->v_cleanblkroot = root;
1319		TAILQ_REMOVE(&vp->v_cleanblkhd, bp, b_vnbufs);
1320		vp->v_cleanbufcnt--;
1321	}
1322	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1323}
1324
1325/*
1326 * Add the buffer to the sorted clean or dirty block list using a
1327 * splay tree algorithm.
1328 *
1329 * NOTE: xflags is passed as a constant, optimizing this inline function!
1330 */
1331static
1332void
1333buf_vlist_add(struct buf *bp, struct vnode *vp, b_xflags_t xflags)
1334{
1335	struct buf *root;
1336
1337	ASSERT_VI_LOCKED(vp, "buf_vlist_add");
1338	bp->b_xflags |= xflags;
1339	if (xflags & BX_VNDIRTY) {
1340		root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_dirtyblkroot);
1341		if (root == NULL) {
1342			bp->b_left = NULL;
1343			bp->b_right = NULL;
1344			TAILQ_INSERT_TAIL(&vp->v_dirtyblkhd, bp, b_vnbufs);
1345		} else if (bp->b_lblkno < root->b_lblkno ||
1346		    (bp->b_lblkno == root->b_lblkno &&
1347		    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1348			bp->b_left = root->b_left;
1349			bp->b_right = root;
1350			root->b_left = NULL;
1351			TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
1352		} else {
1353			bp->b_right = root->b_right;
1354			bp->b_left = root;
1355			root->b_right = NULL;
1356			TAILQ_INSERT_AFTER(&vp->v_dirtyblkhd,
1357			    root, bp, b_vnbufs);
1358		}
1359		vp->v_dirtybufcnt++;
1360		vp->v_dirtyblkroot = bp;
1361	} else {
1362		/* KASSERT(xflags & BX_VNCLEAN, ("xflags not clean")); */
1363		root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_cleanblkroot);
1364		if (root == NULL) {
1365			bp->b_left = NULL;
1366			bp->b_right = NULL;
1367			TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
1368		} else if (bp->b_lblkno < root->b_lblkno ||
1369		    (bp->b_lblkno == root->b_lblkno &&
1370		    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1371			bp->b_left = root->b_left;
1372			bp->b_right = root;
1373			root->b_left = NULL;
1374			TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
1375		} else {
1376			bp->b_right = root->b_right;
1377			bp->b_left = root;
1378			root->b_right = NULL;
1379			TAILQ_INSERT_AFTER(&vp->v_cleanblkhd,
1380			    root, bp, b_vnbufs);
1381		}
1382		vp->v_cleanbufcnt++;
1383		vp->v_cleanblkroot = bp;
1384	}
1385}
1386
1387/*
1388 * Lookup a buffer using the splay tree.  Note that we specifically avoid
1389 * shadow buffers used in background bitmap writes.
1390 *
1391 * This code isn't quite as efficient as it could be because we are maintaining
1392 * two sorted lists and do not know which list the block resides in.
1393 *
1394 * During a "make buildworld" the desired buffer is found at one of
1395 * the roots more than 60% of the time.  Thus, checking both roots
1396 * before performing either splay eliminates unnecessary splays on the
1397 * first tree splayed.
1398 */
1399struct buf *
1400gbincore(struct vnode *vp, daddr_t lblkno)
1401{
1402	struct buf *bp;
1403
1404	GIANT_REQUIRED;
1405
1406	ASSERT_VI_LOCKED(vp, "gbincore");
1407	if ((bp = vp->v_cleanblkroot) != NULL &&
1408	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1409		return (bp);
1410	if ((bp = vp->v_dirtyblkroot) != NULL &&
1411	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1412		return (bp);
1413	if ((bp = vp->v_cleanblkroot) != NULL) {
1414		vp->v_cleanblkroot = bp = buf_splay(lblkno, 0, bp);
1415		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1416			return (bp);
1417	}
1418	if ((bp = vp->v_dirtyblkroot) != NULL) {
1419		vp->v_dirtyblkroot = bp = buf_splay(lblkno, 0, bp);
1420		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1421			return (bp);
1422	}
1423	return (NULL);
1424}
1425
1426/*
1427 * Associate a buffer with a vnode.
1428 */
1429void
1430bgetvp(vp, bp)
1431	register struct vnode *vp;
1432	register struct buf *bp;
1433{
1434
1435	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
1436
1437	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1438	    ("bgetvp: bp already attached! %p", bp));
1439
1440	ASSERT_VI_LOCKED(vp, "bgetvp");
1441	vholdl(vp);
1442	bp->b_vp = vp;
1443	bp->b_dev = vn_todev(vp);
1444	/*
1445	 * Insert onto list for new vnode.
1446	 */
1447	buf_vlist_add(bp, vp, BX_VNCLEAN);
1448}
1449
1450/*
1451 * Disassociate a buffer from a vnode.
1452 */
1453void
1454brelvp(bp)
1455	register struct buf *bp;
1456{
1457	struct vnode *vp;
1458
1459	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1460
1461	/*
1462	 * Delete from old vnode list, if on one.
1463	 */
1464	vp = bp->b_vp;
1465	VI_LOCK(vp);
1466	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1467		buf_vlist_remove(bp);
1468	if ((vp->v_iflag & VI_ONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
1469		vp->v_iflag &= ~VI_ONWORKLST;
1470		mtx_lock(&sync_mtx);
1471		LIST_REMOVE(vp, v_synclist);
1472 		syncer_worklist_len--;
1473		mtx_unlock(&sync_mtx);
1474	}
1475	vdropl(vp);
1476	bp->b_vp = (struct vnode *) 0;
1477	if (bp->b_object)
1478		bp->b_object = NULL;
1479	VI_UNLOCK(vp);
1480}
1481
1482/*
1483 * Add an item to the syncer work queue.
1484 */
1485static void
1486vn_syncer_add_to_worklist(struct vnode *vp, int delay)
1487{
1488	int slot;
1489
1490	ASSERT_VI_LOCKED(vp, "vn_syncer_add_to_worklist");
1491
1492	mtx_lock(&sync_mtx);
1493	if (vp->v_iflag & VI_ONWORKLST)
1494		LIST_REMOVE(vp, v_synclist);
1495	else {
1496		vp->v_iflag |= VI_ONWORKLST;
1497 		syncer_worklist_len++;
1498	}
1499
1500	if (delay > syncer_maxdelay - 2)
1501		delay = syncer_maxdelay - 2;
1502	slot = (syncer_delayno + delay) & syncer_mask;
1503
1504	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
1505	mtx_unlock(&sync_mtx);
1506}
1507
1508static int
1509sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
1510{
1511	int error, len;
1512
1513	mtx_lock(&sync_mtx);
1514	len = syncer_worklist_len - sync_vnode_count;
1515	mtx_unlock(&sync_mtx);
1516	error = SYSCTL_OUT(req, &len, sizeof(len));
1517	return (error);
1518}
1519
1520SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
1521    sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
1522
1523struct  proc *updateproc;
1524static void sched_sync(void);
1525static struct kproc_desc up_kp = {
1526	"syncer",
1527	sched_sync,
1528	&updateproc
1529};
1530SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
1531
1532/*
1533 * System filesystem synchronizer daemon.
1534 */
1535static void
1536sched_sync(void)
1537{
1538	struct synclist *next;
1539	struct synclist *slp;
1540	struct vnode *vp;
1541	struct mount *mp;
1542	long starttime;
1543	struct thread *td = FIRST_THREAD_IN_PROC(updateproc);
1544	static int dummychan;
1545	int last_work_seen;
1546	int net_worklist_len;
1547	int syncer_final_iter;
1548
1549	mtx_lock(&Giant);
1550	last_work_seen = 0;
1551	syncer_final_iter = 0;
1552	syncer_state = SYNCER_RUNNING;
1553	starttime = time_second;
1554
1555	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
1556	    SHUTDOWN_PRI_LAST);
1557
1558	for (;;) {
1559		mtx_lock(&sync_mtx);
1560		if (syncer_state == SYNCER_FINAL_DELAY &&
1561		    syncer_final_iter == 0) {
1562			mtx_unlock(&sync_mtx);
1563			kthread_suspend_check(td->td_proc);
1564			mtx_lock(&sync_mtx);
1565		}
1566		net_worklist_len = syncer_worklist_len - sync_vnode_count;
1567		if (syncer_state != SYNCER_RUNNING && starttime != time_second)
1568			printf("%d ", net_worklist_len);
1569		starttime = time_second;
1570
1571		/*
1572		 * Push files whose dirty time has expired.  Be careful
1573		 * of interrupt race on slp queue.
1574		 *
1575		 * Skip over empty worklist slots when shutting down.
1576		 */
1577		do {
1578			slp = &syncer_workitem_pending[syncer_delayno];
1579			syncer_delayno += 1;
1580			if (syncer_delayno == syncer_maxdelay)
1581				syncer_delayno = 0;
1582			next = &syncer_workitem_pending[syncer_delayno];
1583			/*
1584			 * If the worklist has wrapped since it
1585			 * was emptied of all but syncer vnodes,
1586			 * switch to the FINAL_DELAY state and run
1587			 * for one more second.
1588			 */
1589			if (syncer_state == SYNCER_SHUTTING_DOWN &&
1590			    net_worklist_len == 0 &&
1591			    last_work_seen == syncer_delayno) {
1592				syncer_state = SYNCER_FINAL_DELAY;
1593				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
1594			}
1595		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
1596		    syncer_worklist_len > 0);
1597
1598		/*
1599		 * Keep track of the last time there was anything
1600		 * on the worklist other than syncer vnodes.
1601		 * Return to the SHUTTING_DOWN state if any
1602		 * new work appears.
1603		 */
1604		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
1605			last_work_seen = syncer_delayno;
1606		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
1607			syncer_state = SYNCER_SHUTTING_DOWN;
1608		while ((vp = LIST_FIRST(slp)) != NULL) {
1609			if (VOP_ISLOCKED(vp, NULL) != 0 ||
1610			    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1611				LIST_REMOVE(vp, v_synclist);
1612				LIST_INSERT_HEAD(next, vp, v_synclist);
1613				continue;
1614			}
1615			if (VI_TRYLOCK(vp) == 0) {
1616				LIST_REMOVE(vp, v_synclist);
1617				LIST_INSERT_HEAD(next, vp, v_synclist);
1618				vn_finished_write(mp);
1619				continue;
1620			}
1621			/*
1622			 * We use vhold in case the vnode does not
1623			 * successfully sync.  vhold prevents the vnode from
1624			 * going away when we unlock the sync_mtx so that
1625			 * we can acquire the vnode interlock.
1626			 */
1627			vholdl(vp);
1628			mtx_unlock(&sync_mtx);
1629			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, td);
1630			(void) VOP_FSYNC(vp, td->td_ucred, MNT_LAZY, td);
1631			VOP_UNLOCK(vp, 0, td);
1632			vn_finished_write(mp);
1633			VI_LOCK(vp);
1634			if ((vp->v_iflag & VI_ONWORKLST) != 0) {
1635				/*
1636				 * Put us back on the worklist.  The worklist
1637				 * routine will remove us from our current
1638				 * position and then add us back in at a later
1639				 * position.
1640				 */
1641				vn_syncer_add_to_worklist(vp, syncdelay);
1642			}
1643			vdropl(vp);
1644			VI_UNLOCK(vp);
1645			mtx_lock(&sync_mtx);
1646		}
1647		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
1648			syncer_final_iter--;
1649		mtx_unlock(&sync_mtx);
1650
1651		/*
1652		 * Do soft update processing.
1653		 */
1654		if (softdep_process_worklist_hook != NULL)
1655			(*softdep_process_worklist_hook)(NULL);
1656
1657		/*
1658		 * The variable rushjob allows the kernel to speed up the
1659		 * processing of the filesystem syncer process. A rushjob
1660		 * value of N tells the filesystem syncer to process the next
1661		 * N seconds worth of work on its queue ASAP. Currently rushjob
1662		 * is used by the soft update code to speed up the filesystem
1663		 * syncer process when the incore state is getting so far
1664		 * ahead of the disk that the kernel memory pool is being
1665		 * threatened with exhaustion.
1666		 */
1667		mtx_lock(&sync_mtx);
1668		if (rushjob > 0) {
1669			rushjob -= 1;
1670			mtx_unlock(&sync_mtx);
1671			continue;
1672		}
1673		mtx_unlock(&sync_mtx);
1674		/*
1675		 * Just sleep for a short period of time between
1676		 * iterations when shutting down to allow some I/O
1677		 * to happen.
1678		 *
1679		 * If it has taken us less than a second to process the
1680		 * current work, then wait. Otherwise start right over
1681		 * again. We can still lose time if any single round
1682		 * takes more than two seconds, but it does not really
1683		 * matter as we are just trying to generally pace the
1684		 * filesystem activity.
1685		 */
1686		if (syncer_state != SYNCER_RUNNING)
1687			tsleep(&dummychan, PPAUSE, "syncfnl",
1688			    hz / SYNCER_SHUTDOWN_SPEEDUP);
1689		else if (time_second == starttime)
1690			tsleep(&lbolt, PPAUSE, "syncer", 0);
1691	}
1692}
1693
1694/*
1695 * Request the syncer daemon to speed up its work.
1696 * We never push it to speed up more than half of its
1697 * normal turn time, otherwise it could take over the cpu.
1698 */
1699int
1700speedup_syncer()
1701{
1702	struct thread *td;
1703	int ret = 0;
1704
1705	td = FIRST_THREAD_IN_PROC(updateproc);
1706	sleepq_remove(td, &lbolt);
1707	mtx_lock(&sync_mtx);
1708	if (rushjob < syncdelay / 2) {
1709		rushjob += 1;
1710		stat_rush_requests += 1;
1711		ret = 1;
1712	}
1713	mtx_unlock(&sync_mtx);
1714	return (ret);
1715}
1716
1717/*
1718 * Tell the syncer to speed up its work and run through its work
1719 * list several times, then tell it to shut down.
1720 */
1721static void
1722syncer_shutdown(void *arg, int howto)
1723{
1724	struct thread *td;
1725
1726	td = FIRST_THREAD_IN_PROC(updateproc);
1727	sleepq_remove(td, &lbolt);
1728	mtx_lock(&sync_mtx);
1729	syncer_state = SYNCER_SHUTTING_DOWN;
1730	rushjob = 0;
1731	mtx_unlock(&sync_mtx);
1732	kproc_shutdown(arg, howto);
1733}
1734
1735/*
1736 * Associate a p-buffer with a vnode.
1737 *
1738 * Also sets B_PAGING flag to indicate that vnode is not fully associated
1739 * with the buffer.  i.e. the bp has not been linked into the vnode or
1740 * ref-counted.
1741 */
1742void
1743pbgetvp(vp, bp)
1744	register struct vnode *vp;
1745	register struct buf *bp;
1746{
1747
1748	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1749
1750	bp->b_vp = vp;
1751	bp->b_object = vp->v_object;
1752	bp->b_flags |= B_PAGING;
1753	bp->b_dev = vn_todev(vp);
1754}
1755
1756/*
1757 * Disassociate a p-buffer from a vnode.
1758 */
1759void
1760pbrelvp(bp)
1761	register struct buf *bp;
1762{
1763
1764	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1765
1766	/* XXX REMOVE ME */
1767	VI_LOCK(bp->b_vp);
1768	if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
1769		panic(
1770		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1771		    bp,
1772		    (int)bp->b_flags
1773		);
1774	}
1775	VI_UNLOCK(bp->b_vp);
1776	bp->b_vp = (struct vnode *) 0;
1777	bp->b_object = NULL;
1778	bp->b_flags &= ~B_PAGING;
1779}
1780
1781/*
1782 * Reassign a buffer from one vnode to another.
1783 * Used to assign file specific control information
1784 * (indirect blocks) to the vnode to which they belong.
1785 */
1786void
1787reassignbuf(bp, newvp)
1788	register struct buf *bp;
1789	register struct vnode *newvp;
1790{
1791	struct vnode *vp;
1792	int delay;
1793
1794	if (newvp == NULL) {
1795		printf("reassignbuf: NULL");
1796		return;
1797	}
1798	vp = bp->b_vp;
1799	++reassignbufcalls;
1800
1801	/*
1802	 * B_PAGING flagged buffers cannot be reassigned because their vp
1803	 * is not fully linked in.
1804	 */
1805	if (bp->b_flags & B_PAGING)
1806		panic("cannot reassign paging buffer");
1807
1808	/*
1809	 * Delete from old vnode list, if on one.
1810	 */
1811	VI_LOCK(vp);
1812	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
1813		buf_vlist_remove(bp);
1814		if (vp != newvp) {
1815			vdropl(bp->b_vp);
1816			bp->b_vp = NULL;	/* for clarification */
1817		}
1818	}
1819	if (vp != newvp) {
1820		VI_UNLOCK(vp);
1821		VI_LOCK(newvp);
1822	}
1823	/*
1824	 * If dirty, put on list of dirty buffers; otherwise insert onto list
1825	 * of clean buffers.
1826	 */
1827	if (bp->b_flags & B_DELWRI) {
1828		if ((newvp->v_iflag & VI_ONWORKLST) == 0) {
1829			switch (newvp->v_type) {
1830			case VDIR:
1831				delay = dirdelay;
1832				break;
1833			case VCHR:
1834				delay = metadelay;
1835				break;
1836			default:
1837				delay = filedelay;
1838			}
1839			vn_syncer_add_to_worklist(newvp, delay);
1840		}
1841		buf_vlist_add(bp, newvp, BX_VNDIRTY);
1842	} else {
1843		buf_vlist_add(bp, newvp, BX_VNCLEAN);
1844
1845		if ((newvp->v_iflag & VI_ONWORKLST) &&
1846		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1847			mtx_lock(&sync_mtx);
1848			LIST_REMOVE(newvp, v_synclist);
1849 			syncer_worklist_len--;
1850			mtx_unlock(&sync_mtx);
1851			newvp->v_iflag &= ~VI_ONWORKLST;
1852		}
1853	}
1854	if (bp->b_vp != newvp) {
1855		bp->b_vp = newvp;
1856		vholdl(bp->b_vp);
1857	}
1858	VI_UNLOCK(newvp);
1859}
1860
1861/*
1862 * Create a vnode for a device.
1863 * Used for mounting the root filesystem.
1864 */
1865int
1866bdevvp(dev, vpp)
1867	struct cdev *dev;
1868	struct vnode **vpp;
1869{
1870	register struct vnode *vp;
1871	struct vnode *nvp;
1872	int error;
1873
1874	if (dev == NULL) {
1875		*vpp = NULLVP;
1876		return (ENXIO);
1877	}
1878	if (vfinddev(dev, vpp))
1879		return (0);
1880
1881	error = getnewvnode("none", (struct mount *)0, spec_vnodeop_p, &nvp);
1882	if (error) {
1883		*vpp = NULLVP;
1884		return (error);
1885	}
1886	vp = nvp;
1887	vp->v_type = VCHR;
1888	vp->v_bsize = DEV_BSIZE;
1889	addalias(vp, dev);
1890	*vpp = vp;
1891	return (0);
1892}
1893
1894static void
1895v_incr_usecount(struct vnode *vp, int delta)
1896{
1897
1898	vp->v_usecount += delta;
1899	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
1900		mtx_lock(&spechash_mtx);
1901		vp->v_rdev->si_usecount += delta;
1902		mtx_unlock(&spechash_mtx);
1903	}
1904}
1905
1906/*
1907 * Add vnode to the alias list hung off the struct cdev *.
1908 *
1909 * The reason for this gunk is that multiple vnodes can reference
1910 * the same physical device, so checking vp->v_usecount to see
1911 * how many users there are is inadequate; the v_usecounts for
1912 * the vnodes need to be accumulated.  vcount() does that.
1913 */
1914struct vnode *
1915addaliasu(nvp, nvp_rdev)
1916	struct vnode *nvp;
1917	dev_t nvp_rdev;
1918{
1919	struct vnode *ovp;
1920	vop_t **ops;
1921	struct cdev *dev;
1922
1923	if (nvp->v_type == VBLK)
1924		return (nvp);
1925	if (nvp->v_type != VCHR)
1926		panic("addaliasu on non-special vnode");
1927	dev = findcdev(nvp_rdev);
1928	if (dev == NULL)
1929		return (nvp);
1930	/*
1931	 * Check to see if we have a bdevvp vnode with no associated
1932	 * filesystem. If so, we want to associate the filesystem of
1933	 * the newly instantiated vnode with the bdevvp vnode and
1934	 * discard the newly created vnode rather than leaving the
1935	 * bdevvp vnode lying around with no associated filesystem.
1936	 */
1937	if (vfinddev(dev, &ovp) == 0 || ovp->v_data != NULL) {
1938		addalias(nvp, dev);
1939		return (nvp);
1940	}
1941	/*
1942	 * Discard unneeded vnode, but save its node specific data.
1943	 * Note that if there is a lock, it is carried over in the
1944	 * node specific data to the replacement vnode.
1945	 */
1946	vref(ovp);
1947	ovp->v_data = nvp->v_data;
1948	ovp->v_tag = nvp->v_tag;
1949	nvp->v_data = NULL;
1950	lockdestroy(ovp->v_vnlock);
1951	lockinit(ovp->v_vnlock, PVFS, nvp->v_vnlock->lk_wmesg,
1952	    nvp->v_vnlock->lk_timo, nvp->v_vnlock->lk_flags & LK_EXTFLG_MASK);
1953	ops = ovp->v_op;
1954	ovp->v_op = nvp->v_op;
1955	if (VOP_ISLOCKED(nvp, curthread)) {
1956		VOP_UNLOCK(nvp, 0, curthread);
1957		vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curthread);
1958	}
1959	nvp->v_op = ops;
1960	delmntque(ovp);
1961	insmntque(ovp, nvp->v_mount);
1962	vrele(nvp);
1963	vgone(nvp);
1964	return (ovp);
1965}
1966
1967/* This is a local helper function that does the same as addaliasu(), but
1968 * takes a struct cdev * instead of a dev_t. */
1969static void
1970addalias(nvp, dev)
1971	struct vnode *nvp;
1972	struct cdev *dev;
1973{
1974
1975	KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
1976	dev_ref(dev);
1977	nvp->v_rdev = dev;
1978	VI_LOCK(nvp);
1979	mtx_lock(&spechash_mtx);
1980	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
1981	dev->si_usecount += nvp->v_usecount;
1982	mtx_unlock(&spechash_mtx);
1983	VI_UNLOCK(nvp);
1984}
1985
1986/*
1987 * Grab a particular vnode from the free list, increment its
1988 * reference count and lock it. The vnode lock bit is set if the
1989 * vnode is being eliminated in vgone. The process is awakened
1990 * when the transition is completed, and an error returned to
1991 * indicate that the vnode is no longer usable (possibly having
1992 * been changed to a new filesystem type).
1993 */
1994int
1995vget(vp, flags, td)
1996	register struct vnode *vp;
1997	int flags;
1998	struct thread *td;
1999{
2000	int error;
2001
2002	/*
2003	 * If the vnode is in the process of being cleaned out for
2004	 * another use, we wait for the cleaning to finish and then
2005	 * return failure. Cleaning is determined by checking that
2006	 * the VI_XLOCK flag is set.
2007	 */
2008	if ((flags & LK_INTERLOCK) == 0)
2009		VI_LOCK(vp);
2010	if (vp->v_iflag & VI_XLOCK && vp->v_vxthread != curthread) {
2011		if ((flags & LK_NOWAIT) == 0) {
2012			vp->v_iflag |= VI_XWANT;
2013			msleep(vp, VI_MTX(vp), PINOD | PDROP, "vget", 0);
2014			return (ENOENT);
2015		}
2016		VI_UNLOCK(vp);
2017		return (EBUSY);
2018	}
2019
2020	v_incr_usecount(vp, 1);
2021
2022	if (VSHOULDBUSY(vp))
2023		vbusy(vp);
2024	if (flags & LK_TYPE_MASK) {
2025		if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) {
2026			/*
2027			 * must expand vrele here because we do not want
2028			 * to call VOP_INACTIVE if the reference count
2029			 * drops back to zero since it was never really
2030			 * active. We must remove it from the free list
2031			 * before sleeping so that multiple processes do
2032			 * not try to recycle it.
2033			 */
2034			VI_LOCK(vp);
2035			v_incr_usecount(vp, -1);
2036			if (VSHOULDFREE(vp))
2037				vfree(vp);
2038			else
2039				vlruvp(vp);
2040			VI_UNLOCK(vp);
2041		}
2042		return (error);
2043	}
2044	VI_UNLOCK(vp);
2045	return (0);
2046}
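
/*
 * Typical usage sketch (illustrative): a caller that has found vp on some
 * list while holding its interlock obtains a locked, referenced vnode with
 *
 *	VI_LOCK(vp);
 *	if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread) == 0) {
 *		... use the exclusively locked vnode ...
 *		vput(vp);
 *	}
 *
 * The interlock is consumed on both success and failure when LK_INTERLOCK
 * is passed.
 */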
2047
2048/*
2049 * Increase the reference count of a vnode.
2050 */
2051void
2052vref(struct vnode *vp)
2053{
2054
2055	VI_LOCK(vp);
2056	v_incr_usecount(vp, 1);
2057	VI_UNLOCK(vp);
2058}
2059
2060/*
2061 * Return reference count of a vnode.
2062 *
2063 * The results of this call are only guaranteed when some mechanism other
2064 * than the VI lock is used to stop other processes from gaining references
2065 * to the vnode.  This may be the case if the caller holds the only reference.
2066 * This is also useful when stale data is acceptable as race conditions may
2067 * be accounted for by some other means.
2068 */
2069int
2070vrefcnt(struct vnode *vp)
2071{
2072	int usecnt;
2073
2074	VI_LOCK(vp);
2075	usecnt = vp->v_usecount;
2076	VI_UNLOCK(vp);
2077
2078	return (usecnt);
2079}
2080
2081
2082/*
2083 * Vnode put/release.
2084 * If count drops to zero, call inactive routine and return to freelist.
2085 */
2086void
2087vrele(vp)
2088	struct vnode *vp;
2089{
2090	struct thread *td = curthread;	/* XXX */
2091
2092	GIANT_REQUIRED;
2093
2094	KASSERT(vp != NULL, ("vrele: null vp"));
2095
2096	VI_LOCK(vp);
2097
2098	/* Skip this v_writecount check if we're going to panic below. */
2099	KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
2100	    ("vrele: missed vn_close"));
2101
2102	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2103	    vp->v_usecount == 1)) {
2104		v_incr_usecount(vp, -1);
2105		VI_UNLOCK(vp);
2106
2107		return;
2108	}
2109
2110	if (vp->v_usecount == 1) {
2111		v_incr_usecount(vp, -1);
2112		/*
2113		 * We must call VOP_INACTIVE with the node locked. Mark
2114		 * as VI_DOINGINACT to avoid recursion.
2115		 */
2116		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) {
2117			VI_LOCK(vp);
2118			vp->v_iflag |= VI_DOINGINACT;
2119			VI_UNLOCK(vp);
2120			VOP_INACTIVE(vp, td);
2121			VI_LOCK(vp);
2122			KASSERT(vp->v_iflag & VI_DOINGINACT,
2123			    ("vrele: lost VI_DOINGINACT"));
2124			vp->v_iflag &= ~VI_DOINGINACT;
2125		} else
2126			VI_LOCK(vp);
2127		if (VSHOULDFREE(vp))
2128			vfree(vp);
2129		else
2130			vlruvp(vp);
2131		VI_UNLOCK(vp);
2132
2133	} else {
2134#ifdef DIAGNOSTIC
2135		vprint("vrele: negative ref count", vp);
2136#endif
2137		VI_UNLOCK(vp);
2138		panic("vrele: negative ref cnt");
2139	}
2140}
2141
2142/*
2143 * Release an already locked vnode.  This gives the same effect as
2144 * unlock+vrele(), but takes less time and avoids releasing and
2145 * re-acquiring the lock (as vrele() acquires the lock internally).
2146 */
2147void
2148vput(vp)
2149	struct vnode *vp;
2150{
2151	struct thread *td = curthread;	/* XXX */
2152
2153	GIANT_REQUIRED;
2154
2155	KASSERT(vp != NULL, ("vput: null vp"));
2156	VI_LOCK(vp);
2157	/* Skip this v_writecount check if we're going to panic below. */
2158	KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
2159	    ("vput: missed vn_close"));
2160
2161	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2162	    vp->v_usecount == 1)) {
2163		v_incr_usecount(vp, -1);
2164		VOP_UNLOCK(vp, LK_INTERLOCK, td);
2165		return;
2166	}
2167
2168	if (vp->v_usecount == 1) {
2169		v_incr_usecount(vp, -1);
2170		/*
2171		 * We must call VOP_INACTIVE with the node locked, so
2172		 * we just need to release the vnode mutex. Mark as
2173		 * VI_DOINGINACT to avoid recursion.
2174		 */
2175		vp->v_iflag |= VI_DOINGINACT;
2176		VI_UNLOCK(vp);
2177		VOP_INACTIVE(vp, td);
2178		VI_LOCK(vp);
2179		KASSERT(vp->v_iflag & VI_DOINGINACT,
2180		    ("vput: lost VI_DOINGINACT"));
2181		vp->v_iflag &= ~VI_DOINGINACT;
2182		if (VSHOULDFREE(vp))
2183			vfree(vp);
2184		else
2185			vlruvp(vp);
2186		VI_UNLOCK(vp);
2187
2188	} else {
2189#ifdef DIAGNOSTIC
2190		vprint("vput: negative ref count", vp);
2191#endif
2192		panic("vput: negative ref cnt");
2193	}
2194}
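
/*
 * Illustrative sketch: the common pattern is to pair vn_lock() with vput()
 * rather than with an explicit unlock followed by vrele():
 *
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 *	... operate on the locked vnode ...
 *	vput(vp);		instead of VOP_UNLOCK(vp, 0, td); vrele(vp);
 */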
2195
2196/*
2197 * Somebody doesn't want the vnode recycled.
2198 */
2199void
2200vhold(struct vnode *vp)
2201{
2202
2203	VI_LOCK(vp);
2204	vholdl(vp);
2205	VI_UNLOCK(vp);
2206}
2207
2208void
2209vholdl(vp)
2210	register struct vnode *vp;
2211{
2212
2213	vp->v_holdcnt++;
2214	if (VSHOULDBUSY(vp))
2215		vbusy(vp);
2216}
2217
2218/*
2219 * Note that there is one less who cares about this vnode.  vdrop() is the
2220 * opposite of vhold().
2221 */
2222void
2223vdrop(struct vnode *vp)
2224{
2225
2226	VI_LOCK(vp);
2227	vdropl(vp);
2228	VI_UNLOCK(vp);
2229}
2230
2231void
2232vdropl(vp)
2233	register struct vnode *vp;
2234{
2235
2236	if (vp->v_holdcnt <= 0)
2237		panic("vdrop: holdcnt");
2238	vp->v_holdcnt--;
2239	if (VSHOULDFREE(vp))
2240		vfree(vp);
2241	else
2242		vlruvp(vp);
2243}
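
/*
 * Illustrative sketch: vhold()/vdrop() bracket regions where a vnode must
 * not be recycled even though the caller holds no use reference, e.g.
 *
 *	vhold(vp);
 *	... sleep, or drop and reacquire other locks ...
 *	vdrop(vp);
 */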
2244
2245/*
2246 * Remove any vnodes in the vnode table belonging to mount point mp.
2247 *
2248 * If FORCECLOSE is not specified, there should not be any active ones,
2249 * return error if any are found (nb: this is a user error, not a
2250 * system error). If FORCECLOSE is specified, detach any active vnodes
2251 * that are found.
2252 *
2253 * If WRITECLOSE is set, only flush out regular file vnodes open for
2254 * writing.
2255 *
2256 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2257 *
2258 * `rootrefs' specifies the base reference count for the root vnode
2259 * of this filesystem. The root vnode is considered busy if its
2260 * v_usecount exceeds this value. On a successful return, vflush()
2261 * will call vrele() on the root vnode exactly rootrefs times.
2262 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2263 * be zero.
2264 */
2265#ifdef DIAGNOSTIC
2266static int busyprt = 0;		/* print out busy vnodes */
2267SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
2268#endif
2269
2270int
2271vflush(mp, rootrefs, flags)
2272	struct mount *mp;
2273	int rootrefs;
2274	int flags;
2275{
2276	struct thread *td = curthread;	/* XXX */
2277	struct vnode *vp, *nvp, *rootvp = NULL;
2278	struct vattr vattr;
2279	int busy = 0, error;
2280
2281	if (rootrefs > 0) {
2282		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2283		    ("vflush: bad args"));
2284		/*
2285		 * Get the filesystem root vnode. We can vput() it
2286		 * immediately, since with rootrefs > 0, it won't go away.
2287		 */
2288		if ((error = VFS_ROOT(mp, &rootvp)) != 0)
2289			return (error);
2290		vput(rootvp);
2291
2292	}
2293	MNT_ILOCK(mp);
2294loop:
2295	MNT_VNODE_FOREACH(vp, mp, nvp) {
2296
2297		VI_LOCK(vp);
2298		MNT_IUNLOCK(mp);
2299		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td);
2300		if (error) {
2301			MNT_ILOCK(mp);
2302			goto loop;
2303		}
2304		/*
2305		 * Skip over vnodes marked VV_SYSTEM.
2306		 */
2307		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2308			VOP_UNLOCK(vp, 0, td);
2309			MNT_ILOCK(mp);
2310			continue;
2311		}
2312		/*
2313		 * If WRITECLOSE is set, flush out unlinked but still open
2314		 * files (even if open only for reading) and regular file
2315		 * vnodes open for writing.
2316		 */
2317		if (flags & WRITECLOSE) {
2318			error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
2319			VI_LOCK(vp);
2320
2321			if ((vp->v_type == VNON ||
2322			    (error == 0 && vattr.va_nlink > 0)) &&
2323			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
2324				VOP_UNLOCK(vp, LK_INTERLOCK, td);
2325				MNT_ILOCK(mp);
2326				continue;
2327			}
2328		} else
2329			VI_LOCK(vp);
2330
2331		VOP_UNLOCK(vp, 0, td);
2332
2333		/*
2334		 * With v_usecount == 0, all we need to do is clear out the
2335		 * vnode data structures and we are done.
2336		 */
2337		if (vp->v_usecount == 0) {
2338			vgonel(vp, td);
2339			MNT_ILOCK(mp);
2340			continue;
2341		}
2342
2343		/*
2344		 * If FORCECLOSE is set, forcibly close the vnode. For block
2345		 * or character devices, revert to an anonymous device. For
2346		 * all other files, just kill them.
2347		 */
2348		if (flags & FORCECLOSE) {
2349			if (vp->v_type != VCHR)
2350				vgonel(vp, td);
2351			else
2352				vgonechrl(vp, td);
2353			MNT_ILOCK(mp);
2354			continue;
2355		}
2356#ifdef DIAGNOSTIC
2357		if (busyprt)
2358			vprint("vflush: busy vnode", vp);
2359#endif
2360		VI_UNLOCK(vp);
2361		MNT_ILOCK(mp);
2362		busy++;
2363	}
2364	MNT_IUNLOCK(mp);
2365	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2366		/*
2367		 * If just the root vnode is busy, and if its refcount
2368		 * is equal to `rootrefs', then go ahead and kill it.
2369		 */
2370		VI_LOCK(rootvp);
2371		KASSERT(busy > 0, ("vflush: not busy"));
2372		KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
2373		if (busy == 1 && rootvp->v_usecount == rootrefs) {
2374			vgonel(rootvp, td);
2375			busy = 0;
2376		} else
2377			VI_UNLOCK(rootvp);
2378	}
2379	if (busy)
2380		return (EBUSY);
2381	for (; rootrefs > 0; rootrefs--)
2382		vrele(rootvp);
2383	return (0);
2384}
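
/*
 * Usage sketch (hypothetical example, not from this file): a filesystem
 * that keeps one reference on its root vnode might flush its vnodes at
 * unmount time with
 *
 *	error = vflush(mp, 1, (mntflags & MNT_FORCE) ? FORCECLOSE : 0);
 *
 * letting vflush() drop that single root reference on success.
 */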
2385
2386/*
2387 * This moves a now (likely recyclable) vnode to the end of the
2388 * mountlist.  XXX However, it is temporarily disabled until we
2389 * can clean up ffs_sync() and friends, which have loop restart
2390 * conditions which this code causes to operate O(N^2).
2391 */
2392static void
2393vlruvp(struct vnode *vp)
2394{
2395#if 0
2396	struct mount *mp;
2397
2398	if ((mp = vp->v_mount) != NULL) {
2399		MNT_ILOCK(mp);
2400		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
2401		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
2402		MNT_IUNLOCK(mp);
2403	}
2404#endif
2405}
2406
2407static void
2408vx_lock(struct vnode *vp)
2409{
2410
2411	ASSERT_VI_LOCKED(vp, "vx_lock");
2412
2413	/*
2414	 * Prevent the vnode from being recycled or brought into use while we
2415	 * clean it out.
2416	 */
2417	if (vp->v_iflag & VI_XLOCK)
2418		panic("vclean: deadlock");
2419	vp->v_iflag |= VI_XLOCK;
2420	vp->v_vxthread = curthread;
2421}
2422
2423static void
2424vx_unlock(struct vnode *vp)
2425{
2426	ASSERT_VI_LOCKED(vp, "vx_unlock");
2427	vp->v_iflag &= ~VI_XLOCK;
2428	vp->v_vxthread = NULL;
2429	if (vp->v_iflag & VI_XWANT) {
2430		vp->v_iflag &= ~VI_XWANT;
2431		wakeup(vp);
2432	}
2433}
2434
2435/*
2436 * Disassociate the underlying filesystem from a vnode.
2437 */
2438static void
2439vclean(vp, flags, td)
2440	struct vnode *vp;
2441	int flags;
2442	struct thread *td;
2443{
2444	int active;
2445
2446	ASSERT_VI_LOCKED(vp, "vclean");
2447	/*
2448	 * Check to see if the vnode is in use. If so we have to reference it
2449	 * before we clean it out so that its count cannot fall to zero and
2450	 * generate a race against ourselves to recycle it.
2451	 */
2452	if ((active = vp->v_usecount))
2453		v_incr_usecount(vp, 1);
2454
2455	/*
2456	 * Even if the count is zero, the VOP_INACTIVE routine may still
2457	 * have the object locked while it cleans it out. The VOP_LOCK
2458	 * ensures that the VOP_INACTIVE routine is done with its work.
2459	 * For active vnodes, it ensures that no other activity can
2460	 * occur while the underlying object is being cleaned out.
2461	 */
2462	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
2463
2464	/*
2465	 * Clean out any buffers associated with the vnode.
2466	 * If the flush fails, just toss the buffers.
2467	 */
2468	if (flags & DOCLOSE) {
2469		struct buf *bp;
2470		bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
2471		if (bp != NULL)
2472			(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
2473		if (vinvalbuf(vp, V_SAVE, NOCRED, td, 0, 0) != 0)
2474			vinvalbuf(vp, 0, NOCRED, td, 0, 0);
2475	}
2476
2477	VOP_DESTROYVOBJECT(vp);
2478
2479	/*
2480	 * Any other processes trying to obtain this lock must first
2481	 * wait for VI_XLOCK to clear, then call the new lock operation.
2482	 */
2483	VOP_UNLOCK(vp, 0, td);
2484
2485	/*
2486	 * If purging an active vnode, it must be closed and
2487	 * deactivated before being reclaimed. Note that the
2488	 * VOP_INACTIVE will unlock the vnode.
2489	 */
2490	if (active) {
2491		if (flags & DOCLOSE)
2492			VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2493		VI_LOCK(vp);
2494		if ((vp->v_iflag & VI_DOINGINACT) == 0) {
2495			vp->v_iflag |= VI_DOINGINACT;
2496			VI_UNLOCK(vp);
2497			if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
2498				panic("vclean: cannot relock.");
2499			VOP_INACTIVE(vp, td);
2500			VI_LOCK(vp);
2501			KASSERT(vp->v_iflag & VI_DOINGINACT,
2502			    ("vclean: lost VI_DOINGINACT"));
2503			vp->v_iflag &= ~VI_DOINGINACT;
2504		}
2505		VI_UNLOCK(vp);
2506	}
2507	/*
2508	 * Reclaim the vnode.
2509	 */
2510	if (VOP_RECLAIM(vp, td))
2511		panic("vclean: cannot reclaim");
2512
2513	if (active) {
2514		/*
2515		 * Inline copy of vrele() since VOP_INACTIVE
2516		 * has already been called.
2517		 */
2518		VI_LOCK(vp);
2519		v_incr_usecount(vp, -1);
2520		if (vp->v_usecount <= 0) {
2521#ifdef INVARIANTS
2522			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
2523				vprint("vclean: bad ref count", vp);
2524				panic("vclean: ref cnt");
2525			}
2526#endif
2527			if (VSHOULDFREE(vp))
2528				vfree(vp);
2529		}
2530		VI_UNLOCK(vp);
2531	}
2532	/*
2533	 * Delete from old mount point vnode list.
2534	 */
2535	delmntque(vp);
2536	cache_purge(vp);
2537	VI_LOCK(vp);
2538	if (VSHOULDFREE(vp))
2539		vfree(vp);
2540
2541	/*
2542	 * Done with purge, reset to the standard lock and
2543	 * notify sleepers of the grim news.
2544	 */
2545	vp->v_vnlock = &vp->v_lock;
2546	vp->v_op = dead_vnodeop_p;
2547	if (vp->v_pollinfo != NULL)
2548		vn_pollgone(vp);
2549	vp->v_tag = "none";
2550}
2551
2552/*
2553 * Eliminate all activity associated with the requested vnode
2554 * and with all vnodes aliased to the requested vnode.
2555 */
2556int
2557vop_revoke(ap)
2558	struct vop_revoke_args /* {
2559		struct vnode *a_vp;
2560		int a_flags;
2561	} */ *ap;
2562{
2563	struct vnode *vp, *vq;
2564	struct cdev *dev;
2565
2566	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
2567	vp = ap->a_vp;
2568	KASSERT((vp->v_type == VCHR), ("vop_revoke: not VCHR"));
2569
2570	VI_LOCK(vp);
2571	/*
2572	 * If a vgone (or vclean) is already in progress,
2573	 * wait until it is done and return.
2574	 */
2575	if (vp->v_iflag & VI_XLOCK) {
2576		vp->v_iflag |= VI_XWANT;
2577		msleep(vp, VI_MTX(vp), PINOD | PDROP,
2578		    "vop_revokeall", 0);
2579		return (0);
2580	}
2581	VI_UNLOCK(vp);
2582	dev = vp->v_rdev;
2583	for (;;) {
2584		mtx_lock(&spechash_mtx);
2585		vq = SLIST_FIRST(&dev->si_hlist);
2586		mtx_unlock(&spechash_mtx);
2587		if (vq == NULL)
2588			break;
2589		vgone(vq);
2590	}
2591	return (0);
2592}
2593
2594/*
2595 * Recycle an unused vnode to the front of the free list.
2596 * Release the passed interlock if the vnode will be recycled.
2597 */
2598int
2599vrecycle(vp, inter_lkp, td)
2600	struct vnode *vp;
2601	struct mtx *inter_lkp;
2602	struct thread *td;
2603{
2604
2605	VI_LOCK(vp);
2606	if (vp->v_usecount == 0) {
2607		if (inter_lkp) {
2608			mtx_unlock(inter_lkp);
2609		}
2610		vgonel(vp, td);
2611		return (1);
2612	}
2613	VI_UNLOCK(vp);
2614	return (0);
2615}
2616
2617/*
2618 * Eliminate all activity associated with a vnode
2619 * in preparation for reuse.
2620 */
2621void
2622vgone(vp)
2623	register struct vnode *vp;
2624{
2625	struct thread *td = curthread;	/* XXX */
2626
2627	VI_LOCK(vp);
2628	vgonel(vp, td);
2629}
2630
2631/*
2632 * Disassociate a character device from its underlying filesystem and
2633 * attach it to spec.  This is for use when the character device is still
2634 * active and the filesystem is going away.
2635 */
2636static void
2637vgonechrl(struct vnode *vp, struct thread *td)
2638{
2639	ASSERT_VI_LOCKED(vp, "vgonechrl");
2640	vx_lock(vp);
2641	/*
2642	 * This is a custom version of vclean() which does not tear down
2643	 * the bufs or vm objects held by this vnode.  This allows filesystems
2644	 * to continue using devices which were discovered via another
2645	 * filesystem that has been unmounted.
2646	 */
2647	if (vp->v_usecount != 0) {
2648		v_incr_usecount(vp, 1);
2649		/*
2650		 * Ensure that no other activity can occur while the
2651		 * underlying object is being cleaned out.
2652		 */
2653		VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
2654		/*
2655		 * Any other processes trying to obtain this lock must first
2656		 * wait for VXLOCK to clear, then call the new lock operation.
2657		 * wait for VI_XLOCK to clear, then call the new lock operation.
2658		VOP_UNLOCK(vp, 0, td);
2659		vp->v_vnlock = &vp->v_lock;
2660		vp->v_tag = "orphanchr";
2661		vp->v_op = spec_vnodeop_p;
2662		delmntque(vp);
2663		cache_purge(vp);
2664		vrele(vp);
2665		VI_LOCK(vp);
2666	} else
2667		vclean(vp, 0, td);
2668	vp->v_op = spec_vnodeop_p;
2669	vx_unlock(vp);
2670	VI_UNLOCK(vp);
2671}
2672
2673/*
2674 * vgone, with the vp interlock held.
2675 */
2676void
2677vgonel(vp, td)
2678	struct vnode *vp;
2679	struct thread *td;
2680{
2681	/*
2682	 * If a vgone (or vclean) is already in progress,
2683	 * wait until it is done and return.
2684	 */
2685	ASSERT_VI_LOCKED(vp, "vgonel");
2686	if (vp->v_iflag & VI_XLOCK) {
2687		vp->v_iflag |= VI_XWANT;
2688		msleep(vp, VI_MTX(vp), PINOD | PDROP, "vgone", 0);
2689		return;
2690	}
2691	vx_lock(vp);
2692
2693	/*
2694	 * Clean out the filesystem specific data.
2695	 */
2696	vclean(vp, DOCLOSE, td);
2697	VI_UNLOCK(vp);
2698
2699	/*
2700	 * If special device, remove it from special device alias list
2701	 * if it is on one.
2702	 */
2703	VI_LOCK(vp);
2704	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2705		mtx_lock(&spechash_mtx);
2706		SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
2707		vp->v_rdev->si_usecount -= vp->v_usecount;
2708		mtx_unlock(&spechash_mtx);
2709		dev_rel(vp->v_rdev);
2710		vp->v_rdev = NULL;
2711	}
2712
2713	/*
2714	 * If it is on the freelist and not already at the head,
2715	 * move it to the head of the list. The test of the
2716	 * VI_DOOMED flag and the reference count of zero is because
2717	 * it will be removed from the free list by getnewvnode,
2718	 * but will not have its reference count incremented until
2719	 * after calling vgone. If the reference count were
2720	 * incremented first, vgone would (incorrectly) try to
2721	 * close the previous instance of the underlying object.
2722	 */
2723	if (vp->v_usecount == 0 && !(vp->v_iflag & VI_DOOMED)) {
2724		mtx_lock(&vnode_free_list_mtx);
2725		if (vp->v_iflag & VI_FREE) {
2726			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2727		} else {
2728			vp->v_iflag |= VI_FREE;
2729			freevnodes++;
2730		}
2731		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2732		mtx_unlock(&vnode_free_list_mtx);
2733	}
2734
2735	vp->v_type = VBAD;
2736	vx_unlock(vp);
2737	VI_UNLOCK(vp);
2738}
2739
2740/*
2741 * Lookup a vnode by device number.
2742 */
2743int
2744vfinddev(dev, vpp)
2745	struct cdev *dev;
2746	struct vnode **vpp;
2747{
2748	struct vnode *vp;
2749
2750	mtx_lock(&spechash_mtx);
2751	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
2752		*vpp = vp;
2753		mtx_unlock(&spechash_mtx);
2754		return (1);
2755	}
2756	mtx_unlock(&spechash_mtx);
2757	return (0);
2758}
2759
2760/*
2761 * Calculate the total number of references to a special device.
2762 */
2763int
2764vcount(vp)
2765	struct vnode *vp;
2766{
2767	int count;
2768
2769	mtx_lock(&spechash_mtx);
2770	count = vp->v_rdev->si_usecount;
2771	mtx_unlock(&spechash_mtx);
2772	return (count);
2773}
2774
2775/*
2776 * Same as above, but takes a struct cdev * as the argument.
2777 */
2778int
2779count_dev(dev)
2780	struct cdev *dev;
2781{
2782	int count;
2783
2784	mtx_lock(&spechash_mtx);
2785	count = dev->si_usecount;
2786	mtx_unlock(&spechash_mtx);
2787	return(count);
2788}
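
/*
 * Illustrative sketch: mount code that must ensure a device is not already
 * open elsewhere checks the accumulated count rather than v_usecount:
 *
 *	if (vcount(devvp) > 1)
 *		return (EBUSY);
 */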
2789
2790/*
2791 * Print out a description of a vnode.
2792 */
2793static char *typename[] =
2794{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
2795
2796void
2797vprint(label, vp)
2798	char *label;
2799	struct vnode *vp;
2800{
2801	char buf[96];
2802
2803	if (label != NULL)
2804		printf("%s: %p: ", label, (void *)vp);
2805	else
2806		printf("%p: ", (void *)vp);
2807	printf("tag %s, type %s, usecount %d, writecount %d, refcount %d,",
2808	    vp->v_tag, typename[vp->v_type], vp->v_usecount,
2809	    vp->v_writecount, vp->v_holdcnt);
2810	buf[0] = '\0';
2811	if (vp->v_vflag & VV_ROOT)
2812		strcat(buf, "|VV_ROOT");
2813	if (vp->v_vflag & VV_TEXT)
2814		strcat(buf, "|VV_TEXT");
2815	if (vp->v_vflag & VV_SYSTEM)
2816		strcat(buf, "|VV_SYSTEM");
2817	if (vp->v_iflag & VI_XLOCK)
2818		strcat(buf, "|VI_XLOCK");
2819	if (vp->v_iflag & VI_XWANT)
2820		strcat(buf, "|VI_XWANT");
2821	if (vp->v_iflag & VI_BWAIT)
2822		strcat(buf, "|VI_BWAIT");
2823	if (vp->v_iflag & VI_DOOMED)
2824		strcat(buf, "|VI_DOOMED");
2825	if (vp->v_iflag & VI_FREE)
2826		strcat(buf, "|VI_FREE");
2827	if (vp->v_vflag & VV_OBJBUF)
2828		strcat(buf, "|VV_OBJBUF");
2829	if (buf[0] != '\0')
2830		printf(" flags (%s),", &buf[1]);
2831	lockmgr_printinfo(vp->v_vnlock);
2832	printf("\n");
2833	if (vp->v_data != NULL)
2834		VOP_PRINT(vp);
2835}
2836
2837#ifdef DDB
2838#include <ddb/ddb.h>
2839/*
2840 * List all of the locked vnodes in the system.
2841 * Called when debugging the kernel.
2842 */
2843DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
2844{
2845	struct mount *mp, *nmp;
2846	struct vnode *vp;
2847
2848	/*
2849	 * Note: because this is DDB, we can't obey the locking semantics
2850	 * for these structures, which means we could catch an inconsistent
2851	 * state and dereference a nasty pointer.  Not much to be done
2852	 * about that.
2853	 */
2854	printf("Locked vnodes\n");
2855	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2856		nmp = TAILQ_NEXT(mp, mnt_list);
2857		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2858			if (VOP_ISLOCKED(vp, NULL))
2859				vprint(NULL, vp);
2860		}
2861		nmp = TAILQ_NEXT(mp, mnt_list);
2862	}
2863}
2864#endif
2865
2866/*
2867 * Fill in a struct xvfsconf based on a struct vfsconf.
2868 */
2869static void
2870vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
2871{
2872
2873	strcpy(xvfsp->vfc_name, vfsp->vfc_name);
2874	xvfsp->vfc_typenum = vfsp->vfc_typenum;
2875	xvfsp->vfc_refcount = vfsp->vfc_refcount;
2876	xvfsp->vfc_flags = vfsp->vfc_flags;
2877	/*
2878	 * These are unused in userland, we keep them
2879	 * to not break binary compatibility.
2880	 */
2881	xvfsp->vfc_vfsops = NULL;
2882	xvfsp->vfc_next = NULL;
2883}
2884
2885/*
2886 * Top level filesystem related information gathering.
2887 */
2888static int
2889sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
2890{
2891	struct vfsconf *vfsp;
2892	struct xvfsconf *xvfsp;
2893	int cnt, error, i;
2894
2895	cnt = 0;
2896	for (vfsp = vfsconf; vfsp != NULL; vfsp = vfsp->vfc_next)
2897		cnt++;
2898	xvfsp = malloc(sizeof(struct xvfsconf) * cnt, M_TEMP, M_WAITOK);
2899	/*
2900	 * Handle the race that will exist here once struct vfsconf is
2901	 * locked down: use both cnt and a check of vfc_next against
2902	 * NULL to determine the end of the loop.  The race arises
2903	 * because we will have to unlock before calling malloc().
2904	 * We are protected by Giant for now.
2905	 */
2906	i = 0;
2907	for (vfsp = vfsconf; vfsp != NULL && i < cnt; vfsp = vfsp->vfc_next) {
2908		vfsconf2x(vfsp, xvfsp + i);
2909		i++;
2910	}
2911	error = SYSCTL_OUT(req, xvfsp, sizeof(struct xvfsconf) * i);
2912	free(xvfsp, M_TEMP);
2913	return (error);
2914}
2915
2916SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist,
2917    "S,xvfsconf", "List of all configured filesystems");
2918
2919#ifndef BURN_BRIDGES
2920static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
2921
2922static int
2923vfs_sysctl(SYSCTL_HANDLER_ARGS)
2924{
2925	int *name = (int *)arg1 - 1;	/* XXX */
2926	u_int namelen = arg2 + 1;	/* XXX */
2927	struct vfsconf *vfsp;
2928	struct xvfsconf xvfsp;
2929
2930	printf("WARNING: userland calling deprecated sysctl, "
2931	    "please rebuild world\n");
2932
2933#if 1 || defined(COMPAT_PRELITE2)
2934	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2935	if (namelen == 1)
2936		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2937#endif
2938
2939	switch (name[1]) {
2940	case VFS_MAXTYPENUM:
2941		if (namelen != 2)
2942			return (ENOTDIR);
2943		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2944	case VFS_CONF:
2945		if (namelen != 3)
2946			return (ENOTDIR);	/* overloaded */
2947		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2948			if (vfsp->vfc_typenum == name[2])
2949				break;
2950		if (vfsp == NULL)
2951			return (EOPNOTSUPP);
2952		vfsconf2x(vfsp, &xvfsp);
2953		return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
2954	}
2955	return (EOPNOTSUPP);
2956}
2957
2958SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, vfs_sysctl,
2959	"Generic filesystem");
2960
2961#if 1 || defined(COMPAT_PRELITE2)
2962
2963static int
2964sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
2965{
2966	int error;
2967	struct vfsconf *vfsp;
2968	struct ovfsconf ovfs;
2969
2970	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
2971		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
2972		strcpy(ovfs.vfc_name, vfsp->vfc_name);
2973		ovfs.vfc_index = vfsp->vfc_typenum;
2974		ovfs.vfc_refcount = vfsp->vfc_refcount;
2975		ovfs.vfc_flags = vfsp->vfc_flags;
2976		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2977		if (error)
2978			return error;
2979	}
2980	return 0;
2981}
2982
2983#endif /* 1 || COMPAT_PRELITE2 */
2984#endif /* !BURN_BRIDGES */
2985
2986#define KINFO_VNODESLOP		10
2987#ifdef notyet
2988/*
2989 * Dump vnode list (via sysctl).
2990 */
2991/* ARGSUSED */
2992static int
2993sysctl_vnode(SYSCTL_HANDLER_ARGS)
2994{
2995	struct xvnode *xvn;
2996	struct thread *td = req->td;
2997	struct mount *mp;
2998	struct vnode *vp;
2999	int error, len, n;
3000
3001	/*
3002	 * Stale numvnodes access is not fatal here.
3003	 */
3004	req->lock = 0;
3005	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
3006	if (!req->oldptr)
3007		/* Make an estimate */
3008		return (SYSCTL_OUT(req, 0, len));
3009
3010	error = sysctl_wire_old_buffer(req, 0);
3011	if (error != 0)
3012		return (error);
3013	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
3014	n = 0;
3015	mtx_lock(&mountlist_mtx);
3016	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3017		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
3018			continue;
3019		MNT_ILOCK(mp);
3020		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3021			if (n == len)
3022				break;
3023			vref(vp);
3024			xvn[n].xv_size = sizeof *xvn;
3025			xvn[n].xv_vnode = vp;
3026#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
3027			XV_COPY(usecount);
3028			XV_COPY(writecount);
3029			XV_COPY(holdcnt);
3030			XV_COPY(id);
3031			XV_COPY(mount);
3032			XV_COPY(numoutput);
3033			XV_COPY(type);
3034#undef XV_COPY
3035			xvn[n].xv_flag = vp->v_vflag;
3036
3037			switch (vp->v_type) {
3038			case VREG:
3039			case VDIR:
3040			case VLNK:
3041				xvn[n].xv_dev = vp->v_cachedfs;
3042				xvn[n].xv_ino = vp->v_cachedid;
3043				break;
3044			case VBLK:
3045			case VCHR:
3046				if (vp->v_rdev == NULL) {
3047					vrele(vp);
3048					continue;
3049				}
3050				xvn[n].xv_dev = dev2udev(vp->v_rdev);
3051				break;
3052			case VSOCK:
3053				xvn[n].xv_socket = vp->v_socket;
3054				break;
3055			case VFIFO:
3056				xvn[n].xv_fifo = vp->v_fifoinfo;
3057				break;
3058			case VNON:
3059			case VBAD:
3060			default:
3061				/* shouldn't happen? */
3062				vrele(vp);
3063				continue;
3064			}
3065			vrele(vp);
3066			++n;
3067		}
3068		MNT_IUNLOCK(mp);
3069		mtx_lock(&mountlist_mtx);
3070		vfs_unbusy(mp, td);
3071		if (n == len)
3072			break;
3073	}
3074	mtx_unlock(&mountlist_mtx);
3075
3076	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
3077	free(xvn, M_TEMP);
3078	return (error);
3079}
3080
3081SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
3082	0, 0, sysctl_vnode, "S,xvnode", "");
3083#endif
3084
3085/*
3086 * Check to see if a filesystem is mounted on a block device.
3087 */
3088int
3089vfs_mountedon(vp)
3090	struct vnode *vp;
3091{
3092
3093	if (vp->v_rdev->si_mountpoint != NULL)
3094		return (EBUSY);
3095	return (0);
3096}
3097
3098/*
3099 * Unmount all filesystems. The list is traversed in reverse order
3100 * of mounting to avoid dependencies.
3101 */
3102void
3103vfs_unmountall()
3104{
3105	struct mount *mp;
3106	struct thread *td;
3107	int error;
3108
3109	if (curthread != NULL)
3110		td = curthread;
3111	else
3112		td = FIRST_THREAD_IN_PROC(initproc); /* XXX XXX proc0? */
3113	/*
3114	 * Since this only runs when rebooting, it is not interlocked.
3115	 */
3116	while(!TAILQ_EMPTY(&mountlist)) {
3117		mp = TAILQ_LAST(&mountlist, mntlist);
3118		error = dounmount(mp, MNT_FORCE, td);
3119		if (error) {
3120			TAILQ_REMOVE(&mountlist, mp, mnt_list);
3121			printf("unmount of %s failed (",
3122			    mp->mnt_stat.f_mntonname);
3123			if (error == EBUSY)
3124				printf("BUSY)\n");
3125			else
3126				printf("%d)\n", error);
3127		} else {
3128			/* The unmount has removed mp from the mountlist */
3129		}
3130	}
3131}
3132
3133/*
3134 * Perform msync on all vnodes under a mount point.
3135 * The mount point must be locked.
3136 */
3137void
3138vfs_msync(struct mount *mp, int flags)
3139{
3140	struct vnode *vp, *nvp;
3141	struct vm_object *obj;
3142	int tries;
3143
3144	GIANT_REQUIRED;
3145
3146	tries = 5;
3147	MNT_ILOCK(mp);
3148loop:
3149	TAILQ_FOREACH_SAFE(vp, &mp->mnt_nvnodelist, v_nmntvnodes, nvp) {
3150		if (vp->v_mount != mp) {
3151			if (--tries > 0)
3152				goto loop;
3153			break;
3154		}
3155
3156		VI_LOCK(vp);
3157		if (vp->v_iflag & VI_XLOCK) {
3158			VI_UNLOCK(vp);
3159			continue;
3160		}
3161
3162		if ((vp->v_iflag & VI_OBJDIRTY) &&
3163		    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
3164			MNT_IUNLOCK(mp);
3165			if (!vget(vp,
3166			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3167			    curthread)) {
3168				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
3169					vput(vp);
3170					MNT_ILOCK(mp);
3171					continue;
3172				}
3173
3174				if (VOP_GETVOBJECT(vp, &obj) == 0) {
3175					VM_OBJECT_LOCK(obj);
3176					vm_object_page_clean(obj, 0, 0,
3177					    flags == MNT_WAIT ?
3178					    OBJPC_SYNC : OBJPC_NOSYNC);
3179					VM_OBJECT_UNLOCK(obj);
3180				}
3181				vput(vp);
3182			}
3183			MNT_ILOCK(mp);
3184			if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
3185				if (--tries > 0)
3186					goto loop;
3187				break;
3188			}
3189		} else
3190			VI_UNLOCK(vp);
3191	}
3192	MNT_IUNLOCK(mp);
3193}
3194
3195/*
3196 * Create the VM object needed for VMIO and mmap support.  This
3197 * is done for all VREG files in the system.  Some filesystems may also
3198 * take advantage of the additional metadata buffering capability of the
3199 * VMIO code by making the device node VMIO mode as well.
3200 *
3201 * vp must be locked when vfs_object_create is called.
3202 */
3203int
3204vfs_object_create(vp, td, cred)
3205	struct vnode *vp;
3206	struct thread *td;
3207	struct ucred *cred;
3208{
3209
3210	GIANT_REQUIRED;
3211	return (VOP_CREATEVOBJECT(vp, cred, td));
3212}
3213
3214/*
3215 * Mark a vnode as free, putting it up for recycling.
3216 */
3217void
3218vfree(vp)
3219	struct vnode *vp;
3220{
3221
3222	ASSERT_VI_LOCKED(vp, "vfree");
3223	mtx_lock(&vnode_free_list_mtx);
3224	KASSERT((vp->v_iflag & VI_FREE) == 0, ("vnode already free"));
3225	if (vp->v_iflag & VI_AGE) {
3226		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
3227	} else {
3228		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
3229	}
3230	freevnodes++;
3231	mtx_unlock(&vnode_free_list_mtx);
3232	vp->v_iflag &= ~VI_AGE;
3233	vp->v_iflag |= VI_FREE;
3234}
3235
3236/*
3237 * Opposite of vfree() - mark a vnode as in use.
3238 */
3239void
3240vbusy(vp)
3241	struct vnode *vp;
3242{
3243
3244	ASSERT_VI_LOCKED(vp, "vbusy");
3245	KASSERT((vp->v_iflag & VI_FREE) != 0, ("vnode not free"));
3246
3247	mtx_lock(&vnode_free_list_mtx);
3248	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
3249	freevnodes--;
3250	mtx_unlock(&vnode_free_list_mtx);
3251
3252	vp->v_iflag &= ~(VI_FREE|VI_AGE);
3253}
3254
3255/*
3256 * Initialize per-vnode helper structure to hold poll-related state.
3257 */
3258void
3259v_addpollinfo(struct vnode *vp)
3260{
3261
3262	vp->v_pollinfo = uma_zalloc(vnodepoll_zone, M_WAITOK);
3263	mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
3264}
3265
3266/*
3267 * Record a process's interest in events which might happen to
3268 * a vnode.  Because poll uses the historic select-style interface
3269 * internally, this routine serves as both the ``check for any
3270 * pending events'' and the ``record my interest in future events''
3271 * functions.  (These are done together, while the lock is held,
3272 * to avoid race conditions.)
3273 */
3274int
3275vn_pollrecord(vp, td, events)
3276	struct vnode *vp;
3277	struct thread *td;
3278	short events;
3279{
3280
3281	if (vp->v_pollinfo == NULL)
3282		v_addpollinfo(vp);
3283	mtx_lock(&vp->v_pollinfo->vpi_lock);
3284	if (vp->v_pollinfo->vpi_revents & events) {
3285		/*
3286		 * This leaves events we are not interested
3287		 * in available for the other process which
3288		 * presumably had requested them
3289		 * (otherwise they would never have been
3290		 * recorded).
3291		 */
3292		events &= vp->v_pollinfo->vpi_revents;
3293		vp->v_pollinfo->vpi_revents &= ~events;
3294
3295		mtx_unlock(&vp->v_pollinfo->vpi_lock);
3296		return events;
3297	}
3298	vp->v_pollinfo->vpi_events |= events;
3299	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3300	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3301	return 0;
3302}
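
/*
 * Sketch of a typical caller (illustrative; xxx_poll is a hypothetical
 * name): a filesystem VOP_POLL implementation with no event sources of
 * its own can simply defer to the generic mechanism:
 *
 *	static int
 *	xxx_poll(struct vop_poll_args *ap)
 *	{
 *		return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
 *	}
 */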
3303
3304/*
3305 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
3306 * it is possible for us to miss an event due to race conditions, but
3307 * that condition is expected to be rare, so for the moment it is the
3308 * preferred interface.
3309 */
3310void
3311vn_pollevent(vp, events)
3312	struct vnode *vp;
3313	short events;
3314{
3315
3316	if (vp->v_pollinfo == NULL)
3317		v_addpollinfo(vp);
3318	mtx_lock(&vp->v_pollinfo->vpi_lock);
3319	if (vp->v_pollinfo->vpi_events & events) {
3320		/*
3321		 * We clear vpi_events so that we don't
3322		 * call selwakeup() twice if two events are
3323		 * posted before the polling process(es) is
3324		 * awakened.  This also ensures that we take at
3325		 * most one selwakeup() if the polling process
3326		 * is no longer interested.  However, it does
3327		 * mean that only one event can be noticed at
3328		 * a time.  (Perhaps we should only clear those
3329		 * event bits which we note?) XXX
3330		 */
3331		vp->v_pollinfo->vpi_events = 0;	/* &= ~events ??? */
3332		vp->v_pollinfo->vpi_revents |= events;
3333		selwakeuppri(&vp->v_pollinfo->vpi_selinfo, PRIBIO);
3334	}
3335	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3336}
3337
3338/*
3339 * Wake up anyone polling on vp because it is being revoked.
3340 * This depends on dead_poll() returning POLLHUP for correct
3341 * behavior.
3342 */
3343void
3344vn_pollgone(vp)
3345	struct vnode *vp;
3346{
3347
3348	mtx_lock(&vp->v_pollinfo->vpi_lock);
3349	VN_KNOTE(vp, NOTE_REVOKE);
3350	if (vp->v_pollinfo->vpi_events) {
3351		vp->v_pollinfo->vpi_events = 0;
3352		selwakeuppri(&vp->v_pollinfo->vpi_selinfo, PRIBIO);
3353	}
3354	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3355}
3356
3357
3358
3359/*
3360 * Routine to create and manage a filesystem syncer vnode.
3361 */
3362#define sync_close ((int (*)(struct  vop_close_args *))nullop)
3363static int	sync_fsync(struct  vop_fsync_args *);
3364static int	sync_inactive(struct  vop_inactive_args *);
3365static int	sync_reclaim(struct  vop_reclaim_args *);
3366
3367static vop_t **sync_vnodeop_p;
3368static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
3369	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
3370	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
3371	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
3372	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
3373	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
3374	{ &vop_lock_desc,	(vop_t *) vop_stdlock },	/* lock */
3375	{ &vop_unlock_desc,	(vop_t *) vop_stdunlock },	/* unlock */
3376	{ &vop_islocked_desc,	(vop_t *) vop_stdislocked },	/* islocked */
3377	{ NULL, NULL }
3378};
3379static struct vnodeopv_desc sync_vnodeop_opv_desc =
3380	{ &sync_vnodeop_p, sync_vnodeop_entries };
3381
3382VNODEOP_SET(sync_vnodeop_opv_desc);
3383
3384/*
3385 * Create a new filesystem syncer vnode for the specified mount point.
3386 */
3387int
3388vfs_allocate_syncvnode(mp)
3389	struct mount *mp;
3390{
3391	struct vnode *vp;
3392	static long start, incr, next;
3393	int error;
3394
3395	/* Allocate a new vnode */
3396	if ((error = getnewvnode("syncer", mp, sync_vnodeop_p, &vp)) != 0) {
3397		mp->mnt_syncer = NULL;
3398		return (error);
3399	}
3400	vp->v_type = VNON;
3401	/*
3402	 * Place the vnode onto the syncer worklist. We attempt to
3403	 * scatter them about on the list so that they will go off
3404	 * at evenly distributed times even if all the filesystems
3405	 * are mounted at once.
3406	 */
3407	next += incr;
3408	if (next == 0 || next > syncer_maxdelay) {
3409		start /= 2;
3410		incr /= 2;
3411		if (start == 0) {
3412			start = syncer_maxdelay / 2;
3413			incr = syncer_maxdelay;
3414		}
3415		next = start;
3416	}
3417	VI_LOCK(vp);
3418	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
3419	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
3420	mtx_lock(&sync_mtx);
3421	sync_vnode_count++;
3422	mtx_unlock(&sync_mtx);
3423	VI_UNLOCK(vp);
3424	mp->mnt_syncer = vp;
3425	return (0);
3426}
3427
3428/*
3429 * Do a lazy sync of the filesystem.
3430 */
3431static int
3432sync_fsync(ap)
3433	struct vop_fsync_args /* {
3434		struct vnode *a_vp;
3435		struct ucred *a_cred;
3436		int a_waitfor;
3437		struct thread *a_td;
3438	} */ *ap;
3439{
3440	struct vnode *syncvp = ap->a_vp;
3441	struct mount *mp = syncvp->v_mount;
3442	struct thread *td = ap->a_td;
3443	int error, asyncflag;
3444
3445	/*
3446	 * We only need to do something if this is a lazy evaluation.
3447	 */
3448	if (ap->a_waitfor != MNT_LAZY)
3449		return (0);
3450
3451	/*
3452	 * Move ourselves to the back of the sync list.
3453	 */
3454	VI_LOCK(syncvp);
3455	vn_syncer_add_to_worklist(syncvp, syncdelay);
3456	VI_UNLOCK(syncvp);
3457
3458	/*
3459	 * Walk the list of vnodes pushing all that are dirty and
3460	 * not already on the sync list.
3461	 */
3462	mtx_lock(&mountlist_mtx);
3463	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
3464		mtx_unlock(&mountlist_mtx);
3465		return (0);
3466	}
3467	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3468		vfs_unbusy(mp, td);
3469		return (0);
3470	}
3471	asyncflag = mp->mnt_flag & MNT_ASYNC;
3472	mp->mnt_flag &= ~MNT_ASYNC;
3473	vfs_msync(mp, MNT_NOWAIT);
3474	error = VFS_SYNC(mp, MNT_LAZY, ap->a_cred, td);
3475	if (asyncflag)
3476		mp->mnt_flag |= MNT_ASYNC;
3477	vn_finished_write(mp);
3478	vfs_unbusy(mp, td);
3479	return (error);
3480}
3481
3482/*
3483 * The syncer vnode is no longer referenced.
3484 */
3485static int
3486sync_inactive(ap)
3487	struct vop_inactive_args /* {
3488		struct vnode *a_vp;
3489		struct thread *a_td;
3490	} */ *ap;
3491{
3492
3493	VOP_UNLOCK(ap->a_vp, 0, ap->a_td);
3494	vgone(ap->a_vp);
3495	return (0);
3496}
3497
3498/*
3499 * The syncer vnode is no longer needed and is being decommissioned.
3500 *
3501 * Modifications to the worklist must be protected by sync_mtx.
3502 */
3503static int
3504sync_reclaim(ap)
3505	struct vop_reclaim_args /* {
3506		struct vnode *a_vp;
3507	} */ *ap;
3508{
3509	struct vnode *vp = ap->a_vp;
3510
3511	VI_LOCK(vp);
3512	vp->v_mount->mnt_syncer = NULL;
3513	if (vp->v_iflag & VI_ONWORKLST) {
3514		mtx_lock(&sync_mtx);
3515		LIST_REMOVE(vp, v_synclist);
3516 		syncer_worklist_len--;
3517		sync_vnode_count--;
3518		mtx_unlock(&sync_mtx);
3519		vp->v_iflag &= ~VI_ONWORKLST;
3520	}
3521	VI_UNLOCK(vp);
3522
3523	return (0);
3524}
3525
3526/*
3527 * Extract the struct cdev * from a VCHR vnode.
3528 */
3529struct cdev *
3530vn_todev(vp)
3531	struct vnode *vp;
3532{
3533
3534	if (vp->v_type != VCHR)
3535		return (NULL);
3536	return (vp->v_rdev);
3537}
3538
3539/*
3540 * Check if vnode represents a disk device
3541 */
3542int
3543vn_isdisk(vp, errp)
3544	struct vnode *vp;
3545	int *errp;
3546{
3547	int error;
3548
3549	error = 0;
3550	if (vp->v_type != VCHR)
3551		error = ENOTBLK;
3552	else if (vp->v_rdev == NULL)
3553		error = ENXIO;
3554	else if (!(devsw(vp->v_rdev)->d_flags & D_DISK))
3555		error = ENOTBLK;
3556	if (errp != NULL)
3557		*errp = error;
3558	return (error == 0);
3559}
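
/*
 * Illustrative sketch: mount code that requires a real disk device can
 * validate a candidate device vnode with
 *
 *	if (!vn_isdisk(devvp, &error))
 *		return (error);
 *
 * receiving ENOTBLK or ENXIO through the optional error pointer.
 */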
3560
3561/*
3562 * Free data allocated by namei(); see namei(9) for details.
3563 */
3564void
3565NDFREE(ndp, flags)
3566     struct nameidata *ndp;
3567     const u_int flags;
3568{
3569
3570	if (!(flags & NDF_NO_FREE_PNBUF) &&
3571	    (ndp->ni_cnd.cn_flags & HASBUF)) {
3572		uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
3573		ndp->ni_cnd.cn_flags &= ~HASBUF;
3574	}
3575	if (!(flags & NDF_NO_DVP_UNLOCK) &&
3576	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
3577	    ndp->ni_dvp != ndp->ni_vp)
3578		VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread);
3579	if (!(flags & NDF_NO_DVP_RELE) &&
3580	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
3581		vrele(ndp->ni_dvp);
3582		ndp->ni_dvp = NULL;
3583	}
3584	if (!(flags & NDF_NO_VP_UNLOCK) &&
3585	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
3586		VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread);
3587	if (!(flags & NDF_NO_VP_RELE) &&
3588	    ndp->ni_vp) {
3589		vrele(ndp->ni_vp);
3590		ndp->ni_vp = NULL;
3591	}
3592	if (!(flags & NDF_NO_STARTDIR_RELE) &&
3593	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
3594		vrele(ndp->ni_startdir);
3595		ndp->ni_startdir = NULL;
3596	}
3597}
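
/*
 * Usage sketch (illustrative; "path" is an arbitrary caller-supplied
 * string): a lookup that only needs the leaf vnode typically frees just
 * the pathname buffer and releases the vnode itself explicitly:
 *
 *	NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_SYSSPACE, path, td);
 *	if ((error = namei(&nd)) != 0)
 *		return (error);
 *	NDFREE(&nd, NDF_ONLY_PNBUF);
 *	... use nd.ni_vp ...
 *	vput(nd.ni_vp);
 */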
3598
3599/*
3600 * Common filesystem object access control check routine.  Accepts a
3601 * vnode's type, "mode", uid and gid, requested access mode, credentials,
3602 * and optional call-by-reference privused argument allowing vaccess()
3603 * to indicate to the caller whether privilege was used to satisfy the
3604 * request (obsoleted).  Returns 0 on success, or an errno on failure.
3605 */
3606int
3607vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
3608	enum vtype type;
3609	mode_t file_mode;
3610	uid_t file_uid;
3611	gid_t file_gid;
3612	mode_t acc_mode;
3613	struct ucred *cred;
3614	int *privused;
3615{
3616	mode_t dac_granted;
3617#ifdef CAPABILITIES
3618	mode_t cap_granted;
3619#endif
3620
3621	/*
3622	 * Look for a normal, non-privileged way to access the file/directory
3623	 * as requested.  If it exists, go with that.
3624	 */
3625
3626	if (privused != NULL)
3627		*privused = 0;
3628
3629	dac_granted = 0;
3630
3631	/* Check the owner. */
3632	if (cred->cr_uid == file_uid) {
3633		dac_granted |= VADMIN;
3634		if (file_mode & S_IXUSR)
3635			dac_granted |= VEXEC;
3636		if (file_mode & S_IRUSR)
3637			dac_granted |= VREAD;
3638		if (file_mode & S_IWUSR)
3639			dac_granted |= (VWRITE | VAPPEND);
3640
3641		if ((acc_mode & dac_granted) == acc_mode)
3642			return (0);
3643
3644		goto privcheck;
3645	}
3646
3647	/* Otherwise, check the groups (first match) */
3648	if (groupmember(file_gid, cred)) {
3649		if (file_mode & S_IXGRP)
3650			dac_granted |= VEXEC;
3651		if (file_mode & S_IRGRP)
3652			dac_granted |= VREAD;
3653		if (file_mode & S_IWGRP)
3654			dac_granted |= (VWRITE | VAPPEND);
3655
3656		if ((acc_mode & dac_granted) == acc_mode)
3657			return (0);
3658
3659		goto privcheck;
3660	}
3661
3662	/* Otherwise, check everyone else. */
3663	if (file_mode & S_IXOTH)
3664		dac_granted |= VEXEC;
3665	if (file_mode & S_IROTH)
3666		dac_granted |= VREAD;
3667	if (file_mode & S_IWOTH)
3668		dac_granted |= (VWRITE | VAPPEND);
3669	if ((acc_mode & dac_granted) == acc_mode)
3670		return (0);
3671
3672privcheck:
3673	if (!suser_cred(cred, PRISON_ROOT)) {
3674		/* XXX audit: privilege used */
3675		if (privused != NULL)
3676			*privused = 1;
3677		return (0);
3678	}
3679
3680#ifdef CAPABILITIES
3681	/*
3682	 * Build a capability mask to determine if the set of capabilities
3683	 * satisfies the requirements when combined with the granted mask
3684	 * from above.
3685	 * For each capability, if the capability is required, bitwise
3686	 * or the request type onto the cap_granted mask.
3687	 */
3688	cap_granted = 0;
3689
3690	if (type == VDIR) {
3691		/*
3692		 * For directories, use CAP_DAC_READ_SEARCH to satisfy
3693		 * VEXEC requests, instead of CAP_DAC_EXECUTE.
3694		 */
3695		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3696		    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3697			cap_granted |= VEXEC;
3698	} else {
3699		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3700		    !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT))
3701			cap_granted |= VEXEC;
3702	}
3703
3704	if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
3705	    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3706		cap_granted |= VREAD;
3707
3708	if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3709	    !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT))
3710		cap_granted |= (VWRITE | VAPPEND);
3711
3712	if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3713	    !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT))
3714		cap_granted |= VADMIN;
3715
3716	if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
3717		/* XXX audit: privilege used */
3718		if (privused != NULL)
3719			*privused = 1;
3720		return (0);
3721	}
3722#endif
3723
3724	return ((acc_mode & VADMIN) ? EPERM : EACCES);
3725}
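
/*
 * Sketch of a typical caller (illustrative; "ip" stands for the
 * filesystem-specific inode of vp): a filesystem's VOP_ACCESS routine
 * usually finishes by delegating the generic mode/uid/gid check here:
 *
 *	return (vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
 *	    ap->a_mode, ap->a_cred, NULL));
 */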
3726
3727/*
3728 * Credential check based on process requesting service, and per-attribute
3729 * permissions.
3730 */
3731int
3732extattr_check_cred(struct vnode *vp, int attrnamespace,
3733    struct ucred *cred, struct thread *td, int access)
3734{
3735
3736	/*
3737	 * Kernel-invoked always succeeds.
3738	 */
3739	if (cred == NOCRED)
3740		return (0);
3741
3742	/*
3743	 * Do not allow privileged processes in jail to directly
3744	 * manipulate system attributes.
3745	 *
3746	 * XXX What capability should apply here?
3747	 * Probably CAP_SYS_SETFFLAG.
3748	 */
3749	switch (attrnamespace) {
3750	case EXTATTR_NAMESPACE_SYSTEM:
3751		/* Potentially should be: return (EPERM); */
3752		return (suser_cred(cred, 0));
3753	case EXTATTR_NAMESPACE_USER:
3754		return (VOP_ACCESS(vp, access, cred, td));
3755	default:
3756		return (EPERM);
3757	}
3758}
3759
3760#ifdef DEBUG_VFS_LOCKS
3761/*
3762 * This only exists to supress warnings from unlocked specfs accesses.  It is
3763 * no longer ok to have an unlocked VFS.
3764 */
3765#define	IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD)
3766
3767int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
3768int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
3769int vfs_badlock_print = 1;	/* Print lock violations. */
3770
3771static void
3772vfs_badlock(const char *msg, const char *str, struct vnode *vp)
3773{
3774
3775	if (vfs_badlock_print)
3776		printf("%s: %p %s\n", str, (void *)vp, msg);
3777	if (vfs_badlock_ddb)
3778		Debugger("lock violation");
3779}
3780
3781void
3782assert_vi_locked(struct vnode *vp, const char *str)
3783{
3784
3785	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
3786		vfs_badlock("interlock is not locked but should be", str, vp);
3787}
3788
3789void
3790assert_vi_unlocked(struct vnode *vp, const char *str)
3791{
3792
3793	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
3794		vfs_badlock("interlock is locked but should not be", str, vp);
3795}
3796
3797void
3798assert_vop_locked(struct vnode *vp, const char *str)
3799{
3800
3801	if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, NULL) == 0)
3802		vfs_badlock("is not locked but should be", str, vp);
3803}
3804
3805void
3806assert_vop_unlocked(struct vnode *vp, const char *str)
3807{
3808
3809	if (vp && !IGNORE_LOCK(vp) &&
3810	    VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE)
3811		vfs_badlock("is locked but should not be", str, vp);
3812}
3813
3814#if 0
3815void
3816assert_vop_elocked(struct vnode *vp, const char *str)
3817{
3818
3819	if (vp && !IGNORE_LOCK(vp) &&
3820	    VOP_ISLOCKED(vp, curthread) != LK_EXCLUSIVE)
3821		vfs_badlock("is not exclusive locked but should be", str, vp);
3822}
3823
3824void
3825assert_vop_elocked_other(struct vnode *vp, const char *str)
3826{
3827
3828	if (vp && !IGNORE_LOCK(vp) &&
3829	    VOP_ISLOCKED(vp, curthread) != LK_EXCLOTHER)
3830		vfs_badlock("is not exclusive locked by another thread",
3831		    str, vp);
3832}
3833
3834void
3835assert_vop_slocked(struct vnode *vp, const char *str)
3836{
3837
3838	if (vp && !IGNORE_LOCK(vp) &&
3839	    VOP_ISLOCKED(vp, curthread) != LK_SHARED)
3840		vfs_badlock("is not locked shared but should be", str, vp);
3841}
3842#endif /* 0 */
3843
3844void
3845vop_rename_pre(void *ap)
3846{
3847	struct vop_rename_args *a = ap;
3848
3849	if (a->a_tvp)
3850		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
3851	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
3852	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
3853	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
3854
3855	/* Check the source (from). */
3856	if (a->a_tdvp != a->a_fdvp)
3857		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
3858	if (a->a_tvp != a->a_fvp)
3859		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
3860
3861	/* Check the target. */
3862	if (a->a_tvp)
3863		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
3864	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
3865}
3866
3867void
3868vop_strategy_pre(void *ap)
3869{
3870	struct vop_strategy_args *a;
3871	struct buf *bp;
3872
3873	a = ap;
3874	bp = a->a_bp;
3875
3876	/*
3877	 * Cluster ops lock their component buffers but not the IO container.
3878	 */
3879	if ((bp->b_flags & B_CLUSTER) != 0)
3880		return;
3881
3882	if (BUF_REFCNT(bp) < 1) {
3883		if (vfs_badlock_print)
3884			printf(
3885			    "VOP_STRATEGY: bp is not locked but should be\n");
3886		if (vfs_badlock_ddb)
3887			Debugger("lock violation");
3888	}
3889}
3890
3891void
3892vop_lookup_pre(void *ap)
3893{
3894	struct vop_lookup_args *a;
3895	struct vnode *dvp;
3896
3897	a = ap;
3898	dvp = a->a_dvp;
3899	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
3900	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
3901}
3902
3903void
3904vop_lookup_post(void *ap, int rc)
3905{
3906	struct vop_lookup_args *a;
3907	struct componentname *cnp;
3908	struct vnode *dvp;
3909	struct vnode *vp;
3910	int flags;
3911
3912	a = ap;
3913	dvp = a->a_dvp;
3914	cnp = a->a_cnp;
3915	vp = *(a->a_vpp);
3916	flags = cnp->cn_flags;
3917
3918	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
3919
3920	/*
3921	 * If this is the last path component for this lookup and LOCKPARENT
3922	 * is set, or if there is an error, the directory has to be locked.
3923	 */
3924	if ((flags & LOCKPARENT) && (flags & ISLASTCN))
3925		ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (LOCKPARENT)");
3926	else if (rc != 0)
3927		ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (error)");
3928	else if (dvp != vp)
3929		ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (dvp)");
3930	if (flags & PDIRUNLOCK)
3931		ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (PDIRUNLOCK)");
3932}
3933
3934void
3935vop_lock_pre(void *ap)
3936{
3937	struct vop_lock_args *a = ap;
3938
3939	if ((a->a_flags & LK_INTERLOCK) == 0)
3940		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
3941	else
3942		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
3943}
3944
3945void
3946vop_lock_post(void *ap, int rc)
3947{
3948	struct vop_lock_args *a = ap;
3949
3950	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
3951	if (rc == 0)
3952		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
3953}
3954
3955void
3956vop_unlock_pre(void *ap)
3957{
3958	struct vop_unlock_args *a = ap;
3959
3960	if (a->a_flags & LK_INTERLOCK)
3961		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
3962	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
3963}
3964
3965void
3966vop_unlock_post(void *ap, int rc)
3967{
3968	struct vop_unlock_args *a = ap;
3969
3970	if (a->a_flags & LK_INTERLOCK)
3971		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
3972}
3973#endif /* DEBUG_VFS_LOCKS */
3974
3975static struct klist fs_klist = SLIST_HEAD_INITIALIZER(&fs_klist);
3976
3977void
3978vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data __unused)
3979{
3980
3981	KNOTE(&fs_klist, event);
3982}
3983
3984static int	filt_fsattach(struct knote *kn);
3985static void	filt_fsdetach(struct knote *kn);
3986static int	filt_fsevent(struct knote *kn, long hint);
3987
3988struct filterops fs_filtops =
3989	{ 0, filt_fsattach, filt_fsdetach, filt_fsevent };
3990
3991static int
3992filt_fsattach(struct knote *kn)
3993{
3994
3995	kn->kn_flags |= EV_CLEAR;
3996	SLIST_INSERT_HEAD(&fs_klist, kn, kn_selnext);
3997	return (0);
3998}
3999
4000static void
4001filt_fsdetach(struct knote *kn)
4002{
4003
4004	SLIST_REMOVE(&fs_klist, kn, knote, kn_selnext);
4005}
4006
4007static int
4008filt_fsevent(struct knote *kn, long hint)
4009{
4010
4011	kn->kn_fflags |= hint;
4012	return (kn->kn_fflags != 0);
4013}
4014
4015static int
4016sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
4017{
4018	struct vfsidctl vc;
4019	int error;
4020	struct mount *mp;
4021
4022	error = SYSCTL_IN(req, &vc, sizeof(vc));
4023	if (error)
4024		return (error);
4025	if (vc.vc_vers != VFS_CTL_VERS1)
4026		return (EINVAL);
4027	mp = vfs_getvfs(&vc.vc_fsid);
4028	if (mp == NULL)
4029		return (ENOENT);
4030	/* ensure that a specific sysctl goes to the right filesystem. */
4031	if (strcmp(vc.vc_fstypename, "*") != 0 &&
4032	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
4033		return (EINVAL);
4034	}
4035	VCTLTOREQ(&vc, req);
4036	return (VFS_SYSCTL(mp, vc.vc_op, req));
4037}
4038
4039SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_RD,
4040        NULL, 0, sysctl_vfs_ctl, "", "Message queue IDs");
4041