vfs_subr.c revision 114074
1/*
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
39 * $FreeBSD: head/sys/kern/vfs_subr.c 114074 2003-04-26 18:33:18Z alc $
40 */
41
42/*
43 * External virtual filesystem routines
44 */
45#include "opt_ddb.h"
46#include "opt_mac.h"
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/bio.h>
51#include <sys/buf.h>
52#include <sys/conf.h>
53#include <sys/eventhandler.h>
54#include <sys/extattr.h>
55#include <sys/fcntl.h>
56#include <sys/kernel.h>
57#include <sys/kthread.h>
58#include <sys/mac.h>
59#include <sys/malloc.h>
60#include <sys/mount.h>
61#include <sys/namei.h>
62#include <sys/stat.h>
63#include <sys/sysctl.h>
64#include <sys/syslog.h>
65#include <sys/vmmeter.h>
66#include <sys/vnode.h>
67
68#include <vm/vm.h>
69#include <vm/vm_object.h>
70#include <vm/vm_extern.h>
71#include <vm/pmap.h>
72#include <vm/vm_map.h>
73#include <vm/vm_page.h>
74#include <vm/uma.h>
75
/* malloc type used for export host address structures. */
static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

/* Forward declarations of helpers that are private to this file. */
static void	addalias(struct vnode *vp, dev_t nvp_rdev);
static void	insmntque(struct vnode *vp, struct mount *mp);
static void	vclean(struct vnode *vp, int flags, struct thread *td);
static void	vlruvp(struct vnode *vp);
static int	flushbuflist(struct buf *blist, int flags, struct vnode *vp,
		    int slpflag, int slptimeo, int *errorp);
static int	vcanrecycle(struct vnode *vp, struct mount **vnmpp);
85
86
/*
 * Number of vnodes in existence.  Increased whenever getnewvnode()
 * allocates a new vnode, never decreased.
 */
static unsigned long	numvnodes;

/* Read-only sysctl export of numvnodes. */
SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");

/*
 * Conversion tables for conversion from vnode types to inode formats
 * and back.
 */
enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

/*
 * List of vnodes that are ready for recycling.
 */
static TAILQ_HEAD(freelst, vnode) vnode_free_list;

/*
 * Minimum number of free vnodes.  If there are fewer than this many free
 * vnodes, getnewvnode() will return a newly allocated vnode.
 */
static u_long wantfreevnodes = 25;
SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
/* Number of vnodes in the free list. */
static u_long freevnodes;
SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");

/*
 * Various variables used for debugging the new implementation of
 * reassignbuf().
 * XXX these are probably of (very) limited utility now.
 */
static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
/*
 * Debugging knob consulted by vcanrecycle() when deciding whether a
 * namei-cached directory vnode may be reused.
 */
static int nameileafonly;
SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");

/*
 * Cache for the mount type id assigned to NFS.  This is used for
 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
 */
int	nfs_mount_type = -1;

/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;

/*
 * Lock for any access to the following:
 *	vnode_free_list
 *	numvnodes
 *	freevnodes
 */
static struct mtx vnode_free_list_mtx;

/*
 * For any iteration/modification of dev->si_hlist (linked through
 * v_specnext)
 */
static struct mtx spechash_mtx;

/* Publicly exported FS */
struct nfs_public nfs_pub;

/*
 * Zones for allocation of new vnodes and of their poll information.
 * vnode_zone is allocated from by getnewvnode(); vnodepoll_zone is
 * allocated from by v_addpollinfo() and freed to by getnewvnode().
 */
static uma_zone_t vnode_zone;
static uma_zone_t vnodepoll_zone;

/* Set to 1 to print out reclaim of active vnodes */
int	prtactive;
165
/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed. To realize this,
 * we append vnodes to a "workitem" queue. When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds. Thus, metadata updates for
 * filesystems mounted on block devices are delayed only about half the
 * time that file data is delayed. Similarly, directory updates are more
 * critical, so are only delayed about a third the time that file data is
 * delayed. Thus, there are SYNCER_MAXDELAY queues that are processed
 * round-robin at a rate of one each second (driven off the filesystem
 * syncer process). The syncer_delayno variable indicates the next queue
 * that is to be processed. Items that need to be processed soon are
 * placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 * NOTE(review): the default delays below (filedelay 30, dirdelay 29,
 * metadelay 28) do not match the "half" and "third" ratios described
 * above; confirm which is intended.
 */
static int syncer_delayno;
static long syncer_mask;
LIST_HEAD(synclist, vnode);
static struct synclist *syncer_workitem_pending;
/*
 * The sync_mtx protects:
 *	vp->v_synclist
 *	syncer_delayno
 *	syncer_workitem_pending
 *	rushjob
 */
static struct mtx sync_mtx;

#define SYNCER_MAXDELAY		32
/* Rounded in vntblinit() to the size of the table hashinit() returns. */
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
static int syncdelay = 30;		/* max time to delay syncing data */
static int filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
static int dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
static int metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
static int rushjob;		/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
216
/*
 * Number of vnodes we want to exist at any one time.  This is mostly used
 * to size hash tables in vnode-related code.  getnewvnode() also compares
 * the number of in-use vnodes (numvnodes - freevnodes) against it and
 * waits for the vnlru process when the limit is exceeded.
 *
 * XXX desiredvnodes is historical cruft and should not exist.
 */
int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
    &desiredvnodes, 0, "Maximum number of vnodes");
/* getnewvnode() does not reuse free vnodes while numvnodes is below this. */
static int minvnodes;
SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
    &minvnodes, 0, "Minimum number of vnodes");
/* Incremented by vnlru_proc() each time a full pass reclaims nothing. */
static int vnlru_nowhere;
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0,
    "Number of times the vnlru process ran without success");

/* Hook for calling soft updates */
int (*softdep_process_worklist_hook)(struct mount *);

/*
 * This only exists to suppress warnings from unlocked specfs accesses.  It is
 * no longer ok to have an unlocked VFS.
 */
#define IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD)

/* Print lock violations */
int vfs_badlock_print = 1;

/* Panic on violation */
int vfs_badlock_panic = 1;

/* Check for interlock across VOPs */
int vfs_badlock_mutex = 1;
251
252static void
253vfs_badlock(char *msg, char *str, struct vnode *vp)
254{
255	if (vfs_badlock_print)
256		printf("%s: %p %s\n", str, vp, msg);
257	if (vfs_badlock_panic)
258		Debugger("Lock violation.\n");
259}
260
261void
262assert_vi_unlocked(struct vnode *vp, char *str)
263{
264	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
265		vfs_badlock("interlock is locked but should not be", str, vp);
266}
267
268void
269assert_vi_locked(struct vnode *vp, char *str)
270{
271	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
272		vfs_badlock("interlock is not locked but should be", str, vp);
273}
274
275void
276assert_vop_locked(struct vnode *vp, char *str)
277{
278	if (vp && !IGNORE_LOCK(vp) && !VOP_ISLOCKED(vp, NULL))
279		vfs_badlock("is not locked but should be", str, vp);
280}
281
282void
283assert_vop_unlocked(struct vnode *vp, char *str)
284{
285	if (vp && !IGNORE_LOCK(vp) &&
286	    VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE)
287		vfs_badlock("is locked but should not be", str, vp);
288}
289
290void
291assert_vop_elocked(struct vnode *vp, char *str)
292{
293	if (vp && !IGNORE_LOCK(vp) &&
294	    VOP_ISLOCKED(vp, curthread) != LK_EXCLUSIVE)
295		vfs_badlock("is not exclusive locked but should be", str, vp);
296}
297
298void
299assert_vop_elocked_other(struct vnode *vp, char *str)
300{
301	if (vp && !IGNORE_LOCK(vp) &&
302	    VOP_ISLOCKED(vp, curthread) != LK_EXCLOTHER)
303		vfs_badlock("is not exclusive locked by another thread",
304		    str, vp);
305}
306
307void
308assert_vop_slocked(struct vnode *vp, char *str)
309{
310	if (vp && !IGNORE_LOCK(vp) &&
311	    VOP_ISLOCKED(vp, curthread) != LK_SHARED)
312		vfs_badlock("is not locked shared but should be", str, vp);
313}
314
315void
316vop_rename_pre(void *ap)
317{
318	struct vop_rename_args *a = ap;
319
320	if (a->a_tvp)
321		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
322	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
323	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
324	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
325
326	/* Check the source (from) */
327	if (a->a_tdvp != a->a_fdvp)
328		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked.\n");
329	if (a->a_tvp != a->a_fvp)
330		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: tvp locked.\n");
331
332	/* Check the target */
333	if (a->a_tvp)
334		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked.\n");
335
336	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked.\n");
337}
338
339void
340vop_strategy_pre(void *ap)
341{
342	struct vop_strategy_args *a = ap;
343	struct buf *bp;
344
345	bp = a->a_bp;
346
347	/*
348	 * Cluster ops lock their component buffers but not the IO container.
349	 */
350	if ((bp->b_flags & B_CLUSTER) != 0)
351		return;
352
353	if (BUF_REFCNT(bp) < 1) {
354		if (vfs_badlock_print)
355			printf("VOP_STRATEGY: bp is not locked but should be.\n");
356		if (vfs_badlock_panic)
357			Debugger("Lock violation.\n");
358	}
359}
360
361void
362vop_lookup_pre(void *ap)
363{
364	struct vop_lookup_args *a = ap;
365	struct vnode *dvp;
366
367	dvp = a->a_dvp;
368
369	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
370	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
371}
372
373void
374vop_lookup_post(void *ap, int rc)
375{
376	struct vop_lookup_args *a = ap;
377	struct componentname *cnp;
378	struct vnode *dvp;
379	struct vnode *vp;
380	int flags;
381
382	dvp = a->a_dvp;
383	cnp = a->a_cnp;
384	vp = *(a->a_vpp);
385	flags = cnp->cn_flags;
386
387
388	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
389	/*
390	 * If this is the last path component for this lookup and LOCPARENT
391	 * is set, OR if there is an error the directory has to be locked.
392	 */
393	if ((flags & LOCKPARENT) && (flags & ISLASTCN))
394		ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (LOCKPARENT)");
395	else if (rc != 0)
396		ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (error)");
397	else if (dvp != vp)
398		ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (dvp)");
399
400	if (flags & PDIRUNLOCK)
401		ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (PDIRUNLOCK)");
402}
403
404void
405vop_unlock_pre(void *ap)
406{
407	struct vop_unlock_args *a = ap;
408
409	if (a->a_flags & LK_INTERLOCK)
410		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
411
412	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
413}
414
415void
416vop_unlock_post(void *ap, int rc)
417{
418	struct vop_unlock_args *a = ap;
419
420	if (a->a_flags & LK_INTERLOCK)
421		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
422}
423
424void
425vop_lock_pre(void *ap)
426{
427	struct vop_lock_args *a = ap;
428
429	if ((a->a_flags & LK_INTERLOCK) == 0)
430		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
431	else
432		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
433}
434
435void
436vop_lock_post(void *ap, int rc)
437{
438	struct vop_lock_args *a;
439
440	a = ap;
441
442	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
443	if (rc == 0)
444		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
445}
446
447void
448v_addpollinfo(struct vnode *vp)
449{
450	vp->v_pollinfo = uma_zalloc(vnodepoll_zone, M_WAITOK);
451	mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
452}
453
454/*
455 * Initialize the vnode management data structures.
456 */
457static void
458vntblinit(void *dummy __unused)
459{
460
461	desiredvnodes = maxproc + cnt.v_page_count / 4;
462	minvnodes = desiredvnodes / 4;
463	mtx_init(&mountlist_mtx, "mountlist", NULL, MTX_DEF);
464	mtx_init(&mntvnode_mtx, "mntvnode", NULL, MTX_DEF);
465	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
466	mtx_init(&spechash_mtx, "spechash", NULL, MTX_DEF);
467	TAILQ_INIT(&vnode_free_list);
468	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
469	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
470	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
471	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
472	      NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
473	/*
474	 * Initialize the filesystem syncer.
475	 */
476	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
477		&syncer_mask);
478	syncer_maxdelay = syncer_mask + 1;
479	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
480}
481SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)
482
483
484/*
485 * Mark a mount point as busy. Used to synchronize access and to delay
486 * unmounting. Interlock is not released on failure.
487 */
488int
489vfs_busy(mp, flags, interlkp, td)
490	struct mount *mp;
491	int flags;
492	struct mtx *interlkp;
493	struct thread *td;
494{
495	int lkflags;
496
497	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
498		if (flags & LK_NOWAIT)
499			return (ENOENT);
500		mp->mnt_kern_flag |= MNTK_MWAIT;
501		/*
502		 * Since all busy locks are shared except the exclusive
503		 * lock granted when unmounting, the only place that a
504		 * wakeup needs to be done is at the release of the
505		 * exclusive lock at the end of dounmount.
506		 */
507		msleep(mp, interlkp, PVFS, "vfs_busy", 0);
508		return (ENOENT);
509	}
510	lkflags = LK_SHARED | LK_NOPAUSE;
511	if (interlkp)
512		lkflags |= LK_INTERLOCK;
513	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td))
514		panic("vfs_busy: unexpected lock failure");
515	return (0);
516}
517
518/*
519 * Free a busy filesystem.
520 */
521void
522vfs_unbusy(mp, td)
523	struct mount *mp;
524	struct thread *td;
525{
526
527	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
528}
529
530/*
531 * Lookup a mount point by filesystem identifier.
532 */
533struct mount *
534vfs_getvfs(fsid)
535	fsid_t *fsid;
536{
537	register struct mount *mp;
538
539	mtx_lock(&mountlist_mtx);
540	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
541		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
542		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
543			mtx_unlock(&mountlist_mtx);
544			return (mp);
545		}
546	}
547	mtx_unlock(&mountlist_mtx);
548	return ((struct mount *) 0);
549}
550
551/*
552 * Get a new unique fsid.  Try to make its val[0] unique, since this value
553 * will be used to create fake device numbers for stat().  Also try (but
554 * not so hard) make its val[0] unique mod 2^16, since some emulators only
555 * support 16-bit device numbers.  We end up with unique val[0]'s for the
556 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
557 *
558 * Keep in mind that several mounts may be running in parallel.  Starting
559 * the search one past where the previous search terminated is both a
560 * micro-optimization and a defense against returning the same fsid to
561 * different mounts.
562 */
563void
564vfs_getnewfsid(mp)
565	struct mount *mp;
566{
567	static u_int16_t mntid_base;
568	fsid_t tfsid;
569	int mtype;
570
571	mtx_lock(&mntid_mtx);
572	mtype = mp->mnt_vfc->vfc_typenum;
573	tfsid.val[1] = mtype;
574	mtype = (mtype & 0xFF) << 24;
575	for (;;) {
576		tfsid.val[0] = makeudev(255,
577		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
578		mntid_base++;
579		if (vfs_getvfs(&tfsid) == NULL)
580			break;
581	}
582	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
583	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
584	mtx_unlock(&mntid_mtx);
585}
586
/*
 * Knob to control the precision of file timestamps:
 *
 *   0 = seconds only; nanoseconds zeroed.
 *   1 = seconds and nanoseconds, accurate within 1/HZ.
 *   2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 *
 * The enumerators below name the values 0 through 3 in that order.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

/* Precision used by vfs_timestamp(); defaults to whole seconds. */
static int timestamp_precision = TSP_SEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "");
600
601/*
602 * Get a current timestamp.
603 */
604void
605vfs_timestamp(tsp)
606	struct timespec *tsp;
607{
608	struct timeval tv;
609
610	switch (timestamp_precision) {
611	case TSP_SEC:
612		tsp->tv_sec = time_second;
613		tsp->tv_nsec = 0;
614		break;
615	case TSP_HZ:
616		getnanotime(tsp);
617		break;
618	case TSP_USEC:
619		microtime(&tv);
620		TIMEVAL_TO_TIMESPEC(&tv, tsp);
621		break;
622	case TSP_NSEC:
623	default:
624		nanotime(tsp);
625		break;
626	}
627}
628
629/*
630 * Set vnode attributes to VNOVAL
631 */
632void
633vattr_null(vap)
634	register struct vattr *vap;
635{
636
637	vap->va_type = VNON;
638	vap->va_size = VNOVAL;
639	vap->va_bytes = VNOVAL;
640	vap->va_mode = VNOVAL;
641	vap->va_nlink = VNOVAL;
642	vap->va_uid = VNOVAL;
643	vap->va_gid = VNOVAL;
644	vap->va_fsid = VNOVAL;
645	vap->va_fileid = VNOVAL;
646	vap->va_blocksize = VNOVAL;
647	vap->va_rdev = VNOVAL;
648	vap->va_atime.tv_sec = VNOVAL;
649	vap->va_atime.tv_nsec = VNOVAL;
650	vap->va_mtime.tv_sec = VNOVAL;
651	vap->va_mtime.tv_nsec = VNOVAL;
652	vap->va_ctime.tv_sec = VNOVAL;
653	vap->va_ctime.tv_nsec = VNOVAL;
654	vap->va_birthtime.tv_sec = VNOVAL;
655	vap->va_birthtime.tv_nsec = VNOVAL;
656	vap->va_flags = VNOVAL;
657	vap->va_gen = VNOVAL;
658	vap->va_vaflags = 0;
659}
660
661/*
662 * This routine is called when we have too many vnodes.  It attempts
663 * to free <count> vnodes and will potentially free vnodes that still
664 * have VM backing store (VM backing store is typically the cause
665 * of a vnode blowout so we want to do this).  Therefore, this operation
666 * is not considered cheap.
667 *
668 * A number of conditions may prevent a vnode from being reclaimed.
669 * the buffer cache may have references on the vnode, a directory
670 * vnode may still have references due to the namei cache representing
671 * underlying files, or the vnode may be in active use.   It is not
672 * desireable to reuse such vnodes.  These conditions may cause the
673 * number of vnodes to reach some minimum value regardless of what
674 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
675 */
676static int
677vlrureclaim(struct mount *mp)
678{
679	struct vnode *vp;
680	int done;
681	int trigger;
682	int usevnodes;
683	int count;
684
685	/*
686	 * Calculate the trigger point, don't allow user
687	 * screwups to blow us up.   This prevents us from
688	 * recycling vnodes with lots of resident pages.  We
689	 * aren't trying to free memory, we are trying to
690	 * free vnodes.
691	 */
692	usevnodes = desiredvnodes;
693	if (usevnodes <= 0)
694		usevnodes = 1;
695	trigger = cnt.v_page_count * 2 / usevnodes;
696
697	done = 0;
698	mtx_lock(&mntvnode_mtx);
699	count = mp->mnt_nvnodelistsize / 10 + 1;
700	while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
701		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
702		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
703
704		if (vp->v_type != VNON &&
705		    vp->v_type != VBAD &&
706		    VI_TRYLOCK(vp)) {
707			if (VMIGHTFREE(vp) &&           /* critical path opt */
708			    (vp->v_object == NULL ||
709			    vp->v_object->resident_page_count < trigger)) {
710				mtx_unlock(&mntvnode_mtx);
711				vgonel(vp, curthread);
712				done++;
713				mtx_lock(&mntvnode_mtx);
714			} else
715				VI_UNLOCK(vp);
716		}
717		--count;
718	}
719	mtx_unlock(&mntvnode_mtx);
720	return done;
721}
722
723/*
724 * Attempt to recycle vnodes in a context that is always safe to block.
725 * Calling vlrurecycle() from the bowels of filesystem code has some
726 * interesting deadlock problems.
727 */
728static struct proc *vnlruproc;
729static int vnlruproc_sig;
730
731static void
732vnlru_proc(void)
733{
734	struct mount *mp, *nmp;
735	int s;
736	int done;
737	struct proc *p = vnlruproc;
738	struct thread *td = FIRST_THREAD_IN_PROC(p);	/* XXXKSE */
739
740	mtx_lock(&Giant);
741
742	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
743	    SHUTDOWN_PRI_FIRST);
744
745	s = splbio();
746	for (;;) {
747		kthread_suspend_check(p);
748		mtx_lock(&vnode_free_list_mtx);
749		if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
750			mtx_unlock(&vnode_free_list_mtx);
751			vnlruproc_sig = 0;
752			wakeup(&vnlruproc_sig);
753			tsleep(vnlruproc, PVFS, "vlruwt", hz);
754			continue;
755		}
756		mtx_unlock(&vnode_free_list_mtx);
757		done = 0;
758		mtx_lock(&mountlist_mtx);
759		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
760			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
761				nmp = TAILQ_NEXT(mp, mnt_list);
762				continue;
763			}
764			done += vlrureclaim(mp);
765			mtx_lock(&mountlist_mtx);
766			nmp = TAILQ_NEXT(mp, mnt_list);
767			vfs_unbusy(mp, td);
768		}
769		mtx_unlock(&mountlist_mtx);
770		if (done == 0) {
771#if 0
772			/* These messages are temporary debugging aids */
773			if (vnlru_nowhere < 5)
774				printf("vnlru process getting nowhere..\n");
775			else if (vnlru_nowhere == 5)
776				printf("vnlru process messages stopped.\n");
777#endif
778			vnlru_nowhere++;
779			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
780		}
781	}
782	splx(s);
783}
784
/*
 * Descriptor handed to kproc_start() by the SYSINIT below.  It carries the
 * string "vnlru", the vnlru_proc() entry point and the address of
 * vnlruproc.
 * NOTE(review): presumably kproc_start() stores the new process pointer
 * through the last member -- confirm against struct kproc_desc.
 */
static struct kproc_desc vnlru_kp = {
	"vnlru",
	vnlru_proc,
	&vnlruproc
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
791
792
793/*
794 * Routines having to do with the management of the vnode table.
795 */
796
797/*
798 * Check to see if a free vnode can be recycled. If it can,
799 * return it locked with the vn lock, but not interlock. Also
800 * get the vn_start_write lock. Otherwise indicate the error.
801 */
802static int
803vcanrecycle(struct vnode *vp, struct mount **vnmpp)
804{
805	struct thread *td = curthread;
806	vm_object_t object;
807	int error;
808
809	/* Don't recycle if we can't get the interlock */
810	if (!VI_TRYLOCK(vp))
811		return (EWOULDBLOCK);
812
813	/* We should be able to immediately acquire this */
814	/* XXX This looks like it should panic if it fails */
815	if (vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td) != 0) {
816		if (VOP_ISLOCKED(vp, td))
817			panic("vcanrecycle: locked vnode");
818		return (EWOULDBLOCK);
819	}
820
821	/*
822	 * Don't recycle if its filesystem is being suspended.
823	 */
824	if (vn_start_write(vp, vnmpp, V_NOWAIT) != 0) {
825		error = EBUSY;
826		goto done;
827	}
828
829	/*
830	 * Don't recycle if we still have cached pages.
831	 */
832	if (VOP_GETVOBJECT(vp, &object) == 0 &&
833	     (object->resident_page_count ||
834	      object->ref_count)) {
835		error = EBUSY;
836		goto done;
837	}
838	if (LIST_FIRST(&vp->v_cache_src)) {
839		/*
840		 * note: nameileafonly sysctl is temporary,
841		 * for debugging only, and will eventually be
842		 * removed.
843		 */
844		if (nameileafonly > 0) {
845			/*
846			 * Do not reuse namei-cached directory
847			 * vnodes that have cached
848			 * subdirectories.
849			 */
850			if (cache_leaf_test(vp) < 0) {
851				error = EISDIR;
852				goto done;
853			}
854		} else if (nameileafonly < 0 ||
855			    vmiodirenable == 0) {
856			/*
857			 * Do not reuse namei-cached directory
858			 * vnodes if nameileafonly is -1 or
859			 * if VMIO backing for directories is
860			 * turned off (otherwise we reuse them
861			 * too quickly).
862			 */
863			error = EBUSY;
864			goto done;
865		}
866	}
867	return (0);
868done:
869	VOP_UNLOCK(vp, 0, td);
870	return (error);
871}
872
873/*
874 * Return the next vnode from the free list.
875 */
876int
877getnewvnode(tag, mp, vops, vpp)
878	const char *tag;
879	struct mount *mp;
880	vop_t **vops;
881	struct vnode **vpp;
882{
883	int s;
884	struct thread *td = curthread;	/* XXX */
885	struct vnode *vp = NULL;
886	struct vpollinfo *pollinfo = NULL;
887	struct mount *vnmp;
888
889	s = splbio();
890	mtx_lock(&vnode_free_list_mtx);
891
892	/*
893	 * Try to reuse vnodes if we hit the max.  This situation only
894	 * occurs in certain large-memory (2G+) situations.  We cannot
895	 * attempt to directly reclaim vnodes due to nasty recursion
896	 * problems.
897	 */
898	while (numvnodes - freevnodes > desiredvnodes) {
899		if (vnlruproc_sig == 0) {
900			vnlruproc_sig = 1;      /* avoid unnecessary wakeups */
901			wakeup(vnlruproc);
902		}
903		mtx_unlock(&vnode_free_list_mtx);
904		tsleep(&vnlruproc_sig, PVFS, "vlruwk", hz);
905		mtx_lock(&vnode_free_list_mtx);
906	}
907
908	/*
909	 * Attempt to reuse a vnode already on the free list, allocating
910	 * a new vnode if we can't find one or if we have not reached a
911	 * good minimum for good LRU performance.
912	 */
913
914	if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
915		int error;
916		int count;
917
918		for (count = 0; count < freevnodes; count++) {
919			vp = TAILQ_FIRST(&vnode_free_list);
920
921			KASSERT(vp->v_usecount == 0 &&
922			    (vp->v_iflag & VI_DOINGINACT) == 0,
923			    ("getnewvnode: free vnode isn't"));
924
925			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
926			/*
927			 * We have to drop the free list mtx to avoid lock
928			 * order reversals with interlock.
929			 */
930			mtx_unlock(&vnode_free_list_mtx);
931			error = vcanrecycle(vp, &vnmp);
932			mtx_lock(&vnode_free_list_mtx);
933			if (error == 0)
934				break;
935			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
936			vp = NULL;
937		}
938	}
939	if (vp) {
940		freevnodes--;
941		mtx_unlock(&vnode_free_list_mtx);
942
943		cache_purge(vp);
944		VI_LOCK(vp);
945		vp->v_iflag |= VI_DOOMED;
946		vp->v_iflag &= ~VI_FREE;
947		if (vp->v_type != VBAD) {
948			VOP_UNLOCK(vp, 0, td);
949			vgonel(vp, td);
950			VI_LOCK(vp);
951		} else {
952			VOP_UNLOCK(vp, 0, td);
953		}
954		vn_finished_write(vnmp);
955
956#ifdef INVARIANTS
957		{
958			if (vp->v_data)
959				panic("cleaned vnode isn't");
960			if (vp->v_numoutput)
961				panic("Clean vnode has pending I/O's");
962			if (vp->v_writecount != 0)
963				panic("Non-zero write count");
964		}
965#endif
966		if ((pollinfo = vp->v_pollinfo) != NULL) {
967			/*
968			 * To avoid lock order reversals, the call to
969			 * uma_zfree() must be delayed until the vnode
970			 * interlock is released.
971			 */
972			vp->v_pollinfo = NULL;
973		}
974#ifdef MAC
975		mac_destroy_vnode(vp);
976#endif
977		vp->v_iflag = 0;
978		vp->v_vflag = 0;
979		vp->v_lastw = 0;
980		vp->v_lasta = 0;
981		vp->v_cstart = 0;
982		vp->v_clen = 0;
983		vp->v_socket = 0;
984		lockdestroy(vp->v_vnlock);
985		lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOPAUSE);
986		KASSERT(vp->v_cleanbufcnt == 0, ("cleanbufcnt not 0"));
987		KASSERT(vp->v_cleanblkroot == NULL, ("cleanblkroot not NULL"));
988		KASSERT(vp->v_dirtybufcnt == 0, ("dirtybufcnt not 0"));
989		KASSERT(vp->v_dirtyblkroot == NULL, ("dirtyblkroot not NULL"));
990	} else {
991		numvnodes++;
992		mtx_unlock(&vnode_free_list_mtx);
993
994		vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
995		mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
996		VI_LOCK(vp);
997		vp->v_dd = vp;
998		vp->v_vnlock = &vp->v_lock;
999		lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOPAUSE);
1000		cache_purge(vp);
1001		LIST_INIT(&vp->v_cache_src);
1002		TAILQ_INIT(&vp->v_cache_dst);
1003	}
1004
1005	TAILQ_INIT(&vp->v_cleanblkhd);
1006	TAILQ_INIT(&vp->v_dirtyblkhd);
1007	vp->v_type = VNON;
1008	vp->v_tag = tag;
1009	vp->v_op = vops;
1010	*vpp = vp;
1011	vp->v_usecount = 1;
1012	vp->v_data = 0;
1013	vp->v_cachedid = -1;
1014	VI_UNLOCK(vp);
1015	if (pollinfo != NULL) {
1016		mtx_destroy(&pollinfo->vpi_lock);
1017		uma_zfree(vnodepoll_zone, pollinfo);
1018	}
1019#ifdef MAC
1020	mac_init_vnode(vp);
1021	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
1022		mac_associate_vnode_singlelabel(mp, vp);
1023#endif
1024	insmntque(vp, mp);
1025
1026	return (0);
1027}
1028
1029/*
1030 * Move a vnode from one mount queue to another.
1031 */
1032static void
1033insmntque(vp, mp)
1034	register struct vnode *vp;
1035	register struct mount *mp;
1036{
1037
1038	mtx_lock(&mntvnode_mtx);
1039	/*
1040	 * Delete from old mount point vnode list, if on one.
1041	 */
1042	if (vp->v_mount != NULL) {
1043		KASSERT(vp->v_mount->mnt_nvnodelistsize > 0,
1044			("bad mount point vnode list size"));
1045		TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes);
1046		vp->v_mount->mnt_nvnodelistsize--;
1047	}
1048	/*
1049	 * Insert into list of vnodes for the new mount point, if available.
1050	 */
1051	if ((vp->v_mount = mp) == NULL) {
1052		mtx_unlock(&mntvnode_mtx);
1053		return;
1054	}
1055	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1056	mp->mnt_nvnodelistsize++;
1057	mtx_unlock(&mntvnode_mtx);
1058}
1059
1060/*
1061 * Update outstanding I/O count and do wakeup if requested.
1062 */
1063void
1064vwakeup(bp)
1065	register struct buf *bp;
1066{
1067	register struct vnode *vp;
1068
1069	bp->b_flags &= ~B_WRITEINPROG;
1070	if ((vp = bp->b_vp)) {
1071		VI_LOCK(vp);
1072		vp->v_numoutput--;
1073		if (vp->v_numoutput < 0)
1074			panic("vwakeup: neg numoutput");
1075		if ((vp->v_numoutput == 0) && (vp->v_iflag & VI_BWAIT)) {
1076			vp->v_iflag &= ~VI_BWAIT;
1077			wakeup(&vp->v_numoutput);
1078		}
1079		VI_UNLOCK(vp);
1080	}
1081}
1082
/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 *
 * vp is the vnode whose clean and dirty buffer lists are emptied.
 * flags may contain V_SAVE, in which case pending output is waited for
 * and dirty buffers are written with VOP_FSYNC() before anything is
 * discarded; flags is also handed to flushbuflist(), which looks at
 * V_SAVE, V_NORMAL and V_ALT.  cred and td are passed through to
 * VOP_FSYNC().  slpflag and slptimeo are the additional sleep flags and
 * the timeout used for the V_SAVE wait and for buffer locking in
 * flushbuflist().
 *
 * Returns 0 on success, otherwise the error reported by msleep(),
 * VOP_FSYNC() or flushbuflist().  Giant must be held.
 */
int
vinvalbuf(vp, flags, cred, td, slpflag, slptimeo)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct thread *td;
	int slpflag, slptimeo;
{
	struct buf *blist;
	int s, error;
	vm_object_t object;

	GIANT_REQUIRED;

	ASSERT_VOP_LOCKED(vp, "vinvalbuf");

	VI_LOCK(vp);
	if (flags & V_SAVE) {
		s = splbio();
		/* Let all writes already in flight drain first. */
		while (vp->v_numoutput) {
			vp->v_iflag |= VI_BWAIT;
			error = msleep(&vp->v_numoutput, VI_MTX(vp),
			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
			if (error) {
				VI_UNLOCK(vp);
				splx(s);
				return (error);
			}
		}
		/* Push out whatever is still dirty. */
		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
			splx(s);
			VI_UNLOCK(vp);
			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, td)) != 0)
				return (error);
			/*
			 * XXX We could save a lock/unlock if this was only
			 * enabled under INVARIANTS
			 */
			VI_LOCK(vp);
			s = splbio();
			/* A synchronous fsync must leave nothing dirty behind. */
			if (vp->v_numoutput > 0 ||
			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
				panic("vinvalbuf: dirty bufs");
		}
		splx(s);
	}
	s = splbio();
	/*
	 * If you alter this loop please notice that interlock is dropped and
	 * reacquired in flushbuflist.  Special care is needed to ensure that
	 * no race conditions occur from this.
	 *
	 * Each pass restarts from the head of the clean list and then the
	 * dirty list; the loop ends when flushbuflist() reports an error or
	 * finds nothing left to process on either list.
	 */
	for (error = 0;;) {
		if ((blist = TAILQ_FIRST(&vp->v_cleanblkhd)) != 0 &&
		    flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
			if (error)
				break;
			continue;
		}
		if ((blist = TAILQ_FIRST(&vp->v_dirtyblkhd)) != 0 &&
		    flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
			if (error)
				break;
			continue;
		}
		break;
	}
	if (error) {
		splx(s);
		VI_UNLOCK(vp);
		return (error);
	}

	/*
	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
	 * have write I/O in-progress but if there is a VM object then the
	 * VM object can also have read-I/O in-progress.
	 */
	do {
		while (vp->v_numoutput > 0) {
			vp->v_iflag |= VI_BWAIT;
			msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vnvlbv", 0);
		}
		/* The interlock is released while waiting on the VM object. */
		VI_UNLOCK(vp);
		if (VOP_GETVOBJECT(vp, &object) == 0) {
			VM_OBJECT_LOCK(object);
			vm_object_pip_wait(object, "vnvlbx");
			VM_OBJECT_UNLOCK(object);
		}
		VI_LOCK(vp);
		/* New output may have started meanwhile; if so, go around again. */
	} while (vp->v_numoutput > 0);
	VI_UNLOCK(vp);

	splx(s);

	/*
	 * Destroy the copy in the VM cache, too.
	 *
	 * NOTE(review): the last argument is TRUE only when V_SAVE was
	 * requested; presumably it restricts removal to clean pages --
	 * confirm against vm_object_page_remove().
	 */
	if (VOP_GETVOBJECT(vp, &object) == 0) {
		VM_OBJECT_LOCK(object);
		vm_object_page_remove(object, 0, 0,
			(flags & V_SAVE) ? TRUE : FALSE);
		VM_OBJECT_UNLOCK(object);
	}

#ifdef INVARIANTS
	/*
	 * Unless the caller limited the flush with V_ALT or V_NORMAL, both
	 * buffer lists must be empty by now.
	 */
	VI_LOCK(vp);
	if ((flags & (V_ALT | V_NORMAL)) == 0 &&
	    (!TAILQ_EMPTY(&vp->v_dirtyblkhd) ||
	     !TAILQ_EMPTY(&vp->v_cleanblkhd)))
		panic("vinvalbuf: flush failed");
	VI_UNLOCK(vp);
#endif
	return (0);
}
1202
/*
 * Flush out buffers on the specified list.
 *
 * Walks the b_vnbufs chain starting at blist.  With V_NORMAL, buffers
 * marked BX_ALTDATA are skipped; with V_ALT, buffers not marked
 * BX_ALTDATA are skipped.  Every other buffer is locked and then either
 * written (delayed-write, not invalid, and V_SAVE requested), after which
 * the walk stops, or invalidated and released.
 *
 * The vnode interlock must be held on entry and is held again on return;
 * it is handed to BUF_TIMELOCK() via LK_INTERLOCK and re-taken with
 * VI_LOCK() afterwards.
 *
 * Returns the number of buffers that were not skipped.  If locking a
 * buffer fails with anything other than ENOLCK, that error is stored in
 * *errorp; *errorp is otherwise left untouched.
 */
static int
flushbuflist(blist, flags, vp, slpflag, slptimeo, errorp)
	struct buf *blist;
	int flags;
	struct vnode *vp;
	int slpflag, slptimeo;
	int *errorp;
{
	struct buf *bp, *nbp;
	int found, error;

	ASSERT_VI_LOCKED(vp, "flushbuflist");

	for (found = 0, bp = blist; bp; bp = nbp) {
		/* Fetch the successor before bp can be taken off the list. */
		nbp = TAILQ_NEXT(bp, b_vnbufs);
		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
			continue;
		}
		found += 1;
		error = BUF_TIMELOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp),
		    "flushbuf", slpflag, slptimeo);
		if (error) {
			/* ENOLCK is not reported; the caller simply rescans. */
			if (error != ENOLCK)
				*errorp = error;
			goto done;
		}
		/*
		 * XXX Since there are no node locks for NFS, I
		 * believe there is a slight chance that a delayed
		 * write will occur while sleeping just above, so
		 * check for it.  Note that vfs_bio_awrite expects
		 * buffers to reside on a queue, while BUF_WRITE and
		 * brelse do not.
		 */
		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
			(flags & V_SAVE)) {

			if (bp->b_vp == vp) {
				if (bp->b_flags & B_CLUSTEROK) {
					vfs_bio_awrite(bp);
				} else {
					bremfree(bp);
					bp->b_flags |= B_ASYNC;
					BUF_WRITE(bp);
				}
			} else {
				bremfree(bp);
				(void) BUF_WRITE(bp);
			}
			/* Stop after issuing one write; the caller starts over. */
			goto done;
		}
		/* Nothing worth saving: invalidate the buffer and release it. */
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
		bp->b_flags &= ~B_ASYNC;
		brelse(bp);
		VI_LOCK(vp);
	}
	return (found);
done:
	/* Re-take the interlock so that it is held on every return path. */
	VI_LOCK(vp);
	return (found);
}
1271
1272/*
1273 * Truncate a file's buffer and pages to a specified length.  This
1274 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1275 * sync activity.
1276 */
1277int
1278vtruncbuf(vp, cred, td, length, blksize)
1279	register struct vnode *vp;
1280	struct ucred *cred;
1281	struct thread *td;
1282	off_t length;
1283	int blksize;
1284{
1285	register struct buf *bp;
1286	struct buf *nbp;
1287	int s, anyfreed;
1288	int trunclbn;
1289
1290	/*
1291	 * Round up to the *next* lbn.
1292	 */
1293	trunclbn = (length + blksize - 1) / blksize;
1294
1295	s = splbio();
1296	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1297restart:
1298	VI_LOCK(vp);
1299	anyfreed = 1;
1300	for (;anyfreed;) {
1301		anyfreed = 0;
1302		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
1303			nbp = TAILQ_NEXT(bp, b_vnbufs);
1304			if (bp->b_lblkno >= trunclbn) {
1305				if (BUF_LOCK(bp,
1306				    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1307				    VI_MTX(vp)) == ENOLCK)
1308					goto restart;
1309
1310				bremfree(bp);
1311				bp->b_flags |= (B_INVAL | B_RELBUF);
1312				bp->b_flags &= ~B_ASYNC;
1313				brelse(bp);
1314				anyfreed = 1;
1315
1316				if (nbp &&
1317				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1318				    (nbp->b_vp != vp) ||
1319				    (nbp->b_flags & B_DELWRI))) {
1320					goto restart;
1321				}
1322				VI_LOCK(vp);
1323			}
1324		}
1325
1326		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1327			nbp = TAILQ_NEXT(bp, b_vnbufs);
1328			if (bp->b_lblkno >= trunclbn) {
1329				if (BUF_LOCK(bp,
1330				    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1331				    VI_MTX(vp)) == ENOLCK)
1332					goto restart;
1333				bremfree(bp);
1334				bp->b_flags |= (B_INVAL | B_RELBUF);
1335				bp->b_flags &= ~B_ASYNC;
1336				brelse(bp);
1337				anyfreed = 1;
1338				if (nbp &&
1339				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1340				    (nbp->b_vp != vp) ||
1341				    (nbp->b_flags & B_DELWRI) == 0)) {
1342					goto restart;
1343				}
1344				VI_LOCK(vp);
1345			}
1346		}
1347	}
1348
1349	if (length > 0) {
1350restartsync:
1351		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1352			nbp = TAILQ_NEXT(bp, b_vnbufs);
1353			if (bp->b_lblkno > 0)
1354				continue;
1355			/*
1356			 * Since we hold the vnode lock this should only
1357			 * fail if we're racing with the buf daemon.
1358			 */
1359			if (BUF_LOCK(bp,
1360			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1361			    VI_MTX(vp)) == ENOLCK) {
1362				goto restart;
1363			}
1364			KASSERT((bp->b_flags & B_DELWRI),
1365			    ("buf(%p) on dirty queue without DELWRI.", bp));
1366
1367			bremfree(bp);
1368			bawrite(bp);
1369			VI_LOCK(vp);
1370			goto restartsync;
1371		}
1372	}
1373
1374	while (vp->v_numoutput > 0) {
1375		vp->v_iflag |= VI_BWAIT;
1376		msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vbtrunc", 0);
1377	}
1378	VI_UNLOCK(vp);
1379	splx(s);
1380
1381	vnode_pager_setsize(vp, length);
1382
1383	return (0);
1384}
1385
/*
 * buf_splay() - splay tree core for the clean/dirty list of buffers in
 * 		 a vnode.
 *
 *	NOTE: We have to deal with the special case of a background bitmap
 *	buffer, a situation where two buffers will have the same logical
 *	block offset.  We want (1) only the foreground buffer to be accessed
 *	in a lookup and (2) must differentiate between the foreground and
 *	background buffer in the splay tree algorithm because the splay
 *	tree cannot normally handle multiple entities with the same 'index'.
 *	We accomplish this by adding differentiating flags to the splay tree's
 *	numerical domain.
 *
 *	lblkno and the BX_BKGRDMARKER bit of xflags form the search key;
 *	root is the current root of the tree (may be NULL).  Returns NULL
 *	for an empty tree, otherwise the new root of the restructured tree:
 *	either the node whose key matches or the last node reached on the
 *	search path.  The caller is responsible for storing the returned
 *	root wherever the tree root is kept.
 */
static
struct buf *
buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
{
	/*
	 * dummy is a scratch header: its b_right collects the tree of
	 * smaller keys and its b_left the tree of larger keys while the
	 * search descends.
	 */
	struct buf dummy;
	struct buf *lefttreemax, *righttreemin, *y;

	if (root == NULL)
		return (NULL);
	lefttreemax = righttreemin = &dummy;
	for (;;) {
		/* Key sorts before root: descend to the left. */
		if (lblkno < root->b_lblkno ||
		    (lblkno == root->b_lblkno &&
		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
			if ((y = root->b_left) == NULL)
				break;
			if (lblkno < y->b_lblkno) {
				/* Rotate right. */
				root->b_left = y->b_right;
				y->b_right = root;
				root = y;
				if ((y = root->b_left) == NULL)
					break;
			}
			/* Link into the new root's right tree. */
			righttreemin->b_left = root;
			righttreemin = root;
		/* Key sorts after root: descend to the right. */
		} else if (lblkno > root->b_lblkno ||
		    (lblkno == root->b_lblkno &&
		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
			if ((y = root->b_right) == NULL)
				break;
			if (lblkno > y->b_lblkno) {
				/* Rotate left. */
				root->b_right = y->b_left;
				y->b_left = root;
				root = y;
				if ((y = root->b_right) == NULL)
					break;
			}
			/* Link into the new root's left tree. */
			lefttreemax->b_right = root;
			lefttreemax = root;
		} else {
			/* Exact match on both block number and marker bit. */
			break;
		}
		root = y;
	}
	/* Assemble the new root. */
	lefttreemax->b_right = root->b_left;
	righttreemin->b_left = root->b_right;
	root->b_left = dummy.b_right;
	root->b_right = dummy.b_left;
	return (root);
}
1454
1455static
1456void
1457buf_vlist_remove(struct buf *bp)
1458{
1459	struct vnode *vp = bp->b_vp;
1460	struct buf *root;
1461
1462	ASSERT_VI_LOCKED(vp, "buf_vlist_remove");
1463	if (bp->b_xflags & BX_VNDIRTY) {
1464		if (bp != vp->v_dirtyblkroot) {
1465			root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_dirtyblkroot);
1466			KASSERT(root == bp, ("splay lookup failed during dirty remove"));
1467		}
1468		if (bp->b_left == NULL) {
1469			root = bp->b_right;
1470		} else {
1471			root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
1472			root->b_right = bp->b_right;
1473		}
1474		vp->v_dirtyblkroot = root;
1475		TAILQ_REMOVE(&vp->v_dirtyblkhd, bp, b_vnbufs);
1476		vp->v_dirtybufcnt--;
1477	} else {
1478		/* KASSERT(bp->b_xflags & BX_VNCLEAN, ("bp wasn't clean")); */
1479		if (bp != vp->v_cleanblkroot) {
1480			root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_cleanblkroot);
1481			KASSERT(root == bp, ("splay lookup failed during clean remove"));
1482		}
1483		if (bp->b_left == NULL) {
1484			root = bp->b_right;
1485		} else {
1486			root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
1487			root->b_right = bp->b_right;
1488		}
1489		vp->v_cleanblkroot = root;
1490		TAILQ_REMOVE(&vp->v_cleanblkhd, bp, b_vnbufs);
1491		vp->v_cleanbufcnt--;
1492	}
1493	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1494}
1495
1496/*
1497 * Add the buffer to the sorted clean or dirty block list using a
1498 * splay tree algorithm.
1499 *
1500 * NOTE: xflags is passed as a constant, optimizing this inline function!
1501 */
1502static
1503void
1504buf_vlist_add(struct buf *bp, struct vnode *vp, b_xflags_t xflags)
1505{
1506	struct buf *root;
1507
1508	ASSERT_VI_LOCKED(vp, "buf_vlist_add");
1509	bp->b_xflags |= xflags;
1510	if (xflags & BX_VNDIRTY) {
1511		root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_dirtyblkroot);
1512		if (root == NULL) {
1513			bp->b_left = NULL;
1514			bp->b_right = NULL;
1515			TAILQ_INSERT_TAIL(&vp->v_dirtyblkhd, bp, b_vnbufs);
1516		} else if (bp->b_lblkno < root->b_lblkno ||
1517		    (bp->b_lblkno == root->b_lblkno &&
1518		    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1519			bp->b_left = root->b_left;
1520			bp->b_right = root;
1521			root->b_left = NULL;
1522			TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
1523		} else {
1524			bp->b_right = root->b_right;
1525			bp->b_left = root;
1526			root->b_right = NULL;
1527			TAILQ_INSERT_AFTER(&vp->v_dirtyblkhd,
1528			    root, bp, b_vnbufs);
1529		}
1530		vp->v_dirtybufcnt++;
1531		vp->v_dirtyblkroot = bp;
1532	} else {
1533		/* KASSERT(xflags & BX_VNCLEAN, ("xflags not clean")); */
1534		root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_cleanblkroot);
1535		if (root == NULL) {
1536			bp->b_left = NULL;
1537			bp->b_right = NULL;
1538			TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
1539		} else if (bp->b_lblkno < root->b_lblkno ||
1540		    (bp->b_lblkno == root->b_lblkno &&
1541		    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1542			bp->b_left = root->b_left;
1543			bp->b_right = root;
1544			root->b_left = NULL;
1545			TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
1546		} else {
1547			bp->b_right = root->b_right;
1548			bp->b_left = root;
1549			root->b_right = NULL;
1550			TAILQ_INSERT_AFTER(&vp->v_cleanblkhd,
1551			    root, bp, b_vnbufs);
1552		}
1553		vp->v_cleanbufcnt++;
1554		vp->v_cleanblkroot = bp;
1555	}
1556}
1557
1558/*
1559 * Lookup a buffer using the splay tree.  Note that we specifically avoid
1560 * shadow buffers used in background bitmap writes.
1561 *
1562 * This code isn't quite efficient as it could be because we are maintaining
1563 * two sorted lists and do not know which list the block resides in.
1564 */
1565struct buf *
1566gbincore(struct vnode *vp, daddr_t lblkno)
1567{
1568	struct buf *bp;
1569
1570	GIANT_REQUIRED;
1571
1572	ASSERT_VI_LOCKED(vp, "gbincore");
1573	bp = vp->v_cleanblkroot = buf_splay(lblkno, 0, vp->v_cleanblkroot);
1574	if (bp && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1575		return(bp);
1576	bp = vp->v_dirtyblkroot = buf_splay(lblkno, 0, vp->v_dirtyblkroot);
1577	if (bp && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1578		return(bp);
1579	return(NULL);
1580}
1581
1582/*
1583 * Associate a buffer with a vnode.
1584 */
1585void
1586bgetvp(vp, bp)
1587	register struct vnode *vp;
1588	register struct buf *bp;
1589{
1590	int s;
1591
1592	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
1593
1594	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1595	    ("bgetvp: bp already attached! %p", bp));
1596
1597	ASSERT_VI_LOCKED(vp, "bgetvp");
1598	vholdl(vp);
1599	bp->b_vp = vp;
1600	bp->b_dev = vn_todev(vp);
1601	/*
1602	 * Insert onto list for new vnode.
1603	 */
1604	s = splbio();
1605	buf_vlist_add(bp, vp, BX_VNCLEAN);
1606	splx(s);
1607}
1608
1609/*
1610 * Disassociate a buffer from a vnode.
1611 */
1612void
1613brelvp(bp)
1614	register struct buf *bp;
1615{
1616	struct vnode *vp;
1617	int s;
1618
1619	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1620
1621	/*
1622	 * Delete from old vnode list, if on one.
1623	 */
1624	vp = bp->b_vp;
1625	s = splbio();
1626	VI_LOCK(vp);
1627	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1628		buf_vlist_remove(bp);
1629	if ((vp->v_iflag & VI_ONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
1630		vp->v_iflag &= ~VI_ONWORKLST;
1631		mtx_lock(&sync_mtx);
1632		LIST_REMOVE(vp, v_synclist);
1633		mtx_unlock(&sync_mtx);
1634	}
1635	vdropl(vp);
1636	VI_UNLOCK(vp);
1637	bp->b_vp = (struct vnode *) 0;
1638	if (bp->b_object)
1639		bp->b_object = NULL;
1640	splx(s);
1641}
1642
1643/*
1644 * Add an item to the syncer work queue.
1645 */
1646static void
1647vn_syncer_add_to_worklist(struct vnode *vp, int delay)
1648{
1649	int s, slot;
1650
1651	s = splbio();
1652	ASSERT_VI_LOCKED(vp, "vn_syncer_add_to_worklist");
1653
1654	mtx_lock(&sync_mtx);
1655	if (vp->v_iflag & VI_ONWORKLST)
1656		LIST_REMOVE(vp, v_synclist);
1657	else
1658		vp->v_iflag |= VI_ONWORKLST;
1659
1660	if (delay > syncer_maxdelay - 2)
1661		delay = syncer_maxdelay - 2;
1662	slot = (syncer_delayno + delay) & syncer_mask;
1663
1664	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
1665	mtx_unlock(&sync_mtx);
1666
1667	splx(s);
1668}
1669
/*
 * Process pointer of the syncer kernel process.  sched_sync() and
 * speedup_syncer() look up the syncer's thread through it.
 */
struct  proc *updateproc;
static void sched_sync(void);
/*
 * Description of the syncer kernel process: its name ("syncer"), its
 * entry point (sched_sync) and the address of updateproc -- presumably
 * where kproc_start() stores the new process pointer; confirm against
 * struct kproc_desc.
 */
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
/* Have kproc_start() run with up_kp during SI_SUB_KTHREAD_UPDATE. */
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
1678
/*
 * System filesystem synchronizer daemon.
 *
 * Main loop of the syncer process; it never returns.  Each iteration
 * takes the worklist slot indexed by syncer_delayno, advances that index
 * (wrapping at syncer_maxdelay), lazily fsyncs every vnode found on the
 * slot, runs the soft updates hook if one is registered, and then either
 * continues immediately (rushjob pending, or the pass did not finish
 * within the same second it started) or sleeps on lbolt.  Giant is
 * acquired once at startup and held across the loop.
 */
static void
sched_sync(void)
{
	struct synclist *slp;
	struct vnode *vp;
	struct mount *mp;
	long starttime;
	int s;
	struct thread *td = FIRST_THREAD_IN_PROC(updateproc);  /* XXXKSE */

	mtx_lock(&Giant);

	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, td->td_proc,
	    SHUTDOWN_PRI_LAST);

	for (;;) {
		kthread_suspend_check(td->td_proc);

		starttime = time_second;

		/*
		 * Push files whose dirty time has expired.  Be careful
		 * of interrupt race on slp queue.
		 */
		s = splbio();
		mtx_lock(&sync_mtx);
		slp = &syncer_workitem_pending[syncer_delayno];
		syncer_delayno += 1;
		if (syncer_delayno == syncer_maxdelay)
			syncer_delayno = 0;
		splx(s);

		/* sync_mtx is held at the top of every pass of this loop. */
		while ((vp = LIST_FIRST(slp)) != NULL) {
			mtx_unlock(&sync_mtx);
			/*
			 * Only sync vnodes that nobody has locked and whose
			 * filesystem lets a write start without waiting.
			 */
			if (VOP_ISLOCKED(vp, NULL) == 0 &&
			    vn_start_write(vp, &mp, V_NOWAIT) == 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
				(void) VOP_FSYNC(vp, td->td_ucred, MNT_LAZY, td);
				VOP_UNLOCK(vp, 0, td);
				vn_finished_write(mp);
			}
			s = splbio();
			mtx_lock(&sync_mtx);
			/*
			 * If the vnode is still at the head of this slot it
			 * was not moved or removed above; requeue it so the
			 * loop can make progress.
			 */
			if (LIST_FIRST(slp) == vp) {
				mtx_unlock(&sync_mtx);
				/*
				 * Note: VFS vnodes can remain on the
				 * worklist too with no dirty blocks, but
				 * since sync_fsync() moves it to a different
				 * slot we are safe.
				 */
				VI_LOCK(vp);
				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
				    !vn_isdisk(vp, NULL)) {
					panic("sched_sync: fsync failed "
					      "vp %p tag %s", vp, vp->v_tag);
				}
				/*
				 * Put us back on the worklist.  The worklist
				 * routine will remove us from our current
				 * position and then add us back in at a later
				 * position.
				 */
				vn_syncer_add_to_worklist(vp, syncdelay);
				VI_UNLOCK(vp);
				mtx_lock(&sync_mtx);
			}
			splx(s);
		}
		mtx_unlock(&sync_mtx);

		/*
		 * Do soft update processing.
		 */
		if (softdep_process_worklist_hook != NULL)
			(*softdep_process_worklist_hook)(NULL);

		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process. A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP. Currently rushjob
		 * is used by the soft update code to speed up the filesystem
		 * syncer process when the incore state is getting so far
		 * ahead of the disk that the kernel memory pool is being
		 * threatened with exhaustion.
		 */
		mtx_lock(&sync_mtx);
		if (rushjob > 0) {
			rushjob -= 1;
			mtx_unlock(&sync_mtx);
			continue;
		}
		mtx_unlock(&sync_mtx);
		/*
		 * If it has taken us less than a second to process the
		 * current work, then wait. Otherwise start right over
		 * again. We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (time_second == starttime)
			tsleep(&lbolt, PPAUSE, "syncer", 0);
	}
}
1788
1789/*
1790 * Request the syncer daemon to speed up its work.
1791 * We never push it to speed up more than half of its
1792 * normal turn time, otherwise it could take over the cpu.
1793 * XXXKSE  only one update?
1794 */
1795int
1796speedup_syncer()
1797{
1798	struct thread *td;
1799	int ret = 0;
1800
1801	td = FIRST_THREAD_IN_PROC(updateproc);
1802	mtx_lock_spin(&sched_lock);
1803	if (td->td_wchan == &lbolt) {
1804		unsleep(td);
1805		TD_CLR_SLEEPING(td);
1806		setrunnable(td);
1807	}
1808	mtx_unlock_spin(&sched_lock);
1809	mtx_lock(&sync_mtx);
1810	if (rushjob < syncdelay / 2) {
1811		rushjob += 1;
1812		stat_rush_requests += 1;
1813		ret = 1;
1814	}
1815	mtx_unlock(&sync_mtx);
1816	return (ret);
1817}
1818
1819/*
1820 * Associate a p-buffer with a vnode.
1821 *
1822 * Also sets B_PAGING flag to indicate that vnode is not fully associated
1823 * with the buffer.  i.e. the bp has not been linked into the vnode or
1824 * ref-counted.
1825 */
1826void
1827pbgetvp(vp, bp)
1828	register struct vnode *vp;
1829	register struct buf *bp;
1830{
1831
1832	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1833
1834	bp->b_vp = vp;
1835	bp->b_flags |= B_PAGING;
1836	bp->b_dev = vn_todev(vp);
1837}
1838
1839/*
1840 * Disassociate a p-buffer from a vnode.
1841 */
1842void
1843pbrelvp(bp)
1844	register struct buf *bp;
1845{
1846
1847	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1848
1849	/* XXX REMOVE ME */
1850	VI_LOCK(bp->b_vp);
1851	if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
1852		panic(
1853		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1854		    bp,
1855		    (int)bp->b_flags
1856		);
1857	}
1858	VI_UNLOCK(bp->b_vp);
1859	bp->b_vp = (struct vnode *) 0;
1860	bp->b_flags &= ~B_PAGING;
1861}
1862
1863/*
1864 * Reassign a buffer from one vnode to another.
1865 * Used to assign file specific control information
1866 * (indirect blocks) to the vnode to which they belong.
1867 */
1868void
1869reassignbuf(bp, newvp)
1870	register struct buf *bp;
1871	register struct vnode *newvp;
1872{
1873	int delay;
1874	int s;
1875
1876	if (newvp == NULL) {
1877		printf("reassignbuf: NULL");
1878		return;
1879	}
1880	++reassignbufcalls;
1881
1882	/*
1883	 * B_PAGING flagged buffers cannot be reassigned because their vp
1884	 * is not fully linked in.
1885	 */
1886	if (bp->b_flags & B_PAGING)
1887		panic("cannot reassign paging buffer");
1888
1889	s = splbio();
1890	/*
1891	 * Delete from old vnode list, if on one.
1892	 */
1893	VI_LOCK(bp->b_vp);
1894	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
1895		buf_vlist_remove(bp);
1896		if (bp->b_vp != newvp) {
1897			vdropl(bp->b_vp);
1898			bp->b_vp = NULL;	/* for clarification */
1899		}
1900	}
1901	VI_UNLOCK(bp->b_vp);
1902	/*
1903	 * If dirty, put on list of dirty buffers; otherwise insert onto list
1904	 * of clean buffers.
1905	 */
1906	VI_LOCK(newvp);
1907	if (bp->b_flags & B_DELWRI) {
1908		if ((newvp->v_iflag & VI_ONWORKLST) == 0) {
1909			switch (newvp->v_type) {
1910			case VDIR:
1911				delay = dirdelay;
1912				break;
1913			case VCHR:
1914				if (newvp->v_rdev->si_mountpoint != NULL) {
1915					delay = metadelay;
1916					break;
1917				}
1918				/* FALLTHROUGH */
1919			default:
1920				delay = filedelay;
1921			}
1922			vn_syncer_add_to_worklist(newvp, delay);
1923		}
1924		buf_vlist_add(bp, newvp, BX_VNDIRTY);
1925	} else {
1926		buf_vlist_add(bp, newvp, BX_VNCLEAN);
1927
1928		if ((newvp->v_iflag & VI_ONWORKLST) &&
1929		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1930			mtx_lock(&sync_mtx);
1931			LIST_REMOVE(newvp, v_synclist);
1932			mtx_unlock(&sync_mtx);
1933			newvp->v_iflag &= ~VI_ONWORKLST;
1934		}
1935	}
1936	if (bp->b_vp != newvp) {
1937		bp->b_vp = newvp;
1938		vholdl(bp->b_vp);
1939	}
1940	VI_UNLOCK(newvp);
1941	splx(s);
1942}
1943
1944/*
1945 * Create a vnode for a device.
1946 * Used for mounting the root filesystem.
1947 */
1948int
1949bdevvp(dev, vpp)
1950	dev_t dev;
1951	struct vnode **vpp;
1952{
1953	register struct vnode *vp;
1954	struct vnode *nvp;
1955	int error;
1956
1957	if (dev == NODEV) {
1958		*vpp = NULLVP;
1959		return (ENXIO);
1960	}
1961	if (vfinddev(dev, VCHR, vpp))
1962		return (0);
1963	error = getnewvnode("none", (struct mount *)0, spec_vnodeop_p, &nvp);
1964	if (error) {
1965		*vpp = NULLVP;
1966		return (error);
1967	}
1968	vp = nvp;
1969	vp->v_type = VCHR;
1970	addalias(vp, dev);
1971	*vpp = vp;
1972	return (0);
1973}
1974
1975static void
1976v_incr_usecount(struct vnode *vp, int delta)
1977{
1978	vp->v_usecount += delta;
1979	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
1980		mtx_lock(&spechash_mtx);
1981		vp->v_rdev->si_usecount += delta;
1982		mtx_unlock(&spechash_mtx);
1983	}
1984}
1985
/*
 * Add vnode to the alias list hung off the dev_t.
 *
 * The reason for this gunk is that multiple vnodes can reference
 * the same physical device, so checking vp->v_usecount to see
 * how many users there are is inadequate; the v_usecount for
 * the vnodes need to be accumulated.  vcount() does that.
 *
 * nvp is the new vnode and nvp_rdev the user-visible device number it
 * refers to.  Returns the vnode the caller should use from now on: nvp
 * itself (VBLK vnodes are returned untouched; VCHR vnodes are aliased to
 * the device), or a previously existing vnode for the same device that
 * has taken over nvp's filesystem data, in which case nvp has been
 * released and vgone()d.  Panics for any type other than VBLK or VCHR.
 */
struct vnode *
addaliasu(nvp, nvp_rdev)
	struct vnode *nvp;
	udev_t nvp_rdev;
{
	struct vnode *ovp;
	vop_t **ops;
	dev_t dev;

	if (nvp->v_type == VBLK)
		return (nvp);
	if (nvp->v_type != VCHR)
		panic("addaliasu on non-special vnode");
	dev = udev2dev(nvp_rdev, 0);
	/*
	 * Check to see if we have a bdevvp vnode with no associated
	 * filesystem. If so, we want to associate the filesystem of
	 * the newly instantiated vnode with the bdevvp vnode and
	 * discard the newly created vnode rather than leaving the
	 * bdevvp vnode lying around with no associated filesystem.
	 */
	if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
		addalias(nvp, dev);
		return (nvp);
	}
	/*
	 * Discard unneeded vnode, but save its node specific data.
	 * Note that if there is a lock, it is carried over in the
	 * node specific data to the replacement vnode.
	 */
	vref(ovp);
	ovp->v_data = nvp->v_data;
	ovp->v_tag = nvp->v_tag;
	nvp->v_data = NULL;
	/* Re-create ovp's lock with the parameters of nvp's lock. */
	lockdestroy(ovp->v_vnlock);
	lockinit(ovp->v_vnlock, PVFS, nvp->v_vnlock->lk_wmesg,
	    nvp->v_vnlock->lk_timo, nvp->v_vnlock->lk_flags & LK_EXTFLG_MASK);
	/* Swap the operations vectors of the two vnodes. */
	ops = ovp->v_op;
	ovp->v_op = nvp->v_op;
	/* If the caller held nvp locked, move that lock over to ovp. */
	if (VOP_ISLOCKED(nvp, curthread)) {
		VOP_UNLOCK(nvp, 0, curthread);
		vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curthread);
	}
	nvp->v_op = ops;
	insmntque(ovp, nvp->v_mount);
	vrele(nvp);
	vgone(nvp);
	return (ovp);
}
2043
2044/* This is a local helper function that do the same as addaliasu, but for a
2045 * dev_t instead of an udev_t. */
2046static void
2047addalias(nvp, dev)
2048	struct vnode *nvp;
2049	dev_t dev;
2050{
2051
2052	KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
2053	nvp->v_rdev = dev;
2054	VI_LOCK(nvp);
2055	mtx_lock(&spechash_mtx);
2056	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
2057	dev->si_usecount += nvp->v_usecount;
2058	mtx_unlock(&spechash_mtx);
2059	VI_UNLOCK(nvp);
2060}
2061
/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. The vnode lock bit is set if the
 * vnode is being eliminated in vgone. The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new filesystem type).
 *
 * flags may contain LK_INTERLOCK, meaning the caller already holds the
 * vnode interlock, plus an optional lock type (LK_TYPE_MASK) which is
 * passed to vn_lock().  td is the thread on whose behalf the vnode is
 * locked.  Returns 0 on success, ENOENT if the vnode was being cleaned
 * out by another thread, or the error from vn_lock() (in which case the
 * reference taken here has been given back).
 */
int
vget(vp, flags, td)
	register struct vnode *vp;
	int flags;
	struct thread *td;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure. Cleaning is determined by checking that
	 * the VI_XLOCK flag is set.
	 */
	if ((flags & LK_INTERLOCK) == 0)
		VI_LOCK(vp);
	if (vp->v_iflag & VI_XLOCK && vp->v_vxproc != curthread) {
		vp->v_iflag |= VI_XWANT;
		/*
		 * NOTE(review): no VI_UNLOCK follows, so PDROP presumably
		 * leaves the interlock released when msleep() returns --
		 * confirm against msleep().
		 */
		msleep(vp, VI_MTX(vp), PINOD | PDROP, "vget", 0);
		return (ENOENT);
	}

	v_incr_usecount(vp, 1);

	if (VSHOULDBUSY(vp))
		vbusy(vp);
	if (flags & LK_TYPE_MASK) {
		/* The interlock is handed over to vn_lock(). */
		if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active. We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			VI_LOCK(vp);
			v_incr_usecount(vp, -1);
			if (VSHOULDFREE(vp))
				vfree(vp);
			else
				vlruvp(vp);
			VI_UNLOCK(vp);
		}
		return (error);
	}
	/* No lock type requested: just release the interlock. */
	VI_UNLOCK(vp);
	return (0);
}
2119
/*
 * Increase the reference count of a vnode.
 *
 * Takes the vnode interlock around the update, so the caller must not
 * already hold it.
 */
void
vref(struct vnode *vp)
{
	VI_LOCK(vp);
	v_incr_usecount(vp, 1);
	VI_UNLOCK(vp);
}
2130
2131/*
2132 * Return reference count of a vnode.
2133 *
2134 * The results of this call are only guaranteed when some mechanism other
2135 * than the VI lock is used to stop other processes from gaining references
2136 * to the vnode.  This may be the case if the caller holds the only reference.
2137 * This is also useful when stale data is acceptable as race conditions may
2138 * be accounted for by some other means.
2139 */
2140int
2141vrefcnt(struct vnode *vp)
2142{
2143	int usecnt;
2144
2145	VI_LOCK(vp);
2146	usecnt = vp->v_usecount;
2147	VI_UNLOCK(vp);
2148
2149	return (usecnt);
2150}
2151
2152
2153/*
2154 * Vnode put/release.
2155 * If count drops to zero, call inactive routine and return to freelist.
2156 */
2157void
2158vrele(vp)
2159	struct vnode *vp;
2160{
2161	struct thread *td = curthread;	/* XXX */
2162
2163	KASSERT(vp != NULL, ("vrele: null vp"));
2164
2165	VI_LOCK(vp);
2166
2167	/* Skip this v_writecount check if we're going to panic below. */
2168	KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
2169	    ("vrele: missed vn_close"));
2170
2171	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2172	    vp->v_usecount == 1)) {
2173		v_incr_usecount(vp, -1);
2174		VI_UNLOCK(vp);
2175
2176		return;
2177	}
2178
2179	if (vp->v_usecount == 1) {
2180		v_incr_usecount(vp, -1);
2181		/*
2182		 * We must call VOP_INACTIVE with the node locked. Mark
2183		 * as VI_DOINGINACT to avoid recursion.
2184		 */
2185		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) {
2186			VI_LOCK(vp);
2187			vp->v_iflag |= VI_DOINGINACT;
2188			VI_UNLOCK(vp);
2189			VOP_INACTIVE(vp, td);
2190			VI_LOCK(vp);
2191			KASSERT(vp->v_iflag & VI_DOINGINACT,
2192			    ("vrele: lost VI_DOINGINACT"));
2193			vp->v_iflag &= ~VI_DOINGINACT;
2194			VI_UNLOCK(vp);
2195		}
2196		VI_LOCK(vp);
2197		if (VSHOULDFREE(vp))
2198			vfree(vp);
2199		else
2200			vlruvp(vp);
2201		VI_UNLOCK(vp);
2202
2203	} else {
2204#ifdef DIAGNOSTIC
2205		vprint("vrele: negative ref count", vp);
2206#endif
2207		VI_UNLOCK(vp);
2208		panic("vrele: negative ref cnt");
2209	}
2210}
2211
2212/*
2213 * Release an already locked vnode.  This give the same effects as
2214 * unlock+vrele(), but takes less time and avoids releasing and
2215 * re-aquiring the lock (as vrele() aquires the lock internally.)
2216 */
2217void
2218vput(vp)
2219	struct vnode *vp;
2220{
2221	struct thread *td = curthread;	/* XXX */
2222
2223	GIANT_REQUIRED;
2224
2225	KASSERT(vp != NULL, ("vput: null vp"));
2226	VI_LOCK(vp);
2227	/* Skip this v_writecount check if we're going to panic below. */
2228	KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
2229	    ("vput: missed vn_close"));
2230
2231	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2232	    vp->v_usecount == 1)) {
2233		v_incr_usecount(vp, -1);
2234		VOP_UNLOCK(vp, LK_INTERLOCK, td);
2235		return;
2236	}
2237
2238	if (vp->v_usecount == 1) {
2239		v_incr_usecount(vp, -1);
2240		/*
2241		 * We must call VOP_INACTIVE with the node locked, so
2242		 * we just need to release the vnode mutex. Mark as
2243		 * as VI_DOINGINACT to avoid recursion.
2244		 */
2245		vp->v_iflag |= VI_DOINGINACT;
2246		VI_UNLOCK(vp);
2247		VOP_INACTIVE(vp, td);
2248		VI_LOCK(vp);
2249		KASSERT(vp->v_iflag & VI_DOINGINACT,
2250		    ("vput: lost VI_DOINGINACT"));
2251		vp->v_iflag &= ~VI_DOINGINACT;
2252		if (VSHOULDFREE(vp))
2253			vfree(vp);
2254		else
2255			vlruvp(vp);
2256		VI_UNLOCK(vp);
2257
2258	} else {
2259#ifdef DIAGNOSTIC
2260		vprint("vput: negative ref count", vp);
2261#endif
2262		panic("vput: negative ref cnt");
2263	}
2264}
2265
/*
 * Somebody doesn't want the vnode recycled: take a hold reference.
 * Acquires the vnode interlock itself; callers that already own the
 * interlock use vholdl() instead.
 */
void
vhold(struct vnode *vp)
{

	VI_LOCK(vp);
	vholdl(vp);
	VI_UNLOCK(vp);
}
2276
2277void
2278vholdl(vp)
2279	register struct vnode *vp;
2280{
2281	int s;
2282
2283	s = splbio();
2284	vp->v_holdcnt++;
2285	if (VSHOULDBUSY(vp))
2286		vbusy(vp);
2287	splx(s);
2288}
2289
/*
 * Note that there is one less who cares about this vnode.  vdrop() is
 * the opposite of vhold().  Acquires the vnode interlock itself;
 * callers that already own the interlock use vdropl() instead.
 */
void
vdrop(struct vnode *vp)
{

	VI_LOCK(vp);
	vdropl(vp);
	VI_UNLOCK(vp);
}
2301
2302void
2303vdropl(vp)
2304	register struct vnode *vp;
2305{
2306	int s;
2307
2308	s = splbio();
2309	if (vp->v_holdcnt <= 0)
2310		panic("vdrop: holdcnt");
2311	vp->v_holdcnt--;
2312	if (VSHOULDFREE(vp))
2313		vfree(vp);
2314	else
2315		vlruvp(vp);
2316	splx(s);
2317}
2318
2319/*
2320 * Remove any vnodes in the vnode table belonging to mount point mp.
2321 *
2322 * If FORCECLOSE is not specified, there should not be any active ones,
2323 * return error if any are found (nb: this is a user error, not a
2324 * system error). If FORCECLOSE is specified, detach any active vnodes
2325 * that are found.
2326 *
2327 * If WRITECLOSE is set, only flush out regular file vnodes open for
2328 * writing.
2329 *
2330 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2331 *
2332 * `rootrefs' specifies the base reference count for the root vnode
2333 * of this filesystem. The root vnode is considered busy if its
2334 * v_usecount exceeds this value. On a successful return, vflush()
2335 * will call vrele() on the root vnode exactly rootrefs times.
2336 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2337 * be zero.
2338 */
2339#ifdef DIAGNOSTIC
2340static int busyprt = 0;		/* print out busy vnodes */
2341SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
2342#endif
2343
2344int
2345vflush(mp, rootrefs, flags)
2346	struct mount *mp;
2347	int rootrefs;
2348	int flags;
2349{
2350	struct thread *td = curthread;	/* XXX */
2351	struct vnode *vp, *nvp, *rootvp = NULL;
2352	struct vattr vattr;
2353	int busy = 0, error;
2354
2355	if (rootrefs > 0) {
2356		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2357		    ("vflush: bad args"));
2358		/*
2359		 * Get the filesystem root vnode. We can vput() it
2360		 * immediately, since with rootrefs > 0, it won't go away.
2361		 */
2362		if ((error = VFS_ROOT(mp, &rootvp)) != 0)
2363			return (error);
2364		vput(rootvp);
2365
2366	}
2367	mtx_lock(&mntvnode_mtx);
2368loop:
2369	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp; vp = nvp) {
2370		/*
2371		 * Make sure this vnode wasn't reclaimed in getnewvnode().
2372		 * Start over if it has (it won't be on the list anymore).
2373		 */
2374		if (vp->v_mount != mp)
2375			goto loop;
2376		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
2377
2378		VI_LOCK(vp);
2379		mtx_unlock(&mntvnode_mtx);
2380		vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
2381		/*
2382		 * Skip over a vnodes marked VV_SYSTEM.
2383		 */
2384		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2385			VOP_UNLOCK(vp, 0, td);
2386			mtx_lock(&mntvnode_mtx);
2387			continue;
2388		}
2389		/*
2390		 * If WRITECLOSE is set, flush out unlinked but still open
2391		 * files (even if open only for reading) and regular file
2392		 * vnodes open for writing.
2393		 */
2394		if (flags & WRITECLOSE) {
2395			error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
2396			VI_LOCK(vp);
2397
2398			if ((vp->v_type == VNON ||
2399			    (error == 0 && vattr.va_nlink > 0)) &&
2400			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
2401				VOP_UNLOCK(vp, LK_INTERLOCK, td);
2402				mtx_lock(&mntvnode_mtx);
2403				continue;
2404			}
2405		} else
2406			VI_LOCK(vp);
2407
2408		VOP_UNLOCK(vp, 0, td);
2409
2410		/*
2411		 * With v_usecount == 0, all we need to do is clear out the
2412		 * vnode data structures and we are done.
2413		 */
2414		if (vp->v_usecount == 0) {
2415			vgonel(vp, td);
2416			mtx_lock(&mntvnode_mtx);
2417			continue;
2418		}
2419
2420		/*
2421		 * If FORCECLOSE is set, forcibly close the vnode. For block
2422		 * or character devices, revert to an anonymous device. For
2423		 * all other files, just kill them.
2424		 */
2425		if (flags & FORCECLOSE) {
2426			if (vp->v_type != VCHR) {
2427				vgonel(vp, td);
2428			} else {
2429				vclean(vp, 0, td);
2430				VI_UNLOCK(vp);
2431				vp->v_op = spec_vnodeop_p;
2432				insmntque(vp, (struct mount *) 0);
2433			}
2434			mtx_lock(&mntvnode_mtx);
2435			continue;
2436		}
2437#ifdef DIAGNOSTIC
2438		if (busyprt)
2439			vprint("vflush: busy vnode", vp);
2440#endif
2441		VI_UNLOCK(vp);
2442		mtx_lock(&mntvnode_mtx);
2443		busy++;
2444	}
2445	mtx_unlock(&mntvnode_mtx);
2446	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2447		/*
2448		 * If just the root vnode is busy, and if its refcount
2449		 * is equal to `rootrefs', then go ahead and kill it.
2450		 */
2451		VI_LOCK(rootvp);
2452		KASSERT(busy > 0, ("vflush: not busy"));
2453		KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
2454		if (busy == 1 && rootvp->v_usecount == rootrefs) {
2455			vgonel(rootvp, td);
2456			busy = 0;
2457		} else
2458			VI_UNLOCK(rootvp);
2459	}
2460	if (busy)
2461		return (EBUSY);
2462	for (; rootrefs > 0; rootrefs--)
2463		vrele(rootvp);
2464	return (0);
2465}
2466
/*
 * Move a (likely recyclable) vnode to the tail of its mount's vnode
 * list.  XXX The body is compiled out for the time being: ffs_sync()
 * and friends have loop restart conditions which this code causes to
 * operate O(N^2), and they need cleaning up first.  Until then this
 * function does nothing.
 */
static void
vlruvp(struct vnode *vp)
{
#if 0
	struct mount *mp = vp->v_mount;

	if (mp != NULL) {
		mtx_lock(&mntvnode_mtx);
		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		mtx_unlock(&mntvnode_mtx);
	}
#endif
}
2487
2488/*
2489 * Disassociate the underlying filesystem from a vnode.
2490 */
2491static void
2492vclean(vp, flags, td)
2493	struct vnode *vp;
2494	int flags;
2495	struct thread *td;
2496{
2497	int active;
2498
2499	ASSERT_VI_LOCKED(vp, "vclean");
2500	/*
2501	 * Check to see if the vnode is in use. If so we have to reference it
2502	 * before we clean it out so that its count cannot fall to zero and
2503	 * generate a race against ourselves to recycle it.
2504	 */
2505	if ((active = vp->v_usecount))
2506		v_incr_usecount(vp, 1);
2507
2508	/*
2509	 * Prevent the vnode from being recycled or brought into use while we
2510	 * clean it out.
2511	 */
2512	if (vp->v_iflag & VI_XLOCK)
2513		panic("vclean: deadlock");
2514	vp->v_iflag |= VI_XLOCK;
2515	vp->v_vxproc = curthread;
2516	/*
2517	 * Even if the count is zero, the VOP_INACTIVE routine may still
2518	 * have the object locked while it cleans it out. The VOP_LOCK
2519	 * ensures that the VOP_INACTIVE routine is done with its work.
2520	 * For active vnodes, it ensures that no other activity can
2521	 * occur while the underlying object is being cleaned out.
2522	 */
2523	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
2524
2525	/*
2526	 * Clean out any buffers associated with the vnode.
2527	 * If the flush fails, just toss the buffers.
2528	 */
2529	if (flags & DOCLOSE) {
2530		struct buf *bp;
2531		VI_LOCK(vp);
2532		bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
2533		VI_UNLOCK(vp);
2534		if (bp != NULL)
2535			(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
2536		if (vinvalbuf(vp, V_SAVE, NOCRED, td, 0, 0) != 0)
2537			vinvalbuf(vp, 0, NOCRED, td, 0, 0);
2538	}
2539
2540	VOP_DESTROYVOBJECT(vp);
2541
2542	/*
2543	 * Any other processes trying to obtain this lock must first
2544	 * wait for VXLOCK to clear, then call the new lock operation.
2545	 */
2546	VOP_UNLOCK(vp, 0, td);
2547
2548	/*
2549	 * If purging an active vnode, it must be closed and
2550	 * deactivated before being reclaimed. Note that the
2551	 * VOP_INACTIVE will unlock the vnode.
2552	 */
2553	if (active) {
2554		if (flags & DOCLOSE)
2555			VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2556		VI_LOCK(vp);
2557		if ((vp->v_iflag & VI_DOINGINACT) == 0) {
2558			vp->v_iflag |= VI_DOINGINACT;
2559			VI_UNLOCK(vp);
2560			if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
2561				panic("vclean: cannot relock.");
2562			VOP_INACTIVE(vp, td);
2563			VI_LOCK(vp);
2564			KASSERT(vp->v_iflag & VI_DOINGINACT,
2565			    ("vclean: lost VI_DOINGINACT"));
2566			vp->v_iflag &= ~VI_DOINGINACT;
2567		}
2568		VI_UNLOCK(vp);
2569	}
2570
2571	/*
2572	 * Reclaim the vnode.
2573	 */
2574	if (VOP_RECLAIM(vp, td))
2575		panic("vclean: cannot reclaim");
2576
2577	if (active) {
2578		/*
2579		 * Inline copy of vrele() since VOP_INACTIVE
2580		 * has already been called.
2581		 */
2582		VI_LOCK(vp);
2583		v_incr_usecount(vp, -1);
2584		if (vp->v_usecount <= 0) {
2585#ifdef DIAGNOSTIC
2586			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
2587				vprint("vclean: bad ref count", vp);
2588				panic("vclean: ref cnt");
2589			}
2590#endif
2591			vfree(vp);
2592		}
2593		VI_UNLOCK(vp);
2594	}
2595
2596	cache_purge(vp);
2597	VI_LOCK(vp);
2598	if (VSHOULDFREE(vp))
2599		vfree(vp);
2600
2601	/*
2602	 * Done with purge, reset to the standard lock and
2603	 * notify sleepers of the grim news.
2604	 */
2605	vp->v_vnlock = &vp->v_lock;
2606	vp->v_op = dead_vnodeop_p;
2607	if (vp->v_pollinfo != NULL)
2608		vn_pollgone(vp);
2609	vp->v_tag = "none";
2610	vp->v_iflag &= ~VI_XLOCK;
2611	vp->v_vxproc = NULL;
2612	if (vp->v_iflag & VI_XWANT) {
2613		vp->v_iflag &= ~VI_XWANT;
2614		wakeup(vp);
2615	}
2616}
2617
2618/*
2619 * Eliminate all activity associated with the requested vnode
2620 * and with all vnodes aliased to the requested vnode.
2621 */
2622int
2623vop_revoke(ap)
2624	struct vop_revoke_args /* {
2625		struct vnode *a_vp;
2626		int a_flags;
2627	} */ *ap;
2628{
2629	struct vnode *vp, *vq;
2630	dev_t dev;
2631
2632	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
2633	vp = ap->a_vp;
2634	KASSERT((vp->v_type == VCHR), ("vop_revoke: not VCHR"));
2635
2636	VI_LOCK(vp);
2637	/*
2638	 * If a vgone (or vclean) is already in progress,
2639	 * wait until it is done and return.
2640	 */
2641	if (vp->v_iflag & VI_XLOCK) {
2642		vp->v_iflag |= VI_XWANT;
2643		msleep(vp, VI_MTX(vp), PINOD | PDROP,
2644		    "vop_revokeall", 0);
2645		return (0);
2646	}
2647	VI_UNLOCK(vp);
2648	dev = vp->v_rdev;
2649	for (;;) {
2650		mtx_lock(&spechash_mtx);
2651		vq = SLIST_FIRST(&dev->si_hlist);
2652		mtx_unlock(&spechash_mtx);
2653		if (!vq)
2654			break;
2655		vgone(vq);
2656	}
2657	return (0);
2658}
2659
2660/*
2661 * Recycle an unused vnode to the front of the free list.
2662 * Release the passed interlock if the vnode will be recycled.
2663 */
2664int
2665vrecycle(vp, inter_lkp, td)
2666	struct vnode *vp;
2667	struct mtx *inter_lkp;
2668	struct thread *td;
2669{
2670
2671	VI_LOCK(vp);
2672	if (vp->v_usecount == 0) {
2673		if (inter_lkp) {
2674			mtx_unlock(inter_lkp);
2675		}
2676		vgonel(vp, td);
2677		return (1);
2678	}
2679	VI_UNLOCK(vp);
2680	return (0);
2681}
2682
2683/*
2684 * Eliminate all activity associated with a vnode
2685 * in preparation for reuse.
2686 */
2687void
2688vgone(vp)
2689	register struct vnode *vp;
2690{
2691	struct thread *td = curthread;	/* XXX */
2692
2693	VI_LOCK(vp);
2694	vgonel(vp, td);
2695}
2696
2697/*
2698 * vgone, with the vp interlock held.
2699 */
2700void
2701vgonel(vp, td)
2702	struct vnode *vp;
2703	struct thread *td;
2704{
2705	int s;
2706
2707	/*
2708	 * If a vgone (or vclean) is already in progress,
2709	 * wait until it is done and return.
2710	 */
2711	ASSERT_VI_LOCKED(vp, "vgonel");
2712	if (vp->v_iflag & VI_XLOCK) {
2713		vp->v_iflag |= VI_XWANT;
2714		msleep(vp, VI_MTX(vp), PINOD | PDROP, "vgone", 0);
2715		return;
2716	}
2717
2718	/*
2719	 * Clean out the filesystem specific data.
2720	 */
2721	vclean(vp, DOCLOSE, td);
2722	VI_UNLOCK(vp);
2723
2724	/*
2725	 * Delete from old mount point vnode list, if on one.
2726	 */
2727	if (vp->v_mount != NULL)
2728		insmntque(vp, (struct mount *)0);
2729	/*
2730	 * If special device, remove it from special device alias list
2731	 * if it is on one.
2732	 */
2733	if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) {
2734		VI_LOCK(vp);
2735		mtx_lock(&spechash_mtx);
2736		SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
2737		vp->v_rdev->si_usecount -= vp->v_usecount;
2738		mtx_unlock(&spechash_mtx);
2739		VI_UNLOCK(vp);
2740		vp->v_rdev = NULL;
2741	}
2742
2743	/*
2744	 * If it is on the freelist and not already at the head,
2745	 * move it to the head of the list. The test of the
2746	 * VDOOMED flag and the reference count of zero is because
2747	 * it will be removed from the free list by getnewvnode,
2748	 * but will not have its reference count incremented until
2749	 * after calling vgone. If the reference count were
2750	 * incremented first, vgone would (incorrectly) try to
2751	 * close the previous instance of the underlying object.
2752	 */
2753	VI_LOCK(vp);
2754	if (vp->v_usecount == 0 && !(vp->v_iflag & VI_DOOMED)) {
2755		s = splbio();
2756		mtx_lock(&vnode_free_list_mtx);
2757		if (vp->v_iflag & VI_FREE) {
2758			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2759		} else {
2760			vp->v_iflag |= VI_FREE;
2761			freevnodes++;
2762		}
2763		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2764		mtx_unlock(&vnode_free_list_mtx);
2765		splx(s);
2766	}
2767
2768	vp->v_type = VBAD;
2769	VI_UNLOCK(vp);
2770}
2771
2772/*
2773 * Lookup a vnode by device number.
2774 */
2775int
2776vfinddev(dev, type, vpp)
2777	dev_t dev;
2778	enum vtype type;
2779	struct vnode **vpp;
2780{
2781	struct vnode *vp;
2782
2783	mtx_lock(&spechash_mtx);
2784	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
2785		if (type == vp->v_type) {
2786			*vpp = vp;
2787			mtx_unlock(&spechash_mtx);
2788			return (1);
2789		}
2790	}
2791	mtx_unlock(&spechash_mtx);
2792	return (0);
2793}
2794
2795/*
2796 * Calculate the total number of references to a special device.
2797 */
2798int
2799vcount(vp)
2800	struct vnode *vp;
2801{
2802	int count;
2803
2804	mtx_lock(&spechash_mtx);
2805	count = vp->v_rdev->si_usecount;
2806	mtx_unlock(&spechash_mtx);
2807	return (count);
2808}
2809
2810/*
2811 * Same as above, but using the dev_t as argument
2812 */
2813int
2814count_dev(dev)
2815	dev_t dev;
2816{
2817	struct vnode *vp;
2818
2819	vp = SLIST_FIRST(&dev->si_hlist);
2820	if (vp == NULL)
2821		return (0);
2822	return(vcount(vp));
2823}
2824
2825/*
2826 * Print out a description of a vnode.
2827 */
2828static char *typename[] =
2829{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
2830
2831void
2832vprint(label, vp)
2833	char *label;
2834	struct vnode *vp;
2835{
2836	char buf[96];
2837
2838	if (label != NULL)
2839		printf("%s: %p: ", label, (void *)vp);
2840	else
2841		printf("%p: ", (void *)vp);
2842	printf("tag %s, type %s, usecount %d, writecount %d, refcount %d,",
2843	    vp->v_tag, typename[vp->v_type], vp->v_usecount,
2844	    vp->v_writecount, vp->v_holdcnt);
2845	buf[0] = '\0';
2846	if (vp->v_vflag & VV_ROOT)
2847		strcat(buf, "|VV_ROOT");
2848	if (vp->v_vflag & VV_TEXT)
2849		strcat(buf, "|VV_TEXT");
2850	if (vp->v_vflag & VV_SYSTEM)
2851		strcat(buf, "|VV_SYSTEM");
2852	if (vp->v_iflag & VI_XLOCK)
2853		strcat(buf, "|VI_XLOCK");
2854	if (vp->v_iflag & VI_XWANT)
2855		strcat(buf, "|VI_XWANT");
2856	if (vp->v_iflag & VI_BWAIT)
2857		strcat(buf, "|VI_BWAIT");
2858	if (vp->v_iflag & VI_DOOMED)
2859		strcat(buf, "|VI_DOOMED");
2860	if (vp->v_iflag & VI_FREE)
2861		strcat(buf, "|VI_FREE");
2862	if (vp->v_vflag & VV_OBJBUF)
2863		strcat(buf, "|VV_OBJBUF");
2864	if (buf[0] != '\0')
2865		printf(" flags (%s),", &buf[1]);
2866	lockmgr_printinfo(vp->v_vnlock);
2867	printf("\n");
2868	if (vp->v_data != NULL)
2869		VOP_PRINT(vp);
2870}
2871
#ifdef DDB
#include <ddb/ddb.h>
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 *
 * Mount points that cannot be busied without waiting are skipped.
 */
DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
{
	struct thread *td = curthread;	/* XXX */
	struct mount *mp, *next_mp;
	struct vnode *vp;

	printf("Locked vnodes\n");
	mtx_lock(&mountlist_mtx);
	mp = TAILQ_FIRST(&mountlist);
	while (mp != NULL) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td) != 0) {
			mp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}

		mtx_lock(&mntvnode_mtx);
		for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL;
		    vp = TAILQ_NEXT(vp, v_nmntvnodes)) {
			if (VOP_ISLOCKED(vp, NULL))
				vprint(NULL, vp);
		}
		mtx_unlock(&mntvnode_mtx);

		/*
		 * Take the mount list lock again and pick up the
		 * successor before the mount point is unbusied.
		 */
		mtx_lock(&mountlist_mtx);
		next_mp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp, td);
		mp = next_mp;
	}
	mtx_unlock(&mountlist_mtx);
}
#endif
2904
2905/*
2906 * Fill in a struct xvfsconf based on a struct vfsconf.
2907 */
2908static void
2909vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
2910{
2911
2912	strcpy(xvfsp->vfc_name, vfsp->vfc_name);
2913	xvfsp->vfc_typenum = vfsp->vfc_typenum;
2914	xvfsp->vfc_refcount = vfsp->vfc_refcount;
2915	xvfsp->vfc_flags = vfsp->vfc_flags;
2916	/*
2917	 * These are unused in userland, we keep them
2918	 * to not break binary compatibility.
2919	 */
2920	xvfsp->vfc_vfsops = NULL;
2921	xvfsp->vfc_next = NULL;
2922}
2923
2924static int
2925sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
2926{
2927	struct vfsconf *vfsp;
2928	struct xvfsconf *xvfsp;
2929	int cnt, error, i;
2930
2931	cnt = 0;
2932	for (vfsp = vfsconf; vfsp != NULL; vfsp = vfsp->vfc_next)
2933		cnt++;
2934	xvfsp = malloc(sizeof(struct xvfsconf) * cnt, M_TEMP, M_WAITOK);
2935	/*
2936	 * Handle the race that we will have here when struct vfsconf
2937	 * will be locked down by using both cnt and checking vfc_next
2938	 * against NULL to determine the end of the loop.  The race will
2939	 * happen because we will have to unlock before calling malloc().
2940	 * We are protected by Giant for now.
2941	 */
2942	i = 0;
2943	for (vfsp = vfsconf; vfsp != NULL && i < cnt; vfsp = vfsp->vfc_next) {
2944		vfsconf2x(vfsp, xvfsp + i);
2945		i++;
2946	}
2947	error = SYSCTL_OUT(req, xvfsp, sizeof(struct xvfsconf) * i);
2948	free(xvfsp, M_TEMP);
2949	return (error);
2950}
2951
2952SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist,
2953    "S,xvfsconf", "List of all configured filesystems");
2954
2955/*
2956 * Top level filesystem related information gathering.
2957 */
2958static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
2959
2960static int
2961vfs_sysctl(SYSCTL_HANDLER_ARGS)
2962{
2963	int *name = (int *)arg1 - 1;	/* XXX */
2964	u_int namelen = arg2 + 1;	/* XXX */
2965	struct vfsconf *vfsp;
2966	struct xvfsconf xvfsp;
2967
2968	printf("WARNING: userland calling deprecated sysctl, "
2969	    "please rebuild world\n");
2970
2971#if 1 || defined(COMPAT_PRELITE2)
2972	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2973	if (namelen == 1)
2974		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2975#endif
2976
2977	switch (name[1]) {
2978	case VFS_MAXTYPENUM:
2979		if (namelen != 2)
2980			return (ENOTDIR);
2981		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2982	case VFS_CONF:
2983		if (namelen != 3)
2984			return (ENOTDIR);	/* overloaded */
2985		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2986			if (vfsp->vfc_typenum == name[2])
2987				break;
2988		if (vfsp == NULL)
2989			return (EOPNOTSUPP);
2990		vfsconf2x(vfsp, &xvfsp);
2991		return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
2992	}
2993	return (EOPNOTSUPP);
2994}
2995
2996SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, vfs_sysctl,
2997	"Generic filesystem");
2998
2999#if 1 || defined(COMPAT_PRELITE2)
3000
3001static int
3002sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
3003{
3004	int error;
3005	struct vfsconf *vfsp;
3006	struct ovfsconf ovfs;
3007
3008	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
3009		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
3010		strcpy(ovfs.vfc_name, vfsp->vfc_name);
3011		ovfs.vfc_index = vfsp->vfc_typenum;
3012		ovfs.vfc_refcount = vfsp->vfc_refcount;
3013		ovfs.vfc_flags = vfsp->vfc_flags;
3014		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
3015		if (error)
3016			return error;
3017	}
3018	return 0;
3019}
3020
3021#endif /* 1 || COMPAT_PRELITE2 */
3022
#define KINFO_VNODESLOP		10
#ifdef notyet
/*
 * Dump vnode list (via sysctl).
 *
 * Without an output buffer only a size estimate is returned.
 * Otherwise an array with room for numvnodes + KINFO_VNODESLOP
 * struct xvnode entries is allocated, filled from the vnode lists of
 * every mount point that can be busied without waiting, and copied
 * out.  Vnodes of type VNON/VBAD and device vnodes without v_rdev
 * are left out.
 */
/* ARGSUSED */
static int
sysctl_vnode(SYSCTL_HANDLER_ARGS)
{
	struct xvnode *xvn;
	struct thread *td = req->td;
	struct mount *mp;
	struct vnode *vp;
	int error, len, n, nmax;

	/*
	 * Stale numvnodes access is not fatal here.
	 */
	req->lock = 0;
	nmax = numvnodes + KINFO_VNODESLOP;	/* capacity in entries */
	len = nmax * sizeof *xvn;		/* capacity in bytes */
	if (!req->oldptr)
		/* Make an estimate */
		return (SYSCTL_OUT(req, 0, len));

	sysctl_wire_old_buffer(req, 0);
	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
	n = 0;
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
			continue;
		mtx_lock(&mntvnode_mtx);
		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
			/*
			 * n counts entries, so it has to be bounded
			 * by the entry capacity and not by the byte
			 * length; otherwise xvn[] can be overrun.
			 */
			if (n == nmax)
				break;
			vref(vp);
			xvn[n].xv_size = sizeof *xvn;
			xvn[n].xv_vnode = vp;
#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
			XV_COPY(usecount);
			XV_COPY(writecount);
			XV_COPY(holdcnt);
			XV_COPY(id);
			XV_COPY(mount);
			XV_COPY(numoutput);
			XV_COPY(type);
#undef XV_COPY
			xvn[n].xv_flag = vp->v_vflag;

			/* Type specific part of the record. */
			switch (vp->v_type) {
			case VREG:
			case VDIR:
			case VLNK:
				xvn[n].xv_dev = vp->v_cachedfs;
				xvn[n].xv_ino = vp->v_cachedid;
				break;
			case VBLK:
			case VCHR:
				if (vp->v_rdev == NULL) {
					/* Slot n is reused by the next vnode. */
					vrele(vp);
					continue;
				}
				xvn[n].xv_dev = dev2udev(vp->v_rdev);
				break;
			case VSOCK:
				xvn[n].xv_socket = vp->v_socket;
				break;
			case VFIFO:
				xvn[n].xv_fifo = vp->v_fifoinfo;
				break;
			case VNON:
			case VBAD:
			default:
				/* shouldn't happen? */
				vrele(vp);
				continue;
			}
			vrele(vp);
			++n;
		}
		mtx_unlock(&mntvnode_mtx);
		mtx_lock(&mountlist_mtx);
		vfs_unbusy(mp, td);
		if (n == nmax)
			break;
	}
	mtx_unlock(&mountlist_mtx);

	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
	free(xvn, M_TEMP);
	return (error);
}

SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
	0, 0, sysctl_vnode, "S,xvnode", "");
#endif
3119
3120/*
3121 * Check to see if a filesystem is mounted on a block device.
3122 */
3123int
3124vfs_mountedon(vp)
3125	struct vnode *vp;
3126{
3127
3128	if (vp->v_rdev->si_mountpoint != NULL)
3129		return (EBUSY);
3130	return (0);
3131}
3132
3133/*
3134 * Unmount all filesystems. The list is traversed in reverse order
3135 * of mounting to avoid dependencies.
3136 */
3137void
3138vfs_unmountall()
3139{
3140	struct mount *mp;
3141	struct thread *td;
3142	int error;
3143
3144	if (curthread != NULL)
3145		td = curthread;
3146	else
3147		td = FIRST_THREAD_IN_PROC(initproc); /* XXX XXX proc0? */
3148	/*
3149	 * Since this only runs when rebooting, it is not interlocked.
3150	 */
3151	while(!TAILQ_EMPTY(&mountlist)) {
3152		mp = TAILQ_LAST(&mountlist, mntlist);
3153		error = dounmount(mp, MNT_FORCE, td);
3154		if (error) {
3155			TAILQ_REMOVE(&mountlist, mp, mnt_list);
3156			printf("unmount of %s failed (",
3157			    mp->mnt_stat.f_mntonname);
3158			if (error == EBUSY)
3159				printf("BUSY)\n");
3160			else
3161				printf("%d)\n", error);
3162		} else {
3163			/* The unmount has removed mp from the mountlist */
3164		}
3165	}
3166}
3167
3168/*
3169 * perform msync on all vnodes under a mount point
3170 * the mount point must be locked.
3171 */
3172void
3173vfs_msync(struct mount *mp, int flags)
3174{
3175	struct vnode *vp, *nvp;
3176	struct vm_object *obj;
3177	int tries;
3178
3179	GIANT_REQUIRED;
3180
3181	tries = 5;
3182	mtx_lock(&mntvnode_mtx);
3183loop:
3184	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) {
3185		if (vp->v_mount != mp) {
3186			if (--tries > 0)
3187				goto loop;
3188			break;
3189		}
3190		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
3191
3192		VI_LOCK(vp);
3193		if (vp->v_iflag & VI_XLOCK) {	/* XXX: what if MNT_WAIT? */
3194			VI_UNLOCK(vp);
3195			continue;
3196		}
3197
3198		if ((vp->v_iflag & VI_OBJDIRTY) &&
3199		    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
3200			mtx_unlock(&mntvnode_mtx);
3201			if (!vget(vp,
3202			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3203			    curthread)) {
3204				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
3205					vput(vp);
3206					mtx_lock(&mntvnode_mtx);
3207					continue;
3208				}
3209
3210				if (VOP_GETVOBJECT(vp, &obj) == 0) {
3211					VM_OBJECT_LOCK(obj);
3212					vm_object_page_clean(obj, 0, 0,
3213					    flags == MNT_WAIT ?
3214					    OBJPC_SYNC : OBJPC_NOSYNC);
3215					VM_OBJECT_UNLOCK(obj);
3216				}
3217				vput(vp);
3218			}
3219			mtx_lock(&mntvnode_mtx);
3220			if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
3221				if (--tries > 0)
3222					goto loop;
3223				break;
3224			}
3225		} else
3226			VI_UNLOCK(vp);
3227	}
3228	mtx_unlock(&mntvnode_mtx);
3229}
3230
3231/*
3232 * Create the VM object needed for VMIO and mmap support.  This
3233 * is done for all VREG files in the system.  Some filesystems might
3234 * afford the additional metadata buffering capability of the
3235 * VMIO code by making the device node be VMIO mode also.
3236 *
3237 * vp must be locked when vfs_object_create is called.
3238 */
3239int
3240vfs_object_create(vp, td, cred)
3241	struct vnode *vp;
3242	struct thread *td;
3243	struct ucred *cred;
3244{
3245	GIANT_REQUIRED;
3246	return (VOP_CREATEVOBJECT(vp, cred, td));
3247}
3248
3249/*
3250 * Mark a vnode as free, putting it up for recycling.
3251 */
3252void
3253vfree(vp)
3254	struct vnode *vp;
3255{
3256	int s;
3257
3258	ASSERT_VI_LOCKED(vp, "vfree");
3259	s = splbio();
3260	mtx_lock(&vnode_free_list_mtx);
3261	KASSERT((vp->v_iflag & VI_FREE) == 0, ("vnode already free"));
3262	if (vp->v_iflag & VI_AGE) {
3263		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
3264	} else {
3265		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
3266	}
3267	freevnodes++;
3268	mtx_unlock(&vnode_free_list_mtx);
3269	vp->v_iflag &= ~VI_AGE;
3270	vp->v_iflag |= VI_FREE;
3271	splx(s);
3272}
3273
3274/*
3275 * Opposite of vfree() - mark a vnode as in use.
3276 */
3277void
3278vbusy(vp)
3279	struct vnode *vp;
3280{
3281	int s;
3282
3283	s = splbio();
3284	ASSERT_VI_LOCKED(vp, "vbusy");
3285	KASSERT((vp->v_iflag & VI_FREE) != 0, ("vnode not free"));
3286
3287	mtx_lock(&vnode_free_list_mtx);
3288	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
3289	freevnodes--;
3290	mtx_unlock(&vnode_free_list_mtx);
3291
3292	vp->v_iflag &= ~(VI_FREE|VI_AGE);
3293	splx(s);
3294}
3295
3296/*
3297 * Record a process's interest in events which might happen to
3298 * a vnode.  Because poll uses the historic select-style interface
3299 * internally, this routine serves as both the ``check for any
3300 * pending events'' and the ``record my interest in future events''
3301 * functions.  (These are done together, while the lock is held,
3302 * to avoid race conditions.)
3303 */
3304int
3305vn_pollrecord(vp, td, events)
3306	struct vnode *vp;
3307	struct thread *td;
3308	short events;
3309{
3310
3311	if (vp->v_pollinfo == NULL)
3312		v_addpollinfo(vp);
3313	mtx_lock(&vp->v_pollinfo->vpi_lock);
3314	if (vp->v_pollinfo->vpi_revents & events) {
3315		/*
3316		 * This leaves events we are not interested
3317		 * in available for the other process which
3318		 * which presumably had requested them
3319		 * (otherwise they would never have been
3320		 * recorded).
3321		 */
3322		events &= vp->v_pollinfo->vpi_revents;
3323		vp->v_pollinfo->vpi_revents &= ~events;
3324
3325		mtx_unlock(&vp->v_pollinfo->vpi_lock);
3326		return events;
3327	}
3328	vp->v_pollinfo->vpi_events |= events;
3329	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3330	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3331	return 0;
3332}
3333
3334/*
3335 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
3336 * it is possible for us to miss an event due to race conditions, but
3337 * that condition is expected to be rare, so for the moment it is the
3338 * preferred interface.
3339 */
3340void
3341vn_pollevent(vp, events)
3342	struct vnode *vp;
3343	short events;
3344{
3345
3346	if (vp->v_pollinfo == NULL)
3347		v_addpollinfo(vp);
3348	mtx_lock(&vp->v_pollinfo->vpi_lock);
3349	if (vp->v_pollinfo->vpi_events & events) {
3350		/*
3351		 * We clear vpi_events so that we don't
3352		 * call selwakeup() twice if two events are
3353		 * posted before the polling process(es) is
3354		 * awakened.  This also ensures that we take at
3355		 * most one selwakeup() if the polling process
3356		 * is no longer interested.  However, it does
3357		 * mean that only one event can be noticed at
3358		 * a time.  (Perhaps we should only clear those
3359		 * event bits which we note?) XXX
3360		 */
3361		vp->v_pollinfo->vpi_events = 0;	/* &= ~events ??? */
3362		vp->v_pollinfo->vpi_revents |= events;
3363		selwakeup(&vp->v_pollinfo->vpi_selinfo);
3364	}
3365	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3366}
3367
3368/*
3369 * Wake up anyone polling on vp because it is being revoked.
3370 * This depends on dead_poll() returning POLLHUP for correct
3371 * behavior.
3372 */
3373void
3374vn_pollgone(vp)
3375	struct vnode *vp;
3376{
3377
3378	mtx_lock(&vp->v_pollinfo->vpi_lock);
3379	VN_KNOTE(vp, NOTE_REVOKE);
3380	if (vp->v_pollinfo->vpi_events) {
3381		vp->v_pollinfo->vpi_events = 0;
3382		selwakeup(&vp->v_pollinfo->vpi_selinfo);
3383	}
3384	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3385}
3386
3387
3388
3389/*
3390 * Routine to create and manage a filesystem syncer vnode.
3391 */
3392#define sync_close ((int (*)(struct  vop_close_args *))nullop)
3393static int	sync_fsync(struct  vop_fsync_args *);
3394static int	sync_inactive(struct  vop_inactive_args *);
3395static int	sync_reclaim(struct  vop_reclaim_args *);
3396
3397static vop_t **sync_vnodeop_p;
3398static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
3399	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
3400	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
3401	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
3402	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
3403	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
3404	{ &vop_lock_desc,	(vop_t *) vop_stdlock },	/* lock */
3405	{ &vop_unlock_desc,	(vop_t *) vop_stdunlock },	/* unlock */
3406	{ &vop_islocked_desc,	(vop_t *) vop_stdislocked },	/* islocked */
3407	{ NULL, NULL }
3408};
3409static struct vnodeopv_desc sync_vnodeop_opv_desc =
3410	{ &sync_vnodeop_p, sync_vnodeop_entries };
3411
3412VNODEOP_SET(sync_vnodeop_opv_desc);
3413
3414/*
3415 * Create a new filesystem syncer vnode for the specified mount point.
3416 */
3417int
3418vfs_allocate_syncvnode(mp)
3419	struct mount *mp;
3420{
3421	struct vnode *vp;
3422	static long start, incr, next;
3423	int error;
3424
3425	/* Allocate a new vnode */
3426	if ((error = getnewvnode("syncer", mp, sync_vnodeop_p, &vp)) != 0) {
3427		mp->mnt_syncer = NULL;
3428		return (error);
3429	}
3430	vp->v_type = VNON;
3431	/*
3432	 * Place the vnode onto the syncer worklist. We attempt to
3433	 * scatter them about on the list so that they will go off
3434	 * at evenly distributed times even if all the filesystems
3435	 * are mounted at once.
3436	 */
3437	next += incr;
3438	if (next == 0 || next > syncer_maxdelay) {
3439		start /= 2;
3440		incr /= 2;
3441		if (start == 0) {
3442			start = syncer_maxdelay / 2;
3443			incr = syncer_maxdelay;
3444		}
3445		next = start;
3446	}
3447	VI_LOCK(vp);
3448	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
3449	VI_UNLOCK(vp);
3450	mp->mnt_syncer = vp;
3451	return (0);
3452}
3453
3454/*
3455 * Do a lazy sync of the filesystem.
3456 */
3457static int
3458sync_fsync(ap)
3459	struct vop_fsync_args /* {
3460		struct vnode *a_vp;
3461		struct ucred *a_cred;
3462		int a_waitfor;
3463		struct thread *a_td;
3464	} */ *ap;
3465{
3466	struct vnode *syncvp = ap->a_vp;
3467	struct mount *mp = syncvp->v_mount;
3468	struct thread *td = ap->a_td;
3469	int error, asyncflag;
3470
3471	/*
3472	 * We only need to do something if this is a lazy evaluation.
3473	 */
3474	if (ap->a_waitfor != MNT_LAZY)
3475		return (0);
3476
3477	/*
3478	 * Move ourselves to the back of the sync list.
3479	 */
3480	VI_LOCK(syncvp);
3481	vn_syncer_add_to_worklist(syncvp, syncdelay);
3482	VI_UNLOCK(syncvp);
3483
3484	/*
3485	 * Walk the list of vnodes pushing all that are dirty and
3486	 * not already on the sync list.
3487	 */
3488	mtx_lock(&mountlist_mtx);
3489	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
3490		mtx_unlock(&mountlist_mtx);
3491		return (0);
3492	}
3493	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3494		vfs_unbusy(mp, td);
3495		return (0);
3496	}
3497	asyncflag = mp->mnt_flag & MNT_ASYNC;
3498	mp->mnt_flag &= ~MNT_ASYNC;
3499	vfs_msync(mp, MNT_NOWAIT);
3500	error = VFS_SYNC(mp, MNT_LAZY, ap->a_cred, td);
3501	if (asyncflag)
3502		mp->mnt_flag |= MNT_ASYNC;
3503	vn_finished_write(mp);
3504	vfs_unbusy(mp, td);
3505	return (error);
3506}
3507
3508/*
3509 * The syncer vnode is no referenced.
3510 */
3511static int
3512sync_inactive(ap)
3513	struct vop_inactive_args /* {
3514		struct vnode *a_vp;
3515		struct thread *a_td;
3516	} */ *ap;
3517{
3518
3519	VOP_UNLOCK(ap->a_vp, 0, ap->a_td);
3520	vgone(ap->a_vp);
3521	return (0);
3522}
3523
3524/*
3525 * The syncer vnode is no longer needed and is being decommissioned.
3526 *
3527 * Modifications to the worklist must be protected at splbio().
3528 */
3529static int
3530sync_reclaim(ap)
3531	struct vop_reclaim_args /* {
3532		struct vnode *a_vp;
3533	} */ *ap;
3534{
3535	struct vnode *vp = ap->a_vp;
3536	int s;
3537
3538	s = splbio();
3539	vp->v_mount->mnt_syncer = NULL;
3540	VI_LOCK(vp);
3541	if (vp->v_iflag & VI_ONWORKLST) {
3542		mtx_lock(&sync_mtx);
3543		LIST_REMOVE(vp, v_synclist);
3544		mtx_unlock(&sync_mtx);
3545		vp->v_iflag &= ~VI_ONWORKLST;
3546	}
3547	VI_UNLOCK(vp);
3548	splx(s);
3549
3550	return (0);
3551}
3552
3553/*
3554 * extract the dev_t from a VCHR
3555 */
3556dev_t
3557vn_todev(vp)
3558	struct vnode *vp;
3559{
3560	if (vp->v_type != VCHR)
3561		return (NODEV);
3562	return (vp->v_rdev);
3563}
3564
3565/*
3566 * Check if vnode represents a disk device
3567 */
3568int
3569vn_isdisk(vp, errp)
3570	struct vnode *vp;
3571	int *errp;
3572{
3573	struct cdevsw *cdevsw;
3574
3575	if (vp->v_type != VCHR) {
3576		if (errp != NULL)
3577			*errp = ENOTBLK;
3578		return (0);
3579	}
3580	if (vp->v_rdev == NULL) {
3581		if (errp != NULL)
3582			*errp = ENXIO;
3583		return (0);
3584	}
3585	cdevsw = devsw(vp->v_rdev);
3586	if (cdevsw == NULL) {
3587		if (errp != NULL)
3588			*errp = ENXIO;
3589		return (0);
3590	}
3591	if (!(cdevsw->d_flags & D_DISK)) {
3592		if (errp != NULL)
3593			*errp = ENOTBLK;
3594		return (0);
3595	}
3596	if (errp != NULL)
3597		*errp = 0;
3598	return (1);
3599}
3600
3601/*
3602 * Free data allocated by namei(); see namei(9) for details.
3603 */
3604void
3605NDFREE(ndp, flags)
3606     struct nameidata *ndp;
3607     const uint flags;
3608{
3609	if (!(flags & NDF_NO_FREE_PNBUF) &&
3610	    (ndp->ni_cnd.cn_flags & HASBUF)) {
3611		uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
3612		ndp->ni_cnd.cn_flags &= ~HASBUF;
3613	}
3614	if (!(flags & NDF_NO_DVP_UNLOCK) &&
3615	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
3616	    ndp->ni_dvp != ndp->ni_vp)
3617		VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread);
3618	if (!(flags & NDF_NO_DVP_RELE) &&
3619	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
3620		vrele(ndp->ni_dvp);
3621		ndp->ni_dvp = NULL;
3622	}
3623	if (!(flags & NDF_NO_VP_UNLOCK) &&
3624	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
3625		VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread);
3626	if (!(flags & NDF_NO_VP_RELE) &&
3627	    ndp->ni_vp) {
3628		vrele(ndp->ni_vp);
3629		ndp->ni_vp = NULL;
3630	}
3631	if (!(flags & NDF_NO_STARTDIR_RELE) &&
3632	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
3633		vrele(ndp->ni_startdir);
3634		ndp->ni_startdir = NULL;
3635	}
3636}
3637
3638/*
3639 * Common filesystem object access control check routine.  Accepts a
3640 * vnode's type, "mode", uid and gid, requested access mode, credentials,
3641 * and optional call-by-reference privused argument allowing vaccess()
3642 * to indicate to the caller whether privilege was used to satisfy the
3643 * request (obsoleted).  Returns 0 on success, or an errno on failure.
3644 */
3645int
3646vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
3647	enum vtype type;
3648	mode_t file_mode;
3649	uid_t file_uid;
3650	gid_t file_gid;
3651	mode_t acc_mode;
3652	struct ucred *cred;
3653	int *privused;
3654{
3655	mode_t dac_granted;
3656#ifdef CAPABILITIES
3657	mode_t cap_granted;
3658#endif
3659
3660	/*
3661	 * Look for a normal, non-privileged way to access the file/directory
3662	 * as requested.  If it exists, go with that.
3663	 */
3664
3665	if (privused != NULL)
3666		*privused = 0;
3667
3668	dac_granted = 0;
3669
3670	/* Check the owner. */
3671	if (cred->cr_uid == file_uid) {
3672		dac_granted |= VADMIN;
3673		if (file_mode & S_IXUSR)
3674			dac_granted |= VEXEC;
3675		if (file_mode & S_IRUSR)
3676			dac_granted |= VREAD;
3677		if (file_mode & S_IWUSR)
3678			dac_granted |= (VWRITE | VAPPEND);
3679
3680		if ((acc_mode & dac_granted) == acc_mode)
3681			return (0);
3682
3683		goto privcheck;
3684	}
3685
3686	/* Otherwise, check the groups (first match) */
3687	if (groupmember(file_gid, cred)) {
3688		if (file_mode & S_IXGRP)
3689			dac_granted |= VEXEC;
3690		if (file_mode & S_IRGRP)
3691			dac_granted |= VREAD;
3692		if (file_mode & S_IWGRP)
3693			dac_granted |= (VWRITE | VAPPEND);
3694
3695		if ((acc_mode & dac_granted) == acc_mode)
3696			return (0);
3697
3698		goto privcheck;
3699	}
3700
3701	/* Otherwise, check everyone else. */
3702	if (file_mode & S_IXOTH)
3703		dac_granted |= VEXEC;
3704	if (file_mode & S_IROTH)
3705		dac_granted |= VREAD;
3706	if (file_mode & S_IWOTH)
3707		dac_granted |= (VWRITE | VAPPEND);
3708	if ((acc_mode & dac_granted) == acc_mode)
3709		return (0);
3710
3711privcheck:
3712	if (!suser_cred(cred, PRISON_ROOT)) {
3713		/* XXX audit: privilege used */
3714		if (privused != NULL)
3715			*privused = 1;
3716		return (0);
3717	}
3718
3719#ifdef CAPABILITIES
3720	/*
3721	 * Build a capability mask to determine if the set of capabilities
3722	 * satisfies the requirements when combined with the granted mask
3723	 * from above.
3724	 * For each capability, if the capability is required, bitwise
3725	 * or the request type onto the cap_granted mask.
3726	 */
3727	cap_granted = 0;
3728
3729	if (type == VDIR) {
3730		/*
3731		 * For directories, use CAP_DAC_READ_SEARCH to satisfy
3732		 * VEXEC requests, instead of CAP_DAC_EXECUTE.
3733		 */
3734		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3735		    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3736			cap_granted |= VEXEC;
3737	} else {
3738		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3739		    !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT))
3740			cap_granted |= VEXEC;
3741	}
3742
3743	if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
3744	    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3745		cap_granted |= VREAD;
3746
3747	if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3748	    !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT))
3749		cap_granted |= (VWRITE | VAPPEND);
3750
3751	if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3752	    !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT))
3753		cap_granted |= VADMIN;
3754
3755	if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
3756		/* XXX audit: privilege used */
3757		if (privused != NULL)
3758			*privused = 1;
3759		return (0);
3760	}
3761#endif
3762
3763	return ((acc_mode & VADMIN) ? EPERM : EACCES);
3764}
3765
3766/*
3767 * Credential check based on process requesting service, and per-attribute
3768 * permissions.
3769 */
3770int
3771extattr_check_cred(struct vnode *vp, int attrnamespace,
3772    struct ucred *cred, struct thread *td, int access)
3773{
3774
3775	/*
3776	 * Kernel-invoked always succeeds.
3777	 */
3778	if (cred == NOCRED)
3779		return (0);
3780
3781	/*
3782	 * Do not allow privileged processes in jail to directly
3783	 * manipulate system attributes.
3784	 *
3785	 * XXX What capability should apply here?
3786	 * Probably CAP_SYS_SETFFLAG.
3787	 */
3788	switch (attrnamespace) {
3789	case EXTATTR_NAMESPACE_SYSTEM:
3790		/* Potentially should be: return (EPERM); */
3791		return (suser_cred(cred, 0));
3792	case EXTATTR_NAMESPACE_USER:
3793		return (VOP_ACCESS(vp, access, cred, td));
3794	default:
3795		return (EPERM);
3796	}
3797}
3798