vfs_subr.c revision 114945
1/*
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
39 * $FreeBSD: head/sys/kern/vfs_subr.c 114945 2003-05-12 14:37:47Z rwatson $
40 */
41
42/*
43 * External virtual filesystem routines
44 */
45#include "opt_ddb.h"
46#include "opt_mac.h"
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/bio.h>
51#include <sys/buf.h>
52#include <sys/conf.h>
53#include <sys/eventhandler.h>
54#include <sys/extattr.h>
55#include <sys/fcntl.h>
56#include <sys/kernel.h>
57#include <sys/kthread.h>
58#include <sys/mac.h>
59#include <sys/malloc.h>
60#include <sys/mount.h>
61#include <sys/namei.h>
62#include <sys/stat.h>
63#include <sys/sysctl.h>
64#include <sys/syslog.h>
65#include <sys/vmmeter.h>
66#include <sys/vnode.h>
67
68#include <vm/vm.h>
69#include <vm/vm_object.h>
70#include <vm/vm_extern.h>
71#include <vm/pmap.h>
72#include <vm/vm_map.h>
73#include <vm/vm_page.h>
74#include <vm/uma.h>
75
/* Malloc type described as "Export host address structure". */
static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

/* Forward declarations of helpers that are private to this file. */
static void	addalias(struct vnode *vp, dev_t nvp_rdev);
static void	insmntque(struct vnode *vp, struct mount *mp);
static void	vclean(struct vnode *vp, int flags, struct thread *td);
static void	vlruvp(struct vnode *vp);
static int	flushbuflist(struct buf *blist, int flags, struct vnode *vp,
		    int slpflag, int slptimeo, int *errorp);
static int	vcanrecycle(struct vnode *vp, struct mount **vnmpp);
85
86
/*
 * Number of vnodes in existence.  Increased whenever getnewvnode()
 * allocates a new vnode, never decreased.  Exported read-only as the
 * vfs.numvnodes sysctl.
 */
static unsigned long	numvnodes;

SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");

/*
 * Conversion tables for conversion from vnode types to inode formats
 * and back.
 *
 * NOTE(review): iftovt_tab is presumably indexed by the file-type bits of
 * a mode shifted down to 0..15, and vttoif_tab by enum vtype; the macros
 * that index these tables are not in this file -- confirm there.
 */
enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};
107
/*
 * List of vnodes that are ready for recycling.
 */
static TAILQ_HEAD(freelst, vnode) vnode_free_list;

/*
 * Minimum number of free vnodes.  If there are fewer than this many free
 * vnodes, getnewvnode() will return a newly allocated vnode.
 */
static u_long wantfreevnodes = 25;
SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
/* Number of vnodes in the free list. */
static u_long freevnodes;
SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");

/*
 * Various variables used for debugging the new implementation of
 * reassignbuf().
 * XXX these are probably of (very) limited utility now.
 *
 * nameileafonly is additionally consulted by vcanrecycle() to decide
 * whether namei-cached directory vnodes may be reused.
 */
static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
static int nameileafonly;
SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");
132
/*
 * Cache for the mount type id assigned to NFS.  This is used for
 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
 * Starts out as -1.
 */
int	nfs_mount_type = -1;

/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;

/*
 * Lock for any access to the following:
 *	vnode_free_list
 *	numvnodes
 *	freevnodes
 */
static struct mtx vnode_free_list_mtx;

/*
 * For any iteration/modification of dev->si_hlist (linked through
 * v_specnext)
 */
static struct mtx spechash_mtx;

/* Publicly exported FS */
struct nfs_public nfs_pub;

/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static uma_zone_t vnode_zone;
/*
 * Zone for struct vpollinfo: allocated by v_addpollinfo() and released
 * by getnewvnode() when a vnode carrying one is recycled.
 */
static uma_zone_t vnodepoll_zone;

/* Set to 1 to print out reclaim of active vnodes */
int	prtactive;
165
/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed. To realize this,
 * we append vnodes to a "workitem" queue. When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds. Thus, writes to mounted block
 * devices are delayed only about half the time that file data is delayed.
 * Similarly, directory updates are more critical, so are only delayed
 * about a third the time that file data is delayed. Thus, there are
 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 * one each second (driven off the filesystem syncer process). The
 * syncer_delayno variable indicates the next queue that is to be processed.
 * Items that need to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 * NOTE(review): the default delays below (30/29/28 seconds) do not match
 * the "half" and "a third" ratios described above.
 */
static int syncer_delayno;
static long syncer_mask;
LIST_HEAD(synclist, vnode);
static struct synclist *syncer_workitem_pending;
/*
 * The sync_mtx protects:
 *	vp->v_synclist
 *	syncer_delayno
 *	syncer_workitem_pending
 *	rushjob
 */
static struct mtx sync_mtx;

#define SYNCER_MAXDELAY		32
/*
 * Requested number of queues; vntblinit() replaces it with
 * syncer_mask + 1 once the queue array has been allocated.
 */
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
static int syncdelay = 30;		/* max time to delay syncing data */
static int filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
static int dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
static int metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
static int rushjob;		/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
216
/*
 * Number of vnodes we want to exist at any one time.  This is mostly used
 * to size hash tables in vnode-related code.  It is normally not used in
 * getnewvnode(), as wantfreevnodes is normally nonzero.
 *
 * XXX desiredvnodes is historical cruft and should not exist.
 */
int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
    &desiredvnodes, 0, "Maximum number of vnodes");
static int minvnodes;
SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
    &minvnodes, 0, "Minimum number of vnodes");
static int vnlru_nowhere;
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0,
    "Number of times the vnlru process ran without success");

/* Hook for calling soft updates */
int (*softdep_process_worklist_hook)(struct mount *);

/*
 * This only exists to suppress warnings from unlocked specfs accesses.  It
 * is no longer ok to have an unlocked VFS.  Vnodes of type VCHR or VBAD
 * are exempt from the vnode lock assertions below.
 */
#define IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD)

/* Print lock violations */
int vfs_badlock_print = 1;

/* Panic on violation */
int vfs_badlock_panic = 1;

/* Check for interlock across VOPs */
int vfs_badlock_mutex = 1;
251
252static void
253vfs_badlock(char *msg, char *str, struct vnode *vp)
254{
255	if (vfs_badlock_print)
256		printf("%s: %p %s\n", str, vp, msg);
257	if (vfs_badlock_panic)
258		Debugger("Lock violation.\n");
259}
260
261void
262assert_vi_unlocked(struct vnode *vp, char *str)
263{
264	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
265		vfs_badlock("interlock is locked but should not be", str, vp);
266}
267
268void
269assert_vi_locked(struct vnode *vp, char *str)
270{
271	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
272		vfs_badlock("interlock is not locked but should be", str, vp);
273}
274
275void
276assert_vop_locked(struct vnode *vp, char *str)
277{
278	if (vp && !IGNORE_LOCK(vp) && !VOP_ISLOCKED(vp, NULL))
279		vfs_badlock("is not locked but should be", str, vp);
280}
281
282void
283assert_vop_unlocked(struct vnode *vp, char *str)
284{
285	if (vp && !IGNORE_LOCK(vp) &&
286	    VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE)
287		vfs_badlock("is locked but should not be", str, vp);
288}
289
290void
291assert_vop_elocked(struct vnode *vp, char *str)
292{
293	if (vp && !IGNORE_LOCK(vp) &&
294	    VOP_ISLOCKED(vp, curthread) != LK_EXCLUSIVE)
295		vfs_badlock("is not exclusive locked but should be", str, vp);
296}
297
298void
299assert_vop_elocked_other(struct vnode *vp, char *str)
300{
301	if (vp && !IGNORE_LOCK(vp) &&
302	    VOP_ISLOCKED(vp, curthread) != LK_EXCLOTHER)
303		vfs_badlock("is not exclusive locked by another thread",
304		    str, vp);
305}
306
307void
308assert_vop_slocked(struct vnode *vp, char *str)
309{
310	if (vp && !IGNORE_LOCK(vp) &&
311	    VOP_ISLOCKED(vp, curthread) != LK_SHARED)
312		vfs_badlock("is not locked shared but should be", str, vp);
313}
314
315void
316vop_rename_pre(void *ap)
317{
318	struct vop_rename_args *a = ap;
319
320	if (a->a_tvp)
321		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
322	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
323	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
324	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
325
326	/* Check the source (from) */
327	if (a->a_tdvp != a->a_fdvp)
328		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked.\n");
329	if (a->a_tvp != a->a_fvp)
330		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: tvp locked.\n");
331
332	/* Check the target */
333	if (a->a_tvp)
334		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked.\n");
335
336	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked.\n");
337}
338
339void
340vop_strategy_pre(void *ap)
341{
342	struct vop_strategy_args *a = ap;
343	struct buf *bp;
344
345	bp = a->a_bp;
346
347	/*
348	 * Cluster ops lock their component buffers but not the IO container.
349	 */
350	if ((bp->b_flags & B_CLUSTER) != 0)
351		return;
352
353	if (BUF_REFCNT(bp) < 1) {
354		if (vfs_badlock_print)
355			printf("VOP_STRATEGY: bp is not locked but should be.\n");
356		if (vfs_badlock_panic)
357			Debugger("Lock violation.\n");
358	}
359}
360
361void
362vop_lookup_pre(void *ap)
363{
364	struct vop_lookup_args *a = ap;
365	struct vnode *dvp;
366
367	dvp = a->a_dvp;
368
369	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
370	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
371}
372
373void
374vop_lookup_post(void *ap, int rc)
375{
376	struct vop_lookup_args *a = ap;
377	struct componentname *cnp;
378	struct vnode *dvp;
379	struct vnode *vp;
380	int flags;
381
382	dvp = a->a_dvp;
383	cnp = a->a_cnp;
384	vp = *(a->a_vpp);
385	flags = cnp->cn_flags;
386
387
388	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
389	/*
390	 * If this is the last path component for this lookup and LOCPARENT
391	 * is set, OR if there is an error the directory has to be locked.
392	 */
393	if ((flags & LOCKPARENT) && (flags & ISLASTCN))
394		ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (LOCKPARENT)");
395	else if (rc != 0)
396		ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (error)");
397	else if (dvp != vp)
398		ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (dvp)");
399
400	if (flags & PDIRUNLOCK)
401		ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (PDIRUNLOCK)");
402}
403
404void
405vop_unlock_pre(void *ap)
406{
407	struct vop_unlock_args *a = ap;
408
409	if (a->a_flags & LK_INTERLOCK)
410		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
411
412	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
413}
414
415void
416vop_unlock_post(void *ap, int rc)
417{
418	struct vop_unlock_args *a = ap;
419
420	if (a->a_flags & LK_INTERLOCK)
421		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
422}
423
424void
425vop_lock_pre(void *ap)
426{
427	struct vop_lock_args *a = ap;
428
429	if ((a->a_flags & LK_INTERLOCK) == 0)
430		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
431	else
432		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
433}
434
435void
436vop_lock_post(void *ap, int rc)
437{
438	struct vop_lock_args *a;
439
440	a = ap;
441
442	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
443	if (rc == 0)
444		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
445}
446
447void
448v_addpollinfo(struct vnode *vp)
449{
450	vp->v_pollinfo = uma_zalloc(vnodepoll_zone, M_WAITOK);
451	mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
452}
453
454/*
455 * Initialize the vnode management data structures.
456 */
457static void
458vntblinit(void *dummy __unused)
459{
460
461	desiredvnodes = maxproc + cnt.v_page_count / 4;
462	minvnodes = desiredvnodes / 4;
463	mtx_init(&mountlist_mtx, "mountlist", NULL, MTX_DEF);
464	mtx_init(&mntvnode_mtx, "mntvnode", NULL, MTX_DEF);
465	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
466	mtx_init(&spechash_mtx, "spechash", NULL, MTX_DEF);
467	TAILQ_INIT(&vnode_free_list);
468	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
469	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
470	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
471	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
472	      NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
473	/*
474	 * Initialize the filesystem syncer.
475	 */
476	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
477		&syncer_mask);
478	syncer_maxdelay = syncer_mask + 1;
479	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
480}
481SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)
482
483
484/*
485 * Mark a mount point as busy. Used to synchronize access and to delay
486 * unmounting. Interlock is not released on failure.
487 */
488int
489vfs_busy(mp, flags, interlkp, td)
490	struct mount *mp;
491	int flags;
492	struct mtx *interlkp;
493	struct thread *td;
494{
495	int lkflags;
496
497	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
498		if (flags & LK_NOWAIT)
499			return (ENOENT);
500		mp->mnt_kern_flag |= MNTK_MWAIT;
501		/*
502		 * Since all busy locks are shared except the exclusive
503		 * lock granted when unmounting, the only place that a
504		 * wakeup needs to be done is at the release of the
505		 * exclusive lock at the end of dounmount.
506		 */
507		msleep(mp, interlkp, PVFS, "vfs_busy", 0);
508		return (ENOENT);
509	}
510	lkflags = LK_SHARED | LK_NOPAUSE;
511	if (interlkp)
512		lkflags |= LK_INTERLOCK;
513	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td))
514		panic("vfs_busy: unexpected lock failure");
515	return (0);
516}
517
518/*
519 * Free a busy filesystem.
520 */
521void
522vfs_unbusy(mp, td)
523	struct mount *mp;
524	struct thread *td;
525{
526
527	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
528}
529
530/*
531 * Lookup a mount point by filesystem identifier.
532 */
533struct mount *
534vfs_getvfs(fsid)
535	fsid_t *fsid;
536{
537	register struct mount *mp;
538
539	mtx_lock(&mountlist_mtx);
540	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
541		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
542		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
543			mtx_unlock(&mountlist_mtx);
544			return (mp);
545		}
546	}
547	mtx_unlock(&mountlist_mtx);
548	return ((struct mount *) 0);
549}
550
551/*
552 * Get a new unique fsid.  Try to make its val[0] unique, since this value
553 * will be used to create fake device numbers for stat().  Also try (but
554 * not so hard) make its val[0] unique mod 2^16, since some emulators only
555 * support 16-bit device numbers.  We end up with unique val[0]'s for the
556 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
557 *
558 * Keep in mind that several mounts may be running in parallel.  Starting
559 * the search one past where the previous search terminated is both a
560 * micro-optimization and a defense against returning the same fsid to
561 * different mounts.
562 */
563void
564vfs_getnewfsid(mp)
565	struct mount *mp;
566{
567	static u_int16_t mntid_base;
568	fsid_t tfsid;
569	int mtype;
570
571	mtx_lock(&mntid_mtx);
572	mtype = mp->mnt_vfc->vfc_typenum;
573	tfsid.val[1] = mtype;
574	mtype = (mtype & 0xFF) << 24;
575	for (;;) {
576		tfsid.val[0] = makeudev(255,
577		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
578		mntid_base++;
579		if (vfs_getvfs(&tfsid) == NULL)
580			break;
581	}
582	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
583	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
584	mtx_unlock(&mntid_mtx);
585}
586
/*
 * Knob to control the precision of file timestamps:
 *
 *   0 = seconds only; nanoseconds zeroed.
 *   1 = seconds and nanoseconds, accurate within 1/HZ.
 *   2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 *
 * The enumerators below carry the values 0..3 in that order.  The knob
 * defaults to TSP_SEC and is writable as the vfs.timestamp_precision
 * sysctl.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_SEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "");
600
601/*
602 * Get a current timestamp.
603 */
604void
605vfs_timestamp(tsp)
606	struct timespec *tsp;
607{
608	struct timeval tv;
609
610	switch (timestamp_precision) {
611	case TSP_SEC:
612		tsp->tv_sec = time_second;
613		tsp->tv_nsec = 0;
614		break;
615	case TSP_HZ:
616		getnanotime(tsp);
617		break;
618	case TSP_USEC:
619		microtime(&tv);
620		TIMEVAL_TO_TIMESPEC(&tv, tsp);
621		break;
622	case TSP_NSEC:
623	default:
624		nanotime(tsp);
625		break;
626	}
627}
628
629/*
630 * Set vnode attributes to VNOVAL
631 */
632void
633vattr_null(vap)
634	register struct vattr *vap;
635{
636
637	vap->va_type = VNON;
638	vap->va_size = VNOVAL;
639	vap->va_bytes = VNOVAL;
640	vap->va_mode = VNOVAL;
641	vap->va_nlink = VNOVAL;
642	vap->va_uid = VNOVAL;
643	vap->va_gid = VNOVAL;
644	vap->va_fsid = VNOVAL;
645	vap->va_fileid = VNOVAL;
646	vap->va_blocksize = VNOVAL;
647	vap->va_rdev = VNOVAL;
648	vap->va_atime.tv_sec = VNOVAL;
649	vap->va_atime.tv_nsec = VNOVAL;
650	vap->va_mtime.tv_sec = VNOVAL;
651	vap->va_mtime.tv_nsec = VNOVAL;
652	vap->va_ctime.tv_sec = VNOVAL;
653	vap->va_ctime.tv_nsec = VNOVAL;
654	vap->va_birthtime.tv_sec = VNOVAL;
655	vap->va_birthtime.tv_nsec = VNOVAL;
656	vap->va_flags = VNOVAL;
657	vap->va_gen = VNOVAL;
658	vap->va_vaflags = 0;
659}
660
/*
 * This routine is called when we have too many vnodes.  It attempts
 * to free vnodes from the given mount point and will potentially free
 * vnodes that still have VM backing store (VM backing store is typically
 * the cause of a vnode blowout so we want to do this).  Therefore, this
 * operation is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed.
 * the buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.   It is not
 * desirable to reuse such vnodes.  These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 *
 * At most (mnt_nvnodelistsize / 10 + 1) vnodes of mp are examined per
 * call.  Returns the number of vnodes passed to vgonel().
 */
static int
vlrureclaim(struct mount *mp)
{
	struct vnode *vp;
	int done;
	int trigger;
	int usevnodes;
	int count;

	/*
	 * Calculate the trigger point, don't allow user
	 * screwups to blow us up.   This prevents us from
	 * recycling vnodes with lots of resident pages.  We
	 * aren't trying to free memory, we are trying to
	 * free vnodes.
	 */
	usevnodes = desiredvnodes;
	if (usevnodes <= 0)
		usevnodes = 1;
	trigger = cnt.v_page_count * 2 / usevnodes;

	done = 0;
	mtx_lock(&mntvnode_mtx);
	count = mp->mnt_nvnodelistsize / 10 + 1;
	while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
		/* Rotate the head vnode to the tail so the scan advances. */
		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);

		/* Skip the vnode if its interlock cannot be taken at once. */
		if (vp->v_type != VNON &&
		    vp->v_type != VBAD &&
		    VI_TRYLOCK(vp)) {
			if (VMIGHTFREE(vp) &&           /* critical path opt */
			    (vp->v_object == NULL ||
			    vp->v_object->resident_page_count < trigger)) {
				/*
				 * The list mutex is dropped around vgonel(),
				 * which is entered with the interlock held.
				 */
				mtx_unlock(&mntvnode_mtx);
				vgonel(vp, curthread);
				done++;
				mtx_lock(&mntvnode_mtx);
			} else
				VI_UNLOCK(vp);
		}
		--count;
	}
	mtx_unlock(&mntvnode_mtx);
	return done;
}
722
/*
 * Attempt to recycle vnodes in a context that is always safe to block.
 * Calling vlrureclaim() from the bowels of filesystem code has some
 * interesting deadlock problems.
 */
static struct proc *vnlruproc;
/*
 * Set to 1 by getnewvnode() when it wakes this process; cleared here
 * once the vnode count is back under the threshold.  Also used as the
 * sleep channel on which getnewvnode() waits.
 */
static int vnlruproc_sig;

/*
 * Main loop of the vnlru kernel process.  While the number of in-use
 * vnodes (numvnodes - freevnodes) exceeds 9/10 of desiredvnodes, walk
 * every mount point and call vlrureclaim() on it; otherwise wake any
 * waiters and sleep for up to a second.  Never returns.
 */
static void
vnlru_proc(void)
{
	struct mount *mp, *nmp;
	int s;
	int done;
	struct proc *p = vnlruproc;
	struct thread *td = FIRST_THREAD_IN_PROC(p);	/* XXXKSE */

	mtx_lock(&Giant);

	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
	    SHUTDOWN_PRI_FIRST);

	s = splbio();
	for (;;) {
		kthread_suspend_check(p);
		mtx_lock(&vnode_free_list_mtx);
		if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
			/* Nothing to do: release waiters and nap. */
			mtx_unlock(&vnode_free_list_mtx);
			vnlruproc_sig = 0;
			wakeup(&vnlruproc_sig);
			tsleep(vnlruproc, PVFS, "vlruwt", hz);
			continue;
		}
		mtx_unlock(&vnode_free_list_mtx);
		done = 0;
		mtx_lock(&mountlist_mtx);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			/*
			 * mountlist_mtx is passed as the interlock; it is
			 * still held when vfs_busy() fails.
			 */
			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
				nmp = TAILQ_NEXT(mp, mnt_list);
				continue;
			}
			done += vlrureclaim(mp);
			/* Retake the list mutex before stepping to the next mount. */
			mtx_lock(&mountlist_mtx);
			nmp = TAILQ_NEXT(mp, mnt_list);
			vfs_unbusy(mp, td);
		}
		mtx_unlock(&mountlist_mtx);
		if (done == 0) {
#if 0
			/* These messages are temporary debugging aids */
			if (vnlru_nowhere < 5)
				printf("vnlru process getting nowhere..\n");
			else if (vnlru_nowhere == 5)
				printf("vnlru process messages stopped.\n");
#endif
			/* No progress this pass: count it and back off. */
			vnlru_nowhere++;
			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
		}
	}
	/* Not reached: the loop above has no exit. */
	splx(s);
}
784
/*
 * Descriptor handed to kproc_start() via SYSINIT: process name, entry
 * point, and where to store the resulting process pointer.
 */
static struct kproc_desc vnlru_kp = {
	"vnlru",
	vnlru_proc,
	&vnlruproc
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
791
792
793/*
794 * Routines having to do with the management of the vnode table.
795 */
796
797/*
798 * Check to see if a free vnode can be recycled. If it can,
799 * return it locked with the vn lock, but not interlock. Also
800 * get the vn_start_write lock. Otherwise indicate the error.
801 */
802static int
803vcanrecycle(struct vnode *vp, struct mount **vnmpp)
804{
805	struct thread *td = curthread;
806	vm_object_t object;
807	int error;
808
809	/* Don't recycle if we can't get the interlock */
810	if (!VI_TRYLOCK(vp))
811		return (EWOULDBLOCK);
812
813	/* We should be able to immediately acquire this */
814	/* XXX This looks like it should panic if it fails */
815	if (vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td) != 0) {
816		if (VOP_ISLOCKED(vp, td))
817			panic("vcanrecycle: locked vnode");
818		return (EWOULDBLOCK);
819	}
820
821	/*
822	 * Don't recycle if its filesystem is being suspended.
823	 */
824	if (vn_start_write(vp, vnmpp, V_NOWAIT) != 0) {
825		error = EBUSY;
826		goto done;
827	}
828
829	/*
830	 * Don't recycle if we still have cached pages.
831	 */
832	if (VOP_GETVOBJECT(vp, &object) == 0) {
833		VM_OBJECT_LOCK(object);
834		if (object->resident_page_count ||
835		    object->ref_count) {
836			VM_OBJECT_UNLOCK(object);
837			error = EBUSY;
838			goto done;
839		}
840		VM_OBJECT_UNLOCK(object);
841	}
842	if (LIST_FIRST(&vp->v_cache_src)) {
843		/*
844		 * note: nameileafonly sysctl is temporary,
845		 * for debugging only, and will eventually be
846		 * removed.
847		 */
848		if (nameileafonly > 0) {
849			/*
850			 * Do not reuse namei-cached directory
851			 * vnodes that have cached
852			 * subdirectories.
853			 */
854			if (cache_leaf_test(vp) < 0) {
855				error = EISDIR;
856				goto done;
857			}
858		} else if (nameileafonly < 0 ||
859			    vmiodirenable == 0) {
860			/*
861			 * Do not reuse namei-cached directory
862			 * vnodes if nameileafonly is -1 or
863			 * if VMIO backing for directories is
864			 * turned off (otherwise we reuse them
865			 * too quickly).
866			 */
867			error = EBUSY;
868			goto done;
869		}
870	}
871	return (0);
872done:
873	VOP_UNLOCK(vp, 0, td);
874	return (error);
875}
876
/*
 * Return a vnode for use by a filesystem, either by recycling one from
 * the free list or by allocating a fresh one.
 *
 *	tag	stored in v_tag and used as the name of the vnode lock
 *	mp	mount point whose vnode list the vnode is placed on
 *		(may be NULL, see insmntque())
 *	vops	operations vector stored in v_op
 *	vpp	receives the vnode
 *
 * The vnode is returned with v_usecount of 1, type VNON and no private
 * data.  This implementation always returns 0.
 */
int
getnewvnode(tag, mp, vops, vpp)
	const char *tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	int s;
	struct thread *td = curthread;	/* XXX */
	struct vnode *vp = NULL;
	struct vpollinfo *pollinfo = NULL;
	struct mount *vnmp;

	s = splbio();
	mtx_lock(&vnode_free_list_mtx);

	/*
	 * Try to reuse vnodes if we hit the max.  This situation only
	 * occurs in certain large-memory (2G+) situations.  We cannot
	 * attempt to directly reclaim vnodes due to nasty recursion
	 * problems.
	 */
	while (numvnodes - freevnodes > desiredvnodes) {
		if (vnlruproc_sig == 0) {
			vnlruproc_sig = 1;      /* avoid unnecessary wakeups */
			wakeup(vnlruproc);
		}
		mtx_unlock(&vnode_free_list_mtx);
		tsleep(&vnlruproc_sig, PVFS, "vlruwk", hz);
		mtx_lock(&vnode_free_list_mtx);
	}

	/*
	 * Attempt to reuse a vnode already on the free list, allocating
	 * a new vnode if we can't find one or if we have not reached a
	 * good minimum for good LRU performance.
	 */

	if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
		int error;
		int count;

		/*
		 * NOTE(review): the free list mutex is dropped inside this
		 * loop while freevnodes is not adjusted for the vnode taken
		 * off the list, so TAILQ_FIRST() could presumably return
		 * NULL if other threads empty the list meanwhile -- confirm.
		 */
		for (count = 0; count < freevnodes; count++) {
			vp = TAILQ_FIRST(&vnode_free_list);

			KASSERT(vp->v_usecount == 0 &&
			    (vp->v_iflag & VI_DOINGINACT) == 0,
			    ("getnewvnode: free vnode isn't"));

			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			/*
			 * We have to drop the free list mtx to avoid lock
			 * order reversals with interlock.
			 */
			mtx_unlock(&vnode_free_list_mtx);
			error = vcanrecycle(vp, &vnmp);
			mtx_lock(&vnode_free_list_mtx);
			if (error == 0)
				break;
			/* Not recyclable right now: put it back at the tail. */
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			vp = NULL;
		}
	}
	if (vp) {
		/* Recycle path: vp is locked and off the free list. */
		freevnodes--;
		mtx_unlock(&vnode_free_list_mtx);

		cache_purge(vp);
		VI_LOCK(vp);
		vp->v_iflag |= VI_DOOMED;
		vp->v_iflag &= ~VI_FREE;
		if (vp->v_type != VBAD) {
			VOP_UNLOCK(vp, 0, td);
			vgonel(vp, td);
			/* The interlock is retaken after vgonel(). */
			VI_LOCK(vp);
		} else {
			VOP_UNLOCK(vp, 0, td);
		}
		/* Release the write reference obtained by vcanrecycle(). */
		vn_finished_write(vnmp);

#ifdef INVARIANTS
		{
			if (vp->v_data)
				panic("cleaned vnode isn't");
			if (vp->v_numoutput)
				panic("Clean vnode has pending I/O's");
			if (vp->v_writecount != 0)
				panic("Non-zero write count");
		}
#endif
		if ((pollinfo = vp->v_pollinfo) != NULL) {
			/*
			 * To avoid lock order reversals, the call to
			 * uma_zfree() must be delayed until the vnode
			 * interlock is released.
			 */
			vp->v_pollinfo = NULL;
		}
#ifdef MAC
		mac_destroy_vnode(vp);
#endif
		/* Reset per-use state and give the vnode lock a fresh name. */
		vp->v_iflag = 0;
		vp->v_vflag = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
		lockdestroy(vp->v_vnlock);
		lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOPAUSE);
		KASSERT(vp->v_cleanbufcnt == 0, ("cleanbufcnt not 0"));
		KASSERT(vp->v_cleanblkroot == NULL, ("cleanblkroot not NULL"));
		KASSERT(vp->v_dirtybufcnt == 0, ("dirtybufcnt not 0"));
		KASSERT(vp->v_dirtyblkroot == NULL, ("dirtyblkroot not NULL"));
	} else {
		/* Allocation path: account for and build a brand new vnode. */
		numvnodes++;
		mtx_unlock(&vnode_free_list_mtx);

		vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
		mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
		VI_LOCK(vp);
		vp->v_dd = vp;
		vp->v_vnlock = &vp->v_lock;
		lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOPAUSE);
		cache_purge(vp);
		LIST_INIT(&vp->v_cache_src);
		TAILQ_INIT(&vp->v_cache_dst);
	}

	/* Common initialization; the interlock is held on both paths. */
	TAILQ_INIT(&vp->v_cleanblkhd);
	TAILQ_INIT(&vp->v_dirtyblkhd);
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	vp->v_cachedid = -1;
	VI_UNLOCK(vp);
	/* Deferred release of a recycled vnode's poll info (see above). */
	if (pollinfo != NULL) {
		mtx_destroy(&pollinfo->vpi_lock);
		uma_zfree(vnodepoll_zone, pollinfo);
	}
#ifdef MAC
	mac_init_vnode(vp);
	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
		mac_associate_vnode_singlelabel(mp, vp);
#endif
	insmntque(vp, mp);

	return (0);
}
1032
1033/*
1034 * Move a vnode from one mount queue to another.
1035 */
1036static void
1037insmntque(vp, mp)
1038	register struct vnode *vp;
1039	register struct mount *mp;
1040{
1041
1042	mtx_lock(&mntvnode_mtx);
1043	/*
1044	 * Delete from old mount point vnode list, if on one.
1045	 */
1046	if (vp->v_mount != NULL) {
1047		KASSERT(vp->v_mount->mnt_nvnodelistsize > 0,
1048			("bad mount point vnode list size"));
1049		TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes);
1050		vp->v_mount->mnt_nvnodelistsize--;
1051	}
1052	/*
1053	 * Insert into list of vnodes for the new mount point, if available.
1054	 */
1055	if ((vp->v_mount = mp) == NULL) {
1056		mtx_unlock(&mntvnode_mtx);
1057		return;
1058	}
1059	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1060	mp->mnt_nvnodelistsize++;
1061	mtx_unlock(&mntvnode_mtx);
1062}
1063
1064/*
1065 * Update outstanding I/O count and do wakeup if requested.
1066 */
1067void
1068vwakeup(bp)
1069	register struct buf *bp;
1070{
1071	register struct vnode *vp;
1072
1073	bp->b_flags &= ~B_WRITEINPROG;
1074	if ((vp = bp->b_vp)) {
1075		VI_LOCK(vp);
1076		vp->v_numoutput--;
1077		if (vp->v_numoutput < 0)
1078			panic("vwakeup: neg numoutput");
1079		if ((vp->v_numoutput == 0) && (vp->v_iflag & VI_BWAIT)) {
1080			vp->v_iflag &= ~VI_BWAIT;
1081			wakeup(&vp->v_numoutput);
1082		}
1083		VI_UNLOCK(vp);
1084	}
1085}
1086
/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 *
 * vp		vnode whose buffers are flushed; its vnode lock is asserted.
 * flags	V_SAVE: wait for pending writes and VOP_FSYNC() dirty
 *		buffers before invalidating.  V_NORMAL / V_ALT are passed
 *		through to flushbuflist() to restrict which buffers are
 *		touched.
 * cred, td	handed to VOP_FSYNC() in the V_SAVE case.
 * slpflag, slptimeo
 *		sleep priority modifier and timeout used while waiting.
 *
 * Returns 0 on success, or the error reported by msleep(), VOP_FSYNC()
 * or flushbuflist().  The vnode interlock is taken and released
 * internally; it is not held on return.
 */
int
vinvalbuf(vp, flags, cred, td, slpflag, slptimeo)
	struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct thread *td;
	int slpflag, slptimeo;
{
	struct buf *blist;
	int s, error;
	vm_object_t object;

	GIANT_REQUIRED;

	ASSERT_VOP_LOCKED(vp, "vinvalbuf");

	VI_LOCK(vp);
	if (flags & V_SAVE) {
		s = splbio();
		/* Let writes that are already in flight finish first. */
		while (vp->v_numoutput) {
			vp->v_iflag |= VI_BWAIT;
			error = msleep(&vp->v_numoutput, VI_MTX(vp),
			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
			if (error) {
				VI_UNLOCK(vp);
				splx(s);
				return (error);
			}
		}
		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
			/*
			 * The interlock is dropped around VOP_FSYNC() and
			 * retaken afterwards for the sanity check below.
			 */
			splx(s);
			VI_UNLOCK(vp);
			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, td)) != 0)
				return (error);
			/*
			 * XXX We could save a lock/unlock if this was only
			 * enabled under INVARIANTS
			 */
			VI_LOCK(vp);
			s = splbio();
			if (vp->v_numoutput > 0 ||
			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
				panic("vinvalbuf: dirty bufs");
		}
		splx(s);
	}
	s = splbio();
	/*
	 * If you alter this loop please notice that interlock is dropped and
	 * reacquired in flushbuflist.  Special care is needed to ensure that
	 * no race conditions occur from this.
	 *
	 * Each pass flushes the clean list and then the dirty list; the loop
	 * restarts whenever flushbuflist() reports that it touched a buffer
	 * and ends when neither list yields work or an error is reported.
	 */
	for (error = 0;;) {
		if ((blist = TAILQ_FIRST(&vp->v_cleanblkhd)) != 0 &&
		    flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
			if (error)
				break;
			continue;
		}
		if ((blist = TAILQ_FIRST(&vp->v_dirtyblkhd)) != 0 &&
		    flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
			if (error)
				break;
			continue;
		}
		break;
	}
	if (error) {
		splx(s);
		VI_UNLOCK(vp);
		return (error);
	}

	/*
	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
	 * have write I/O in-progress but if there is a VM object then the
	 * VM object can also have read-I/O in-progress.
	 *
	 * The interlock is released while waiting on the VM object, so the
	 * output count is rechecked after it is retaken.
	 */
	do {
		while (vp->v_numoutput > 0) {
			vp->v_iflag |= VI_BWAIT;
			msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vnvlbv", 0);
		}
		VI_UNLOCK(vp);
		if (VOP_GETVOBJECT(vp, &object) == 0) {
			VM_OBJECT_LOCK(object);
			vm_object_pip_wait(object, "vnvlbx");
			VM_OBJECT_UNLOCK(object);
		}
		VI_LOCK(vp);
	} while (vp->v_numoutput > 0);
	VI_UNLOCK(vp);

	splx(s);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	if (VOP_GETVOBJECT(vp, &object) == 0) {
		VM_OBJECT_LOCK(object);
		vm_object_page_remove(object, 0, 0,
			(flags & V_SAVE) ? TRUE : FALSE);
		VM_OBJECT_UNLOCK(object);
	}

#ifdef INVARIANTS
	/*
	 * When neither V_ALT nor V_NORMAL restricted the flush, both buffer
	 * lists must be empty by now.
	 */
	VI_LOCK(vp);
	if ((flags & (V_ALT | V_NORMAL)) == 0 &&
	    (!TAILQ_EMPTY(&vp->v_dirtyblkhd) ||
	     !TAILQ_EMPTY(&vp->v_cleanblkhd)))
		panic("vinvalbuf: flush failed");
	VI_UNLOCK(vp);
#endif
	return (0);
}
1206
1207/*
1208 * Flush out buffers on the specified list.
1209 *
1210 */
1211static int
1212flushbuflist(blist, flags, vp, slpflag, slptimeo, errorp)
1213	struct buf *blist;
1214	int flags;
1215	struct vnode *vp;
1216	int slpflag, slptimeo;
1217	int *errorp;
1218{
1219	struct buf *bp, *nbp;
1220	int found, error;
1221
1222	ASSERT_VI_LOCKED(vp, "flushbuflist");
1223
1224	for (found = 0, bp = blist; bp; bp = nbp) {
1225		nbp = TAILQ_NEXT(bp, b_vnbufs);
1226		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1227		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1228			continue;
1229		}
1230		found += 1;
1231		error = BUF_TIMELOCK(bp,
1232		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp),
1233		    "flushbuf", slpflag, slptimeo);
1234		if (error) {
1235			if (error != ENOLCK)
1236				*errorp = error;
1237			goto done;
1238		}
1239		/*
1240		 * XXX Since there are no node locks for NFS, I
1241		 * believe there is a slight chance that a delayed
1242		 * write will occur while sleeping just above, so
1243		 * check for it.  Note that vfs_bio_awrite expects
1244		 * buffers to reside on a queue, while BUF_WRITE and
1245		 * brelse do not.
1246		 */
1247		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1248			(flags & V_SAVE)) {
1249
1250			if (bp->b_vp == vp) {
1251				if (bp->b_flags & B_CLUSTEROK) {
1252					vfs_bio_awrite(bp);
1253				} else {
1254					bremfree(bp);
1255					bp->b_flags |= B_ASYNC;
1256					BUF_WRITE(bp);
1257				}
1258			} else {
1259				bremfree(bp);
1260				(void) BUF_WRITE(bp);
1261			}
1262			goto done;
1263		}
1264		bremfree(bp);
1265		bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
1266		bp->b_flags &= ~B_ASYNC;
1267		brelse(bp);
1268		VI_LOCK(vp);
1269	}
1270	return (found);
1271done:
1272	VI_LOCK(vp);
1273	return (found);
1274}
1275
1276/*
1277 * Truncate a file's buffer and pages to a specified length.  This
1278 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1279 * sync activity.
1280 */
1281int
1282vtruncbuf(vp, cred, td, length, blksize)
1283	register struct vnode *vp;
1284	struct ucred *cred;
1285	struct thread *td;
1286	off_t length;
1287	int blksize;
1288{
1289	register struct buf *bp;
1290	struct buf *nbp;
1291	int s, anyfreed;
1292	int trunclbn;
1293
1294	/*
1295	 * Round up to the *next* lbn.
1296	 */
1297	trunclbn = (length + blksize - 1) / blksize;
1298
1299	s = splbio();
1300	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1301restart:
1302	VI_LOCK(vp);
1303	anyfreed = 1;
1304	for (;anyfreed;) {
1305		anyfreed = 0;
1306		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
1307			nbp = TAILQ_NEXT(bp, b_vnbufs);
1308			if (bp->b_lblkno >= trunclbn) {
1309				if (BUF_LOCK(bp,
1310				    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1311				    VI_MTX(vp)) == ENOLCK)
1312					goto restart;
1313
1314				bremfree(bp);
1315				bp->b_flags |= (B_INVAL | B_RELBUF);
1316				bp->b_flags &= ~B_ASYNC;
1317				brelse(bp);
1318				anyfreed = 1;
1319
1320				if (nbp &&
1321				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1322				    (nbp->b_vp != vp) ||
1323				    (nbp->b_flags & B_DELWRI))) {
1324					goto restart;
1325				}
1326				VI_LOCK(vp);
1327			}
1328		}
1329
1330		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1331			nbp = TAILQ_NEXT(bp, b_vnbufs);
1332			if (bp->b_lblkno >= trunclbn) {
1333				if (BUF_LOCK(bp,
1334				    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1335				    VI_MTX(vp)) == ENOLCK)
1336					goto restart;
1337				bremfree(bp);
1338				bp->b_flags |= (B_INVAL | B_RELBUF);
1339				bp->b_flags &= ~B_ASYNC;
1340				brelse(bp);
1341				anyfreed = 1;
1342				if (nbp &&
1343				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1344				    (nbp->b_vp != vp) ||
1345				    (nbp->b_flags & B_DELWRI) == 0)) {
1346					goto restart;
1347				}
1348				VI_LOCK(vp);
1349			}
1350		}
1351	}
1352
1353	if (length > 0) {
1354restartsync:
1355		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1356			nbp = TAILQ_NEXT(bp, b_vnbufs);
1357			if (bp->b_lblkno > 0)
1358				continue;
1359			/*
1360			 * Since we hold the vnode lock this should only
1361			 * fail if we're racing with the buf daemon.
1362			 */
1363			if (BUF_LOCK(bp,
1364			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1365			    VI_MTX(vp)) == ENOLCK) {
1366				goto restart;
1367			}
1368			KASSERT((bp->b_flags & B_DELWRI),
1369			    ("buf(%p) on dirty queue without DELWRI.", bp));
1370
1371			bremfree(bp);
1372			bawrite(bp);
1373			VI_LOCK(vp);
1374			goto restartsync;
1375		}
1376	}
1377
1378	while (vp->v_numoutput > 0) {
1379		vp->v_iflag |= VI_BWAIT;
1380		msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vbtrunc", 0);
1381	}
1382	VI_UNLOCK(vp);
1383	splx(s);
1384
1385	vnode_pager_setsize(vp, length);
1386
1387	return (0);
1388}
1389
/*
 * buf_splay() - splay tree core for the clean/dirty list of buffers in
 * 		 a vnode.
 *
 *	NOTE: We have to deal with the special case of a background bitmap
 *	buffer, a situation where two buffers will have the same logical
 *	block offset.  We want (1) only the foreground buffer to be accessed
 *	in a lookup and (2) must differentiate between the foreground and
 *	background buffer in the splay tree algorithm because the splay
 *	tree cannot normally handle multiple entities with the same 'index'.
 *	We accomplish this by adding differentiating flags to the splay tree's
 *	numerical domain.
 *
 *	The key is the pair (lblkno, xflags & BX_BKGRDMARKER).  The tree
 *	rooted at 'root' is restructured so that the node at which the
 *	search stopped becomes the root, and that node is returned.  It is
 *	an exact match only if its key equals the search key; callers must
 *	check.  Returns NULL when 'root' is NULL.
 */
static
struct buf *
buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
{
	/*
	 * 'dummy' is a scratch header: its b_right collects the left
	 * subtree and its b_left collects the right subtree while the
	 * search descends.
	 */
	struct buf dummy;
	struct buf *lefttreemax, *righttreemin, *y;

	if (root == NULL)
		return (NULL);
	lefttreemax = righttreemin = &dummy;
	for (;;) {
		if (lblkno < root->b_lblkno ||
		    (lblkno == root->b_lblkno &&
		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
			/* Key sorts before the current root: go left. */
			if ((y = root->b_left) == NULL)
				break;
			if (lblkno < y->b_lblkno) {
				/* Rotate right. */
				root->b_left = y->b_right;
				y->b_right = root;
				root = y;
				if ((y = root->b_left) == NULL)
					break;
			}
			/* Link into the new root's right tree. */
			righttreemin->b_left = root;
			righttreemin = root;
		} else if (lblkno > root->b_lblkno ||
		    (lblkno == root->b_lblkno &&
		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
			/* Key sorts after the current root: go right. */
			if ((y = root->b_right) == NULL)
				break;
			if (lblkno > y->b_lblkno) {
				/* Rotate left. */
				root->b_right = y->b_left;
				y->b_left = root;
				root = y;
				if ((y = root->b_right) == NULL)
					break;
			}
			/* Link into the new root's left tree. */
			lefttreemax->b_right = root;
			lefttreemax = root;
		} else {
			/* Exact key match. */
			break;
		}
		root = y;
	}
	/* Assemble the new root. */
	lefttreemax->b_right = root->b_left;
	righttreemin->b_left = root->b_right;
	root->b_left = dummy.b_right;
	root->b_right = dummy.b_left;
	return (root);
}
1458
1459static
1460void
1461buf_vlist_remove(struct buf *bp)
1462{
1463	struct vnode *vp = bp->b_vp;
1464	struct buf *root;
1465
1466	ASSERT_VI_LOCKED(vp, "buf_vlist_remove");
1467	if (bp->b_xflags & BX_VNDIRTY) {
1468		if (bp != vp->v_dirtyblkroot) {
1469			root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_dirtyblkroot);
1470			KASSERT(root == bp, ("splay lookup failed during dirty remove"));
1471		}
1472		if (bp->b_left == NULL) {
1473			root = bp->b_right;
1474		} else {
1475			root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
1476			root->b_right = bp->b_right;
1477		}
1478		vp->v_dirtyblkroot = root;
1479		TAILQ_REMOVE(&vp->v_dirtyblkhd, bp, b_vnbufs);
1480		vp->v_dirtybufcnt--;
1481	} else {
1482		/* KASSERT(bp->b_xflags & BX_VNCLEAN, ("bp wasn't clean")); */
1483		if (bp != vp->v_cleanblkroot) {
1484			root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_cleanblkroot);
1485			KASSERT(root == bp, ("splay lookup failed during clean remove"));
1486		}
1487		if (bp->b_left == NULL) {
1488			root = bp->b_right;
1489		} else {
1490			root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
1491			root->b_right = bp->b_right;
1492		}
1493		vp->v_cleanblkroot = root;
1494		TAILQ_REMOVE(&vp->v_cleanblkhd, bp, b_vnbufs);
1495		vp->v_cleanbufcnt--;
1496	}
1497	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1498}
1499
1500/*
1501 * Add the buffer to the sorted clean or dirty block list using a
1502 * splay tree algorithm.
1503 *
1504 * NOTE: xflags is passed as a constant, optimizing this inline function!
1505 */
1506static
1507void
1508buf_vlist_add(struct buf *bp, struct vnode *vp, b_xflags_t xflags)
1509{
1510	struct buf *root;
1511
1512	ASSERT_VI_LOCKED(vp, "buf_vlist_add");
1513	bp->b_xflags |= xflags;
1514	if (xflags & BX_VNDIRTY) {
1515		root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_dirtyblkroot);
1516		if (root == NULL) {
1517			bp->b_left = NULL;
1518			bp->b_right = NULL;
1519			TAILQ_INSERT_TAIL(&vp->v_dirtyblkhd, bp, b_vnbufs);
1520		} else if (bp->b_lblkno < root->b_lblkno ||
1521		    (bp->b_lblkno == root->b_lblkno &&
1522		    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1523			bp->b_left = root->b_left;
1524			bp->b_right = root;
1525			root->b_left = NULL;
1526			TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
1527		} else {
1528			bp->b_right = root->b_right;
1529			bp->b_left = root;
1530			root->b_right = NULL;
1531			TAILQ_INSERT_AFTER(&vp->v_dirtyblkhd,
1532			    root, bp, b_vnbufs);
1533		}
1534		vp->v_dirtybufcnt++;
1535		vp->v_dirtyblkroot = bp;
1536	} else {
1537		/* KASSERT(xflags & BX_VNCLEAN, ("xflags not clean")); */
1538		root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_cleanblkroot);
1539		if (root == NULL) {
1540			bp->b_left = NULL;
1541			bp->b_right = NULL;
1542			TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
1543		} else if (bp->b_lblkno < root->b_lblkno ||
1544		    (bp->b_lblkno == root->b_lblkno &&
1545		    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1546			bp->b_left = root->b_left;
1547			bp->b_right = root;
1548			root->b_left = NULL;
1549			TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
1550		} else {
1551			bp->b_right = root->b_right;
1552			bp->b_left = root;
1553			root->b_right = NULL;
1554			TAILQ_INSERT_AFTER(&vp->v_cleanblkhd,
1555			    root, bp, b_vnbufs);
1556		}
1557		vp->v_cleanbufcnt++;
1558		vp->v_cleanblkroot = bp;
1559	}
1560}
1561
1562/*
1563 * Lookup a buffer using the splay tree.  Note that we specifically avoid
1564 * shadow buffers used in background bitmap writes.
1565 *
1566 * This code isn't quite efficient as it could be because we are maintaining
1567 * two sorted lists and do not know which list the block resides in.
1568 */
1569struct buf *
1570gbincore(struct vnode *vp, daddr_t lblkno)
1571{
1572	struct buf *bp;
1573
1574	GIANT_REQUIRED;
1575
1576	ASSERT_VI_LOCKED(vp, "gbincore");
1577	bp = vp->v_cleanblkroot = buf_splay(lblkno, 0, vp->v_cleanblkroot);
1578	if (bp && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1579		return(bp);
1580	bp = vp->v_dirtyblkroot = buf_splay(lblkno, 0, vp->v_dirtyblkroot);
1581	if (bp && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1582		return(bp);
1583	return(NULL);
1584}
1585
1586/*
1587 * Associate a buffer with a vnode.
1588 */
1589void
1590bgetvp(vp, bp)
1591	register struct vnode *vp;
1592	register struct buf *bp;
1593{
1594	int s;
1595
1596	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
1597
1598	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1599	    ("bgetvp: bp already attached! %p", bp));
1600
1601	ASSERT_VI_LOCKED(vp, "bgetvp");
1602	vholdl(vp);
1603	bp->b_vp = vp;
1604	bp->b_dev = vn_todev(vp);
1605	/*
1606	 * Insert onto list for new vnode.
1607	 */
1608	s = splbio();
1609	buf_vlist_add(bp, vp, BX_VNCLEAN);
1610	splx(s);
1611}
1612
1613/*
1614 * Disassociate a buffer from a vnode.
1615 */
1616void
1617brelvp(bp)
1618	register struct buf *bp;
1619{
1620	struct vnode *vp;
1621	int s;
1622
1623	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1624
1625	/*
1626	 * Delete from old vnode list, if on one.
1627	 */
1628	vp = bp->b_vp;
1629	s = splbio();
1630	VI_LOCK(vp);
1631	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1632		buf_vlist_remove(bp);
1633	if ((vp->v_iflag & VI_ONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
1634		vp->v_iflag &= ~VI_ONWORKLST;
1635		mtx_lock(&sync_mtx);
1636		LIST_REMOVE(vp, v_synclist);
1637		mtx_unlock(&sync_mtx);
1638	}
1639	vdropl(vp);
1640	VI_UNLOCK(vp);
1641	bp->b_vp = (struct vnode *) 0;
1642	if (bp->b_object)
1643		bp->b_object = NULL;
1644	splx(s);
1645}
1646
1647/*
1648 * Add an item to the syncer work queue.
1649 */
1650static void
1651vn_syncer_add_to_worklist(struct vnode *vp, int delay)
1652{
1653	int s, slot;
1654
1655	s = splbio();
1656	ASSERT_VI_LOCKED(vp, "vn_syncer_add_to_worklist");
1657
1658	mtx_lock(&sync_mtx);
1659	if (vp->v_iflag & VI_ONWORKLST)
1660		LIST_REMOVE(vp, v_synclist);
1661	else
1662		vp->v_iflag |= VI_ONWORKLST;
1663
1664	if (delay > syncer_maxdelay - 2)
1665		delay = syncer_maxdelay - 2;
1666	slot = (syncer_delayno + delay) & syncer_mask;
1667
1668	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
1669	mtx_unlock(&sync_mtx);
1670
1671	splx(s);
1672}
1673
/*
 * Start-up glue for the filesystem syncer.  The descriptor below names
 * the kernel process "syncer", gives sched_sync() as its entry point and
 * passes &updateproc (presumably so kproc_start() can record the new
 * process there -- sched_sync() and speedup_syncer() read it).  SYSINIT
 * hands the descriptor to kproc_start() at SI_SUB_KTHREAD_UPDATE.
 */
struct  proc *updateproc;
static void sched_sync(void);
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
1682
/*
 * System filesystem synchronizer daemon.
 *
 * Runs forever with Giant held.  Each iteration takes the worklist slot
 * indexed by syncer_delayno, advances that index (wrapping at
 * syncer_maxdelay), and lazily fsyncs every vnode found on the slot.
 * A vnode still at the head of the slot after the attempt is requeued
 * 'syncdelay' slots later.  The loop then runs the soft updates hook,
 * honours any pending rushjob requests, and otherwise sleeps on lbolt if
 * the whole pass took less than a second.
 */
static void
sched_sync(void)
{
	struct synclist *slp;
	struct vnode *vp;
	struct mount *mp;
	long starttime;
	int s;
	struct thread *td = FIRST_THREAD_IN_PROC(updateproc);  /* XXXKSE */

	mtx_lock(&Giant);

	/* Register kproc_shutdown for this process at shutdown_pre_sync. */
	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, td->td_proc,
	    SHUTDOWN_PRI_LAST);

	for (;;) {
		kthread_suspend_check(td->td_proc);

		starttime = time_second;

		/*
		 * Push files whose dirty time has expired.  Be careful
		 * of interrupt race on slp queue.
		 */
		s = splbio();
		mtx_lock(&sync_mtx);
		slp = &syncer_workitem_pending[syncer_delayno];
		syncer_delayno += 1;
		if (syncer_delayno == syncer_maxdelay)
			syncer_delayno = 0;
		splx(s);

		/*
		 * sync_mtx is held at the top of every iteration of this
		 * loop and when the loop exits; it is dropped while the
		 * vnode is being synced.
		 */
		while ((vp = LIST_FIRST(slp)) != NULL) {
			mtx_unlock(&sync_mtx);
			/*
			 * Only attempt the sync when the vnode is not
			 * locked and a write can start without waiting.
			 */
			if (VOP_ISLOCKED(vp, NULL) == 0 &&
			    vn_start_write(vp, &mp, V_NOWAIT) == 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
				(void) VOP_FSYNC(vp, td->td_ucred, MNT_LAZY, td);
				VOP_UNLOCK(vp, 0, td);
				vn_finished_write(mp);
			}
			s = splbio();
			mtx_lock(&sync_mtx);
			if (LIST_FIRST(slp) == vp) {
				/*
				 * The vnode did not leave the slot.  sync_mtx
				 * is released here because
				 * vn_syncer_add_to_worklist() takes it itself.
				 */
				mtx_unlock(&sync_mtx);
				/*
				 * Note: VFS vnodes can remain on the
				 * worklist too with no dirty blocks, but
				 * since sync_fsync() moves it to a different
				 * slot we are safe.
				 */
				VI_LOCK(vp);
				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
				    !vn_isdisk(vp, NULL)) {
					panic("sched_sync: fsync failed "
					      "vp %p tag %s", vp, vp->v_tag);
				}
				/*
				 * Put us back on the worklist.  The worklist
				 * routine will remove us from our current
				 * position and then add us back in at a later
				 * position.
				 */
				vn_syncer_add_to_worklist(vp, syncdelay);
				VI_UNLOCK(vp);
				mtx_lock(&sync_mtx);
			}
			splx(s);
		}
		mtx_unlock(&sync_mtx);

		/*
		 * Do soft update processing.
		 */
		if (softdep_process_worklist_hook != NULL)
			(*softdep_process_worklist_hook)(NULL);

		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process. A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP. Currently rushjob
		 * is used by the soft update code to speed up the filesystem
		 * syncer process when the incore state is getting so far
		 * ahead of the disk that the kernel memory pool is being
		 * threatened with exhaustion.
		 */
		mtx_lock(&sync_mtx);
		if (rushjob > 0) {
			rushjob -= 1;
			mtx_unlock(&sync_mtx);
			continue;
		}
		mtx_unlock(&sync_mtx);
		/*
		 * If it has taken us less than a second to process the
		 * current work, then wait. Otherwise start right over
		 * again. We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (time_second == starttime)
			tsleep(&lbolt, PPAUSE, "syncer", 0);
	}
}
1792
1793/*
1794 * Request the syncer daemon to speed up its work.
1795 * We never push it to speed up more than half of its
1796 * normal turn time, otherwise it could take over the cpu.
1797 * XXXKSE  only one update?
1798 */
1799int
1800speedup_syncer()
1801{
1802	struct thread *td;
1803	int ret = 0;
1804
1805	td = FIRST_THREAD_IN_PROC(updateproc);
1806	mtx_lock_spin(&sched_lock);
1807	if (td->td_wchan == &lbolt) {
1808		unsleep(td);
1809		TD_CLR_SLEEPING(td);
1810		setrunnable(td);
1811	}
1812	mtx_unlock_spin(&sched_lock);
1813	mtx_lock(&sync_mtx);
1814	if (rushjob < syncdelay / 2) {
1815		rushjob += 1;
1816		stat_rush_requests += 1;
1817		ret = 1;
1818	}
1819	mtx_unlock(&sync_mtx);
1820	return (ret);
1821}
1822
1823/*
1824 * Associate a p-buffer with a vnode.
1825 *
1826 * Also sets B_PAGING flag to indicate that vnode is not fully associated
1827 * with the buffer.  i.e. the bp has not been linked into the vnode or
1828 * ref-counted.
1829 */
1830void
1831pbgetvp(vp, bp)
1832	register struct vnode *vp;
1833	register struct buf *bp;
1834{
1835
1836	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1837
1838	bp->b_vp = vp;
1839	bp->b_flags |= B_PAGING;
1840	bp->b_dev = vn_todev(vp);
1841}
1842
1843/*
1844 * Disassociate a p-buffer from a vnode.
1845 */
1846void
1847pbrelvp(bp)
1848	register struct buf *bp;
1849{
1850
1851	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1852
1853	/* XXX REMOVE ME */
1854	VI_LOCK(bp->b_vp);
1855	if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
1856		panic(
1857		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1858		    bp,
1859		    (int)bp->b_flags
1860		);
1861	}
1862	VI_UNLOCK(bp->b_vp);
1863	bp->b_vp = (struct vnode *) 0;
1864	bp->b_flags &= ~B_PAGING;
1865}
1866
1867/*
1868 * Reassign a buffer from one vnode to another.
1869 * Used to assign file specific control information
1870 * (indirect blocks) to the vnode to which they belong.
1871 */
1872void
1873reassignbuf(bp, newvp)
1874	register struct buf *bp;
1875	register struct vnode *newvp;
1876{
1877	int delay;
1878	int s;
1879
1880	if (newvp == NULL) {
1881		printf("reassignbuf: NULL");
1882		return;
1883	}
1884	++reassignbufcalls;
1885
1886	/*
1887	 * B_PAGING flagged buffers cannot be reassigned because their vp
1888	 * is not fully linked in.
1889	 */
1890	if (bp->b_flags & B_PAGING)
1891		panic("cannot reassign paging buffer");
1892
1893	s = splbio();
1894	/*
1895	 * Delete from old vnode list, if on one.
1896	 */
1897	VI_LOCK(bp->b_vp);
1898	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
1899		buf_vlist_remove(bp);
1900		if (bp->b_vp != newvp) {
1901			vdropl(bp->b_vp);
1902			bp->b_vp = NULL;	/* for clarification */
1903		}
1904	}
1905	VI_UNLOCK(bp->b_vp);
1906	/*
1907	 * If dirty, put on list of dirty buffers; otherwise insert onto list
1908	 * of clean buffers.
1909	 */
1910	VI_LOCK(newvp);
1911	if (bp->b_flags & B_DELWRI) {
1912		if ((newvp->v_iflag & VI_ONWORKLST) == 0) {
1913			switch (newvp->v_type) {
1914			case VDIR:
1915				delay = dirdelay;
1916				break;
1917			case VCHR:
1918				if (newvp->v_rdev->si_mountpoint != NULL) {
1919					delay = metadelay;
1920					break;
1921				}
1922				/* FALLTHROUGH */
1923			default:
1924				delay = filedelay;
1925			}
1926			vn_syncer_add_to_worklist(newvp, delay);
1927		}
1928		buf_vlist_add(bp, newvp, BX_VNDIRTY);
1929	} else {
1930		buf_vlist_add(bp, newvp, BX_VNCLEAN);
1931
1932		if ((newvp->v_iflag & VI_ONWORKLST) &&
1933		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1934			mtx_lock(&sync_mtx);
1935			LIST_REMOVE(newvp, v_synclist);
1936			mtx_unlock(&sync_mtx);
1937			newvp->v_iflag &= ~VI_ONWORKLST;
1938		}
1939	}
1940	if (bp->b_vp != newvp) {
1941		bp->b_vp = newvp;
1942		vholdl(bp->b_vp);
1943	}
1944	VI_UNLOCK(newvp);
1945	splx(s);
1946}
1947
1948/*
1949 * Create a vnode for a device.
1950 * Used for mounting the root filesystem.
1951 */
1952int
1953bdevvp(dev, vpp)
1954	dev_t dev;
1955	struct vnode **vpp;
1956{
1957	register struct vnode *vp;
1958	struct vnode *nvp;
1959	int error;
1960
1961	if (dev == NODEV) {
1962		*vpp = NULLVP;
1963		return (ENXIO);
1964	}
1965	if (vfinddev(dev, VCHR, vpp))
1966		return (0);
1967	error = getnewvnode("none", (struct mount *)0, spec_vnodeop_p, &nvp);
1968	if (error) {
1969		*vpp = NULLVP;
1970		return (error);
1971	}
1972	vp = nvp;
1973	vp->v_type = VCHR;
1974	addalias(vp, dev);
1975	*vpp = vp;
1976	return (0);
1977}
1978
1979static void
1980v_incr_usecount(struct vnode *vp, int delta)
1981{
1982	vp->v_usecount += delta;
1983	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
1984		mtx_lock(&spechash_mtx);
1985		vp->v_rdev->si_usecount += delta;
1986		mtx_unlock(&spechash_mtx);
1987	}
1988}
1989
1990/*
1991 * Add vnode to the alias list hung off the dev_t.
1992 *
1993 * The reason for this gunk is that multiple vnodes can reference
1994 * the same physical device, so checking vp->v_usecount to see
1995 * how many users there are is inadequate; the v_usecount for
1996 * the vnodes need to be accumulated.  vcount() does that.
1997 */
1998struct vnode *
1999addaliasu(nvp, nvp_rdev)
2000	struct vnode *nvp;
2001	udev_t nvp_rdev;
2002{
2003	struct vnode *ovp;
2004	vop_t **ops;
2005	dev_t dev;
2006
2007	if (nvp->v_type == VBLK)
2008		return (nvp);
2009	if (nvp->v_type != VCHR)
2010		panic("addaliasu on non-special vnode");
2011	dev = udev2dev(nvp_rdev, 0);
2012	/*
2013	 * Check to see if we have a bdevvp vnode with no associated
2014	 * filesystem. If so, we want to associate the filesystem of
2015	 * the new newly instigated vnode with the bdevvp vnode and
2016	 * discard the newly created vnode rather than leaving the
2017	 * bdevvp vnode lying around with no associated filesystem.
2018	 */
2019	if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
2020		addalias(nvp, dev);
2021		return (nvp);
2022	}
2023	/*
2024	 * Discard unneeded vnode, but save its node specific data.
2025	 * Note that if there is a lock, it is carried over in the
2026	 * node specific data to the replacement vnode.
2027	 */
2028	vref(ovp);
2029	ovp->v_data = nvp->v_data;
2030	ovp->v_tag = nvp->v_tag;
2031	nvp->v_data = NULL;
2032	lockdestroy(ovp->v_vnlock);
2033	lockinit(ovp->v_vnlock, PVFS, nvp->v_vnlock->lk_wmesg,
2034	    nvp->v_vnlock->lk_timo, nvp->v_vnlock->lk_flags & LK_EXTFLG_MASK);
2035	ops = ovp->v_op;
2036	ovp->v_op = nvp->v_op;
2037	if (VOP_ISLOCKED(nvp, curthread)) {
2038		VOP_UNLOCK(nvp, 0, curthread);
2039		vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curthread);
2040	}
2041	nvp->v_op = ops;
2042	insmntque(ovp, nvp->v_mount);
2043	vrele(nvp);
2044	vgone(nvp);
2045	return (ovp);
2046}
2047
2048/* This is a local helper function that do the same as addaliasu, but for a
2049 * dev_t instead of an udev_t. */
2050static void
2051addalias(nvp, dev)
2052	struct vnode *nvp;
2053	dev_t dev;
2054{
2055
2056	KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
2057	nvp->v_rdev = dev;
2058	VI_LOCK(nvp);
2059	mtx_lock(&spechash_mtx);
2060	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
2061	dev->si_usecount += nvp->v_usecount;
2062	mtx_unlock(&spechash_mtx);
2063	VI_UNLOCK(nvp);
2064}
2065
2066/*
2067 * Grab a particular vnode from the free list, increment its
2068 * reference count and lock it. The vnode lock bit is set if the
2069 * vnode is being eliminated in vgone. The process is awakened
2070 * when the transition is completed, and an error returned to
2071 * indicate that the vnode is no longer usable (possibly having
2072 * been changed to a new filesystem type).
2073 */
2074int
2075vget(vp, flags, td)
2076	register struct vnode *vp;
2077	int flags;
2078	struct thread *td;
2079{
2080	int error;
2081
2082	/*
2083	 * If the vnode is in the process of being cleaned out for
2084	 * another use, we wait for the cleaning to finish and then
2085	 * return failure. Cleaning is determined by checking that
2086	 * the VI_XLOCK flag is set.
2087	 */
2088	if ((flags & LK_INTERLOCK) == 0)
2089		VI_LOCK(vp);
2090	if (vp->v_iflag & VI_XLOCK && vp->v_vxproc != curthread) {
2091		vp->v_iflag |= VI_XWANT;
2092		msleep(vp, VI_MTX(vp), PINOD | PDROP, "vget", 0);
2093		return (ENOENT);
2094	}
2095
2096	v_incr_usecount(vp, 1);
2097
2098	if (VSHOULDBUSY(vp))
2099		vbusy(vp);
2100	if (flags & LK_TYPE_MASK) {
2101		if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) {
2102			/*
2103			 * must expand vrele here because we do not want
2104			 * to call VOP_INACTIVE if the reference count
2105			 * drops back to zero since it was never really
2106			 * active. We must remove it from the free list
2107			 * before sleeping so that multiple processes do
2108			 * not try to recycle it.
2109			 */
2110			VI_LOCK(vp);
2111			v_incr_usecount(vp, -1);
2112			if (VSHOULDFREE(vp))
2113				vfree(vp);
2114			else
2115				vlruvp(vp);
2116			VI_UNLOCK(vp);
2117		}
2118		return (error);
2119	}
2120	VI_UNLOCK(vp);
2121	return (0);
2122}
2123
/*
 * Take one additional reference on a vnode.  The use count is bumped
 * while the vnode interlock is held.
 */
void
vref(struct vnode *vp)
{

	VI_LOCK(vp);
	v_incr_usecount(vp, 1);
	VI_UNLOCK(vp);
}
2134
2135/*
2136 * Return reference count of a vnode.
2137 *
2138 * The results of this call are only guaranteed when some mechanism other
2139 * than the VI lock is used to stop other processes from gaining references
2140 * to the vnode.  This may be the case if the caller holds the only reference.
2141 * This is also useful when stale data is acceptable as race conditions may
2142 * be accounted for by some other means.
2143 */
2144int
2145vrefcnt(struct vnode *vp)
2146{
2147	int usecnt;
2148
2149	VI_LOCK(vp);
2150	usecnt = vp->v_usecount;
2151	VI_UNLOCK(vp);
2152
2153	return (usecnt);
2154}
2155
2156
2157/*
2158 * Vnode put/release.
2159 * If count drops to zero, call inactive routine and return to freelist.
2160 */
2161void
2162vrele(vp)
2163	struct vnode *vp;
2164{
2165	struct thread *td = curthread;	/* XXX */
2166
2167	KASSERT(vp != NULL, ("vrele: null vp"));
2168
2169	VI_LOCK(vp);
2170
2171	/* Skip this v_writecount check if we're going to panic below. */
2172	KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
2173	    ("vrele: missed vn_close"));
2174
2175	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2176	    vp->v_usecount == 1)) {
2177		v_incr_usecount(vp, -1);
2178		VI_UNLOCK(vp);
2179
2180		return;
2181	}
2182
2183	if (vp->v_usecount == 1) {
2184		v_incr_usecount(vp, -1);
2185		/*
2186		 * We must call VOP_INACTIVE with the node locked. Mark
2187		 * as VI_DOINGINACT to avoid recursion.
2188		 */
2189		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) {
2190			VI_LOCK(vp);
2191			vp->v_iflag |= VI_DOINGINACT;
2192			VI_UNLOCK(vp);
2193			VOP_INACTIVE(vp, td);
2194			VI_LOCK(vp);
2195			KASSERT(vp->v_iflag & VI_DOINGINACT,
2196			    ("vrele: lost VI_DOINGINACT"));
2197			vp->v_iflag &= ~VI_DOINGINACT;
2198			VI_UNLOCK(vp);
2199		}
2200		VI_LOCK(vp);
2201		if (VSHOULDFREE(vp))
2202			vfree(vp);
2203		else
2204			vlruvp(vp);
2205		VI_UNLOCK(vp);
2206
2207	} else {
2208#ifdef DIAGNOSTIC
2209		vprint("vrele: negative ref count", vp);
2210#endif
2211		VI_UNLOCK(vp);
2212		panic("vrele: negative ref cnt");
2213	}
2214}
2215
/*
 * Release an already locked vnode.  This gives the same effect as
 * unlock+vrele(), but takes less time and avoids releasing and
 * re-acquiring the lock (as vrele() acquires the lock internally.)
 *
 * Requires Giant.  The caller holds the vnode lock but not the vnode
 * interlock.  On the fast path the vnode lock is dropped here with
 * VOP_UNLOCK(); on the last-reference path no explicit unlock is done
 * in this function (NOTE(review): presumably VOP_INACTIVE releases the
 * vnode lock -- confirm against the filesystem implementations).
 */
void
vput(vp)
	struct vnode *vp;
{
	struct thread *td = curthread;	/* XXX */

	GIANT_REQUIRED;

	KASSERT(vp != NULL, ("vput: null vp"));
	VI_LOCK(vp);
	/* Skip this v_writecount check if we're going to panic below. */
	KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
	    ("vput: missed vn_close"));

	/*
	 * Fast path: other references remain, or VOP_INACTIVE is already
	 * running.  Drop the reference and release the vnode lock; the
	 * interlock is handed to VOP_UNLOCK() via LK_INTERLOCK.
	 */
	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
	    vp->v_usecount == 1)) {
		v_incr_usecount(vp, -1);
		VOP_UNLOCK(vp, LK_INTERLOCK, td);
		return;
	}

	if (vp->v_usecount == 1) {
		v_incr_usecount(vp, -1);
		/*
		 * We must call VOP_INACTIVE with the node locked, so
		 * we just need to release the vnode mutex. Mark
		 * as VI_DOINGINACT to avoid recursion.
		 */
		vp->v_iflag |= VI_DOINGINACT;
		VI_UNLOCK(vp);
		VOP_INACTIVE(vp, td);
		VI_LOCK(vp);
		KASSERT(vp->v_iflag & VI_DOINGINACT,
		    ("vput: lost VI_DOINGINACT"));
		vp->v_iflag &= ~VI_DOINGINACT;
		/* Put the vnode on the free list if nothing else needs it. */
		if (VSHOULDFREE(vp))
			vfree(vp);
		else
			vlruvp(vp);
		VI_UNLOCK(vp);

	} else {
		/*
		 * v_usecount was already < 1.  Unlike vrele(), the interlock
		 * is still held when panic() is called here.
		 */
#ifdef DIAGNOSTIC
		vprint("vput: negative ref count", vp);
#endif
		panic("vput: negative ref cnt");
	}
}
2269
/*
 * Take a hold on a vnode on behalf of somebody who does not want it
 * recycled.  This is the unlocked entry point: the vnode interlock is
 * acquired around the call to vholdl() and dropped again afterwards.
 */
void
vhold(struct vnode *vp)
{

	VI_LOCK(vp);
	vholdl(vp);
	VI_UNLOCK(vp);
}
2280
2281void
2282vholdl(vp)
2283	register struct vnode *vp;
2284{
2285	int s;
2286
2287	s = splbio();
2288	vp->v_holdcnt++;
2289	if (VSHOULDBUSY(vp))
2290		vbusy(vp);
2291	splx(s);
2292}
2293
/*
 * Release a hold previously taken with vhold(): one less party cares
 * about this vnode.  This is the unlocked entry point; the vnode
 * interlock is acquired around the call to vdropl().
 */
void
vdrop(struct vnode *vp)
{

	VI_LOCK(vp);
	vdropl(vp);
	VI_UNLOCK(vp);
}
2305
2306void
2307vdropl(vp)
2308	register struct vnode *vp;
2309{
2310	int s;
2311
2312	s = splbio();
2313	if (vp->v_holdcnt <= 0)
2314		panic("vdrop: holdcnt");
2315	vp->v_holdcnt--;
2316	if (VSHOULDFREE(vp))
2317		vfree(vp);
2318	else
2319		vlruvp(vp);
2320	splx(s);
2321}
2322
/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If FORCECLOSE is not specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If FORCECLOSE is specified, detach any active vnodes
 * that are found.
 *
 * If WRITECLOSE is set, only flush out regular file vnodes open for
 * writing.
 *
 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
 *
 * `rootrefs' specifies the base reference count for the root vnode
 * of this filesystem. The root vnode is considered busy if its
 * v_usecount exceeds this value. On a successful return, vflush()
 * will call vrele() on the root vnode exactly rootrefs times.
 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
 * be zero.
 *
 * Returns 0 on success, EBUSY if busy vnodes remain, or the error
 * from VFS_ROOT() if the root vnode could not be obtained.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

int
vflush(mp, rootrefs, flags)
	struct mount *mp;
	int rootrefs;
	int flags;
{
	struct thread *td = curthread;	/* XXX */
	struct vnode *vp, *nvp, *rootvp = NULL;
	struct vattr vattr;
	int busy = 0, error;

	if (rootrefs > 0) {
		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
		    ("vflush: bad args"));
		/*
		 * Get the filesystem root vnode. We can vput() it
		 * immediately, since with rootrefs > 0, it won't go away.
		 */
		if ((error = VFS_ROOT(mp, &rootvp)) != 0)
			return (error);
		vput(rootvp);

	}
	/*
	 * Walk the mount's vnode list.  mntvnode_mtx protects the list and
	 * is dropped while each vnode is processed, so every path that
	 * continues the loop re-acquires it first.
	 */
	mtx_lock(&mntvnode_mtx);
loop:
	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp; vp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (vp->v_mount != mp)
			goto loop;
		nvp = TAILQ_NEXT(vp, v_nmntvnodes);

		/*
		 * Take the interlock before dropping the list mutex, then
		 * trade the interlock for the vnode lock (LK_INTERLOCK).
		 */
		VI_LOCK(vp);
		mtx_unlock(&mntvnode_mtx);
		vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
		/*
		 * Skip over vnodes marked VV_SYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
			VOP_UNLOCK(vp, 0, td);
			mtx_lock(&mntvnode_mtx);
			continue;
		}
		/*
		 * If WRITECLOSE is set, flush out unlinked but still open
		 * files (even if open only for reading) and regular file
		 * vnodes open for writing.
		 */
		if (flags & WRITECLOSE) {
			error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
			VI_LOCK(vp);

			/*
			 * Leave the vnode alone when it is still linked (or
			 * of type VNON) and is not a regular file open for
			 * writing.
			 */
			if ((vp->v_type == VNON ||
			    (error == 0 && vattr.va_nlink > 0)) &&
			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
				VOP_UNLOCK(vp, LK_INTERLOCK, td);
				mtx_lock(&mntvnode_mtx);
				continue;
			}
		} else
			VI_LOCK(vp);

		/*
		 * From here on only the interlock is held; the vnode lock
		 * is released.
		 */
		VOP_UNLOCK(vp, 0, td);

		/*
		 * With v_usecount == 0, all we need to do is clear out the
		 * vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			vgonel(vp, td);
			mtx_lock(&mntvnode_mtx);
			continue;
		}

		/*
		 * If FORCECLOSE is set, forcibly close the vnode. For block
		 * or character devices, revert to an anonymous device. For
		 * all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			if (vp->v_type != VCHR) {
				vgonel(vp, td);
			} else {
				/*
				 * vclean() returns with the interlock still
				 * held; drop it before detaching the vnode
				 * from the mount.
				 */
				vclean(vp, 0, td);
				VI_UNLOCK(vp);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *) 0);
			}
			mtx_lock(&mntvnode_mtx);
			continue;
		}
		/* The vnode is in use and we were not asked to force it. */
#ifdef DIAGNOSTIC
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		VI_UNLOCK(vp);
		mtx_lock(&mntvnode_mtx);
		busy++;
	}
	mtx_unlock(&mntvnode_mtx);
	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
		/*
		 * If just the root vnode is busy, and if its refcount
		 * is equal to `rootrefs', then go ahead and kill it.
		 */
		VI_LOCK(rootvp);
		KASSERT(busy > 0, ("vflush: not busy"));
		KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
		if (busy == 1 && rootvp->v_usecount == rootrefs) {
			/* vgonel() consumes the interlock. */
			vgonel(rootvp, td);
			busy = 0;
		} else
			VI_UNLOCK(rootvp);
	}
	if (busy)
		return (EBUSY);
	/* Success: drop the caller's `rootrefs' references on the root. */
	for (; rootrefs > 0; rootrefs--)
		vrele(rootvp);
	return (0);
}
2470
/*
 * Requeue a vnode that has (probably) just become recyclable at the tail
 * of its mount point's vnode list.
 *
 * XXX The body is compiled out for now: ffs_sync() and friends restart
 * their list walks in a way that makes this reordering cost O(N^2), so
 * it stays disabled until they are cleaned up.  The function is
 * currently a no-op.
 */
static void
vlruvp(struct vnode *vp)
{
#if 0
	struct mount *mnt;

	mnt = vp->v_mount;
	if (mnt == NULL)
		return;
	mtx_lock(&mntvnode_mtx);
	TAILQ_REMOVE(&mnt->mnt_nvnodelist, vp, v_nmntvnodes);
	TAILQ_INSERT_TAIL(&mnt->mnt_nvnodelist, vp, v_nmntvnodes);
	mtx_unlock(&mntvnode_mtx);
#endif
}
2491
/*
 * Disassociate the underlying filesystem from a vnode.
 *
 * Must be entered with the vnode interlock held, and returns with the
 * interlock held again.  `flags' may contain DOCLOSE, which requests
 * that buffers be flushed and, for an active vnode, that VOP_CLOSE be
 * called.  While the clean is in progress the vnode is marked VI_XLOCK;
 * waiters that set VI_XWANT are woken at the end.  On return the vnode
 * uses the dead vnode operations vector.
 */
static void
vclean(vp, flags, td)
	struct vnode *vp;
	int flags;
	struct thread *td;
{
	int active;

	ASSERT_VI_LOCKED(vp, "vclean");
	/*
	 * Check to see if the vnode is in use. If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	if ((active = vp->v_usecount))
		v_incr_usecount(vp, 1);

	/*
	 * Prevent the vnode from being recycled or brought into use while we
	 * clean it out.
	 */
	if (vp->v_iflag & VI_XLOCK)
		panic("vclean: deadlock");
	vp->v_iflag |= VI_XLOCK;
	vp->v_vxproc = curthread;
	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 *
	 * LK_INTERLOCK hands the interlock held on entry to VOP_LOCK.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);

	/*
	 * Clean out any buffers associated with the vnode.
	 * If the flush fails, just toss the buffers.
	 */
	if (flags & DOCLOSE) {
		struct buf *bp;
		/* Only wait for write suspension if dirty buffers exist. */
		VI_LOCK(vp);
		bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
		VI_UNLOCK(vp);
		if (bp != NULL)
			(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
		if (vinvalbuf(vp, V_SAVE, NOCRED, td, 0, 0) != 0)
			vinvalbuf(vp, 0, NOCRED, td, 0, 0);
	}

	VOP_DESTROYVOBJECT(vp);

	/*
	 * Any other processes trying to obtain this lock must first
	 * wait for VI_XLOCK to clear, then call the new lock operation.
	 */
	VOP_UNLOCK(vp, 0, td);

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
		VI_LOCK(vp);
		/* Skip VOP_INACTIVE if somebody else is already running it. */
		if ((vp->v_iflag & VI_DOINGINACT) == 0) {
			vp->v_iflag |= VI_DOINGINACT;
			VI_UNLOCK(vp);
			if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
				panic("vclean: cannot relock.");
			VOP_INACTIVE(vp, td);
			VI_LOCK(vp);
			KASSERT(vp->v_iflag & VI_DOINGINACT,
			    ("vclean: lost VI_DOINGINACT"));
			vp->v_iflag &= ~VI_DOINGINACT;
		}
		VI_UNLOCK(vp);
	}

	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, td))
		panic("vclean: cannot reclaim");

	if (active) {
		/*
		 * Inline copy of vrele() since VOP_INACTIVE
		 * has already been called.
		 *
		 * This drops the extra reference taken at the top of
		 * the function.
		 */
		VI_LOCK(vp);
		v_incr_usecount(vp, -1);
		if (vp->v_usecount <= 0) {
#ifdef DIAGNOSTIC
			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
				vprint("vclean: bad ref count", vp);
				panic("vclean: ref cnt");
			}
#endif
			vfree(vp);
		}
		VI_UNLOCK(vp);
	}

	cache_purge(vp);
	/* The interlock taken here is still held when we return. */
	VI_LOCK(vp);
	if (VSHOULDFREE(vp))
		vfree(vp);

	/*
	 * Done with purge, reset to the standard lock and
	 * notify sleepers of the grim news.
	 */
	vp->v_vnlock = &vp->v_lock;
	vp->v_op = dead_vnodeop_p;
	if (vp->v_pollinfo != NULL)
		vn_pollgone(vp);
	vp->v_tag = "none";
	vp->v_iflag &= ~VI_XLOCK;
	vp->v_vxproc = NULL;
	if (vp->v_iflag & VI_XWANT) {
		vp->v_iflag &= ~VI_XWANT;
		wakeup(vp);
	}
}
2621
2622/*
2623 * Eliminate all activity associated with the requested vnode
2624 * and with all vnodes aliased to the requested vnode.
2625 */
2626int
2627vop_revoke(ap)
2628	struct vop_revoke_args /* {
2629		struct vnode *a_vp;
2630		int a_flags;
2631	} */ *ap;
2632{
2633	struct vnode *vp, *vq;
2634	dev_t dev;
2635
2636	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
2637	vp = ap->a_vp;
2638	KASSERT((vp->v_type == VCHR), ("vop_revoke: not VCHR"));
2639
2640	VI_LOCK(vp);
2641	/*
2642	 * If a vgone (or vclean) is already in progress,
2643	 * wait until it is done and return.
2644	 */
2645	if (vp->v_iflag & VI_XLOCK) {
2646		vp->v_iflag |= VI_XWANT;
2647		msleep(vp, VI_MTX(vp), PINOD | PDROP,
2648		    "vop_revokeall", 0);
2649		return (0);
2650	}
2651	VI_UNLOCK(vp);
2652	dev = vp->v_rdev;
2653	for (;;) {
2654		mtx_lock(&spechash_mtx);
2655		vq = SLIST_FIRST(&dev->si_hlist);
2656		mtx_unlock(&spechash_mtx);
2657		if (!vq)
2658			break;
2659		vgone(vq);
2660	}
2661	return (0);
2662}
2663
2664/*
2665 * Recycle an unused vnode to the front of the free list.
2666 * Release the passed interlock if the vnode will be recycled.
2667 */
2668int
2669vrecycle(vp, inter_lkp, td)
2670	struct vnode *vp;
2671	struct mtx *inter_lkp;
2672	struct thread *td;
2673{
2674
2675	VI_LOCK(vp);
2676	if (vp->v_usecount == 0) {
2677		if (inter_lkp) {
2678			mtx_unlock(inter_lkp);
2679		}
2680		vgonel(vp, td);
2681		return (1);
2682	}
2683	VI_UNLOCK(vp);
2684	return (0);
2685}
2686
2687/*
2688 * Eliminate all activity associated with a vnode
2689 * in preparation for reuse.
2690 */
2691void
2692vgone(vp)
2693	register struct vnode *vp;
2694{
2695	struct thread *td = curthread;	/* XXX */
2696
2697	VI_LOCK(vp);
2698	vgonel(vp, td);
2699}
2700
/*
 * vgone, with the vp interlock held.
 *
 * The interlock is consumed: every return path leaves it released.
 * After cleaning the vnode with vclean(), it is removed from its mount
 * point's list and from the special device alias list, moved to the
 * head of the free list when unreferenced, and its type is set to VBAD.
 */
void
vgonel(vp, td)
	struct vnode *vp;
	struct thread *td;
{
	int s;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 *
	 * PDROP makes msleep() return without the interlock.
	 */
	ASSERT_VI_LOCKED(vp, "vgonel");
	if (vp->v_iflag & VI_XLOCK) {
		vp->v_iflag |= VI_XWANT;
		msleep(vp, VI_MTX(vp), PINOD | PDROP, "vgone", 0);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 * vclean() returns with the interlock held, so drop it here.
	 */
	vclean(vp, DOCLOSE, td);
	VI_UNLOCK(vp);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) {
		VI_LOCK(vp);
		mtx_lock(&spechash_mtx);
		SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
		/* The device no longer counts this vnode's references. */
		vp->v_rdev->si_usecount -= vp->v_usecount;
		mtx_unlock(&spechash_mtx);
		VI_UNLOCK(vp);
		vp->v_rdev = NULL;
	}

	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list. The test of the
	 * VI_DOOMED flag and the reference count of zero is because
	 * it will be removed from the free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 */
	VI_LOCK(vp);
	if (vp->v_usecount == 0 && !(vp->v_iflag & VI_DOOMED)) {
		s = splbio();
		mtx_lock(&vnode_free_list_mtx);
		if (vp->v_iflag & VI_FREE) {
			/* Already free: unlink so it can be re-inserted. */
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		} else {
			/* Newly free: account for it. */
			vp->v_iflag |= VI_FREE;
			freevnodes++;
		}
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		mtx_unlock(&vnode_free_list_mtx);
		splx(s);
	}

	vp->v_type = VBAD;
	VI_UNLOCK(vp);
}
2775
2776/*
2777 * Lookup a vnode by device number.
2778 */
2779int
2780vfinddev(dev, type, vpp)
2781	dev_t dev;
2782	enum vtype type;
2783	struct vnode **vpp;
2784{
2785	struct vnode *vp;
2786
2787	mtx_lock(&spechash_mtx);
2788	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
2789		if (type == vp->v_type) {
2790			*vpp = vp;
2791			mtx_unlock(&spechash_mtx);
2792			return (1);
2793		}
2794	}
2795	mtx_unlock(&spechash_mtx);
2796	return (0);
2797}
2798
2799/*
2800 * Calculate the total number of references to a special device.
2801 */
2802int
2803vcount(vp)
2804	struct vnode *vp;
2805{
2806	int count;
2807
2808	mtx_lock(&spechash_mtx);
2809	count = vp->v_rdev->si_usecount;
2810	mtx_unlock(&spechash_mtx);
2811	return (count);
2812}
2813
2814/*
2815 * Same as above, but using the dev_t as argument
2816 */
2817int
2818count_dev(dev)
2819	dev_t dev;
2820{
2821	struct vnode *vp;
2822
2823	vp = SLIST_FIRST(&dev->si_hlist);
2824	if (vp == NULL)
2825		return (0);
2826	return(vcount(vp));
2827}
2828
2829/*
2830 * Print out a description of a vnode.
2831 */
2832static char *typename[] =
2833{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
2834
2835void
2836vprint(label, vp)
2837	char *label;
2838	struct vnode *vp;
2839{
2840	char buf[96];
2841
2842	if (label != NULL)
2843		printf("%s: %p: ", label, (void *)vp);
2844	else
2845		printf("%p: ", (void *)vp);
2846	printf("tag %s, type %s, usecount %d, writecount %d, refcount %d,",
2847	    vp->v_tag, typename[vp->v_type], vp->v_usecount,
2848	    vp->v_writecount, vp->v_holdcnt);
2849	buf[0] = '\0';
2850	if (vp->v_vflag & VV_ROOT)
2851		strcat(buf, "|VV_ROOT");
2852	if (vp->v_vflag & VV_TEXT)
2853		strcat(buf, "|VV_TEXT");
2854	if (vp->v_vflag & VV_SYSTEM)
2855		strcat(buf, "|VV_SYSTEM");
2856	if (vp->v_iflag & VI_XLOCK)
2857		strcat(buf, "|VI_XLOCK");
2858	if (vp->v_iflag & VI_XWANT)
2859		strcat(buf, "|VI_XWANT");
2860	if (vp->v_iflag & VI_BWAIT)
2861		strcat(buf, "|VI_BWAIT");
2862	if (vp->v_iflag & VI_DOOMED)
2863		strcat(buf, "|VI_DOOMED");
2864	if (vp->v_iflag & VI_FREE)
2865		strcat(buf, "|VI_FREE");
2866	if (vp->v_vflag & VV_OBJBUF)
2867		strcat(buf, "|VV_OBJBUF");
2868	if (buf[0] != '\0')
2869		printf(" flags (%s),", &buf[1]);
2870	lockmgr_printinfo(vp->v_vnlock);
2871	printf("\n");
2872	if (vp->v_data != NULL)
2873		VOP_PRINT(vp);
2874}
2875
#ifdef DDB
#include <ddb/ddb.h>
/*
 * DDB "show lockedvnods" command: print every vnode in the system whose
 * vnode lock is currently held.  Only meant to be run from the kernel
 * debugger.
 */
DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
{
	struct mount *mp;
	struct vnode *vp;

	/*
	 * Note: because this is DDB, we can't obey the locking semantics
	 * for these structures, which means we could catch an inconsistent
	 * state and dereference a nasty pointer.  Not much to be done
	 * about that.
	 */
	printf("Locked vnodes\n");
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
			if (VOP_ISLOCKED(vp, NULL))
				vprint(NULL, vp);
		}
	}
}
#endif
2904
2905/*
2906 * Fill in a struct xvfsconf based on a struct vfsconf.
2907 */
2908static void
2909vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
2910{
2911
2912	strcpy(xvfsp->vfc_name, vfsp->vfc_name);
2913	xvfsp->vfc_typenum = vfsp->vfc_typenum;
2914	xvfsp->vfc_refcount = vfsp->vfc_refcount;
2915	xvfsp->vfc_flags = vfsp->vfc_flags;
2916	/*
2917	 * These are unused in userland, we keep them
2918	 * to not break binary compatibility.
2919	 */
2920	xvfsp->vfc_vfsops = NULL;
2921	xvfsp->vfc_next = NULL;
2922}
2923
2924static int
2925sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
2926{
2927	struct vfsconf *vfsp;
2928	struct xvfsconf *xvfsp;
2929	int cnt, error, i;
2930
2931	cnt = 0;
2932	for (vfsp = vfsconf; vfsp != NULL; vfsp = vfsp->vfc_next)
2933		cnt++;
2934	xvfsp = malloc(sizeof(struct xvfsconf) * cnt, M_TEMP, M_WAITOK);
2935	/*
2936	 * Handle the race that we will have here when struct vfsconf
2937	 * will be locked down by using both cnt and checking vfc_next
2938	 * against NULL to determine the end of the loop.  The race will
2939	 * happen because we will have to unlock before calling malloc().
2940	 * We are protected by Giant for now.
2941	 */
2942	i = 0;
2943	for (vfsp = vfsconf; vfsp != NULL && i < cnt; vfsp = vfsp->vfc_next) {
2944		vfsconf2x(vfsp, xvfsp + i);
2945		i++;
2946	}
2947	error = SYSCTL_OUT(req, xvfsp, sizeof(struct xvfsconf) * i);
2948	free(xvfsp, M_TEMP);
2949	return (error);
2950}
2951
2952SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist,
2953    "S,xvfsconf", "List of all configured filesystems");
2954
2955/*
2956 * Top level filesystem related information gathering.
2957 */
2958static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
2959
2960static int
2961vfs_sysctl(SYSCTL_HANDLER_ARGS)
2962{
2963	int *name = (int *)arg1 - 1;	/* XXX */
2964	u_int namelen = arg2 + 1;	/* XXX */
2965	struct vfsconf *vfsp;
2966	struct xvfsconf xvfsp;
2967
2968	printf("WARNING: userland calling deprecated sysctl, "
2969	    "please rebuild world\n");
2970
2971#if 1 || defined(COMPAT_PRELITE2)
2972	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2973	if (namelen == 1)
2974		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2975#endif
2976
2977	switch (name[1]) {
2978	case VFS_MAXTYPENUM:
2979		if (namelen != 2)
2980			return (ENOTDIR);
2981		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2982	case VFS_CONF:
2983		if (namelen != 3)
2984			return (ENOTDIR);	/* overloaded */
2985		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2986			if (vfsp->vfc_typenum == name[2])
2987				break;
2988		if (vfsp == NULL)
2989			return (EOPNOTSUPP);
2990		vfsconf2x(vfsp, &xvfsp);
2991		return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
2992	}
2993	return (EOPNOTSUPP);
2994}
2995
2996SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, vfs_sysctl,
2997	"Generic filesystem");
2998
2999#if 1 || defined(COMPAT_PRELITE2)
3000
3001static int
3002sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
3003{
3004	int error;
3005	struct vfsconf *vfsp;
3006	struct ovfsconf ovfs;
3007
3008	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
3009		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
3010		strcpy(ovfs.vfc_name, vfsp->vfc_name);
3011		ovfs.vfc_index = vfsp->vfc_typenum;
3012		ovfs.vfc_refcount = vfsp->vfc_refcount;
3013		ovfs.vfc_flags = vfsp->vfc_flags;
3014		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
3015		if (error)
3016			return error;
3017	}
3018	return 0;
3019}
3020
3021#endif /* 1 || COMPAT_PRELITE2 */
3022
#define KINFO_VNODESLOP		10
#ifdef notyet
/*
 * Dump vnode list (via sysctl).
 *
 * Exports one struct xvnode per vnode on every mount point that can be
 * busied without waiting.  When called without an output buffer, only a
 * size estimate is returned.  The buffer is sized for the current number
 * of vnodes plus KINFO_VNODESLOP spare records; vnodes beyond that
 * capacity are silently omitted.
 *
 * NOTE(review): vref()/vrele() are called with mntvnode_mtx held, and
 * vrele() may need to lock the vnode -- confirm this is safe before
 * enabling the code.
 */
/* ARGSUSED */
static int
sysctl_vnode(SYSCTL_HANDLER_ARGS)
{
	struct xvnode *xvn;
	struct thread *td = req->td;
	struct mount *mp;
	struct vnode *vp;
	int error, len, maxn, n;

	/*
	 * Stale numvnodes access is not fatal here.
	 *
	 * `maxn' is the capacity of the buffer in records and `len' its
	 * size in bytes.  The fill loop must be bounded by the former:
	 * comparing the record index against the byte count would let
	 * the loop run far past the end of the allocation.
	 */
	req->lock = 0;
	maxn = numvnodes + KINFO_VNODESLOP;
	len = maxn * sizeof *xvn;
	if (!req->oldptr)
		/* Make an estimate */
		return (SYSCTL_OUT(req, 0, len));

	sysctl_wire_old_buffer(req, 0);
	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
	n = 0;
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
			continue;
		mtx_lock(&mntvnode_mtx);
		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
			if (n == maxn)
				break;
			vref(vp);
			xvn[n].xv_size = sizeof *xvn;
			xvn[n].xv_vnode = vp;
#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
			XV_COPY(usecount);
			XV_COPY(writecount);
			XV_COPY(holdcnt);
			XV_COPY(id);
			XV_COPY(mount);
			XV_COPY(numoutput);
			XV_COPY(type);
#undef XV_COPY
			xvn[n].xv_flag = vp->v_vflag;

			/*
			 * Type-specific fields.  A vnode that is skipped
			 * leaves slot `n' to be overwritten by the next one.
			 */
			switch (vp->v_type) {
			case VREG:
			case VDIR:
			case VLNK:
				xvn[n].xv_dev = vp->v_cachedfs;
				xvn[n].xv_ino = vp->v_cachedid;
				break;
			case VBLK:
			case VCHR:
				if (vp->v_rdev == NULL) {
					vrele(vp);
					continue;
				}
				xvn[n].xv_dev = dev2udev(vp->v_rdev);
				break;
			case VSOCK:
				xvn[n].xv_socket = vp->v_socket;
				break;
			case VFIFO:
				xvn[n].xv_fifo = vp->v_fifoinfo;
				break;
			case VNON:
			case VBAD:
			default:
				/* shouldn't happen? */
				vrele(vp);
				continue;
			}
			vrele(vp);
			++n;
		}
		mtx_unlock(&mntvnode_mtx);
		mtx_lock(&mountlist_mtx);
		vfs_unbusy(mp, td);
		if (n == maxn)
			break;
	}
	mtx_unlock(&mountlist_mtx);

	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
	free(xvn, M_TEMP);
	return (error);
}

SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
	0, 0, sysctl_vnode, "S,xvnode", "");
#endif
3119
3120/*
3121 * Check to see if a filesystem is mounted on a block device.
3122 */
3123int
3124vfs_mountedon(vp)
3125	struct vnode *vp;
3126{
3127
3128	if (vp->v_rdev->si_mountpoint != NULL)
3129		return (EBUSY);
3130	return (0);
3131}
3132
3133/*
3134 * Unmount all filesystems. The list is traversed in reverse order
3135 * of mounting to avoid dependencies.
3136 */
3137void
3138vfs_unmountall()
3139{
3140	struct mount *mp;
3141	struct thread *td;
3142	int error;
3143
3144	if (curthread != NULL)
3145		td = curthread;
3146	else
3147		td = FIRST_THREAD_IN_PROC(initproc); /* XXX XXX proc0? */
3148	/*
3149	 * Since this only runs when rebooting, it is not interlocked.
3150	 */
3151	while(!TAILQ_EMPTY(&mountlist)) {
3152		mp = TAILQ_LAST(&mountlist, mntlist);
3153		error = dounmount(mp, MNT_FORCE, td);
3154		if (error) {
3155			TAILQ_REMOVE(&mountlist, mp, mnt_list);
3156			printf("unmount of %s failed (",
3157			    mp->mnt_stat.f_mntonname);
3158			if (error == EBUSY)
3159				printf("BUSY)\n");
3160			else
3161				printf("%d)\n", error);
3162		} else {
3163			/* The unmount has removed mp from the mountlist */
3164		}
3165	}
3166}
3167
/*
 * Perform msync on all vnodes under a mount point.
 * The mount point must be locked.
 *
 * Requires Giant.  For every vnode on the mount's list that is marked
 * VI_OBJDIRTY, the pages of its VM object are cleaned: synchronously
 * when `flags' is MNT_WAIT, asynchronously otherwise.  Unless MNT_WAIT
 * is given, vnodes whose lock is currently held are skipped.
 *
 * The list mutex is dropped while a vnode is processed; if the list
 * changed in the meantime the walk restarts from the head, at most
 * `tries' times in total.
 */
void
vfs_msync(struct mount *mp, int flags)
{
	struct vnode *vp, *nvp;
	struct vm_object *obj;
	int tries;

	GIANT_REQUIRED;

	tries = 5;
	mtx_lock(&mntvnode_mtx);
loop:
	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) {
		/* The vnode moved to another mount: restart or give up. */
		if (vp->v_mount != mp) {
			if (--tries > 0)
				goto loop;
			break;
		}
		nvp = TAILQ_NEXT(vp, v_nmntvnodes);

		VI_LOCK(vp);
		/* Skip vnodes that are being cleaned out. */
		if (vp->v_iflag & VI_XLOCK) {	/* XXX: what if MNT_WAIT? */
			VI_UNLOCK(vp);
			continue;
		}

		if ((vp->v_iflag & VI_OBJDIRTY) &&
		    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
			mtx_unlock(&mntvnode_mtx);
			/*
			 * vget() is handed the interlock (LK_INTERLOCK) and
			 * returns 0 with the vnode referenced and locked.
			 */
			if (!vget(vp,
			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
			    curthread)) {
				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
					vput(vp);
					mtx_lock(&mntvnode_mtx);
					continue;
				}

				if (VOP_GETVOBJECT(vp, &obj) == 0) {
					VM_OBJECT_LOCK(obj);
					vm_object_page_clean(obj, 0, 0,
					    flags == MNT_WAIT ?
					    OBJPC_SYNC : OBJPC_NOSYNC);
					VM_OBJECT_UNLOCK(obj);
				}
				vput(vp);
			}
			mtx_lock(&mntvnode_mtx);
			/*
			 * If our successor changed while the list mutex was
			 * dropped, nvp can no longer be trusted.
			 */
			if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
				if (--tries > 0)
					goto loop;
				break;
			}
		} else
			VI_UNLOCK(vp);
	}
	mtx_unlock(&mntvnode_mtx);
}
3230
3231/*
3232 * Create the VM object needed for VMIO and mmap support.  This
3233 * is done for all VREG files in the system.  Some filesystems might
3234 * afford the additional metadata buffering capability of the
3235 * VMIO code by making the device node be VMIO mode also.
3236 *
3237 * vp must be locked when vfs_object_create is called.
3238 */
3239int
3240vfs_object_create(vp, td, cred)
3241	struct vnode *vp;
3242	struct thread *td;
3243	struct ucred *cred;
3244{
3245	GIANT_REQUIRED;
3246	return (VOP_CREATEVOBJECT(vp, cred, td));
3247}
3248
3249/*
3250 * Mark a vnode as free, putting it up for recycling.
3251 */
3252void
3253vfree(vp)
3254	struct vnode *vp;
3255{
3256	int s;
3257
3258	ASSERT_VI_LOCKED(vp, "vfree");
3259	s = splbio();
3260	mtx_lock(&vnode_free_list_mtx);
3261	KASSERT((vp->v_iflag & VI_FREE) == 0, ("vnode already free"));
3262	if (vp->v_iflag & VI_AGE) {
3263		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
3264	} else {
3265		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
3266	}
3267	freevnodes++;
3268	mtx_unlock(&vnode_free_list_mtx);
3269	vp->v_iflag &= ~VI_AGE;
3270	vp->v_iflag |= VI_FREE;
3271	splx(s);
3272}
3273
3274/*
3275 * Opposite of vfree() - mark a vnode as in use.
3276 */
3277void
3278vbusy(vp)
3279	struct vnode *vp;
3280{
3281	int s;
3282
3283	s = splbio();
3284	ASSERT_VI_LOCKED(vp, "vbusy");
3285	KASSERT((vp->v_iflag & VI_FREE) != 0, ("vnode not free"));
3286
3287	mtx_lock(&vnode_free_list_mtx);
3288	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
3289	freevnodes--;
3290	mtx_unlock(&vnode_free_list_mtx);
3291
3292	vp->v_iflag &= ~(VI_FREE|VI_AGE);
3293	splx(s);
3294}
3295
3296/*
3297 * Record a process's interest in events which might happen to
3298 * a vnode.  Because poll uses the historic select-style interface
3299 * internally, this routine serves as both the ``check for any
3300 * pending events'' and the ``record my interest in future events''
3301 * functions.  (These are done together, while the lock is held,
3302 * to avoid race conditions.)
3303 */
3304int
3305vn_pollrecord(vp, td, events)
3306	struct vnode *vp;
3307	struct thread *td;
3308	short events;
3309{
3310
3311	if (vp->v_pollinfo == NULL)
3312		v_addpollinfo(vp);
3313	mtx_lock(&vp->v_pollinfo->vpi_lock);
3314	if (vp->v_pollinfo->vpi_revents & events) {
3315		/*
3316		 * This leaves events we are not interested
3317		 * in available for the other process which
3318		 * which presumably had requested them
3319		 * (otherwise they would never have been
3320		 * recorded).
3321		 */
3322		events &= vp->v_pollinfo->vpi_revents;
3323		vp->v_pollinfo->vpi_revents &= ~events;
3324
3325		mtx_unlock(&vp->v_pollinfo->vpi_lock);
3326		return events;
3327	}
3328	vp->v_pollinfo->vpi_events |= events;
3329	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3330	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3331	return 0;
3332}
3333
3334/*
3335 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
3336 * it is possible for us to miss an event due to race conditions, but
3337 * that condition is expected to be rare, so for the moment it is the
3338 * preferred interface.
3339 */
3340void
3341vn_pollevent(vp, events)
3342	struct vnode *vp;
3343	short events;
3344{
3345
3346	if (vp->v_pollinfo == NULL)
3347		v_addpollinfo(vp);
3348	mtx_lock(&vp->v_pollinfo->vpi_lock);
3349	if (vp->v_pollinfo->vpi_events & events) {
3350		/*
3351		 * We clear vpi_events so that we don't
3352		 * call selwakeup() twice if two events are
3353		 * posted before the polling process(es) is
3354		 * awakened.  This also ensures that we take at
3355		 * most one selwakeup() if the polling process
3356		 * is no longer interested.  However, it does
3357		 * mean that only one event can be noticed at
3358		 * a time.  (Perhaps we should only clear those
3359		 * event bits which we note?) XXX
3360		 */
3361		vp->v_pollinfo->vpi_events = 0;	/* &= ~events ??? */
3362		vp->v_pollinfo->vpi_revents |= events;
3363		selwakeup(&vp->v_pollinfo->vpi_selinfo);
3364	}
3365	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3366}
3367
3368/*
3369 * Wake up anyone polling on vp because it is being revoked.
3370 * This depends on dead_poll() returning POLLHUP for correct
3371 * behavior.
3372 */
3373void
3374vn_pollgone(vp)
3375	struct vnode *vp;
3376{
3377
3378	mtx_lock(&vp->v_pollinfo->vpi_lock);
3379	VN_KNOTE(vp, NOTE_REVOKE);
3380	if (vp->v_pollinfo->vpi_events) {
3381		vp->v_pollinfo->vpi_events = 0;
3382		selwakeup(&vp->v_pollinfo->vpi_selinfo);
3383	}
3384	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3385}
3386
3387
3388
/*
 * Routine to create and manage a filesystem syncer vnode.
 *
 * The syncer vnode has its own small operations vector: close is a
 * no-op, fsync/inactive/reclaim are implemented in this file, locking
 * uses the standard vop_std* routines and every other operation fails
 * through vop_eopnotsupp.
 */
#define sync_close ((int (*)(struct  vop_close_args *))nullop)
static int	sync_fsync(struct  vop_fsync_args *);
static int	sync_inactive(struct  vop_inactive_args *);
static int	sync_reclaim(struct  vop_reclaim_args *);

/* Operations vector for syncer vnodes; registered by VNODEOP_SET below. */
static vop_t **sync_vnodeop_p;
static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
	{ &vop_lock_desc,	(vop_t *) vop_stdlock },	/* lock */
	{ &vop_unlock_desc,	(vop_t *) vop_stdunlock },	/* unlock */
	{ &vop_islocked_desc,	(vop_t *) vop_stdislocked },	/* islocked */
	{ NULL, NULL }
};
static struct vnodeopv_desc sync_vnodeop_opv_desc =
	{ &sync_vnodeop_p, sync_vnodeop_entries };

VNODEOP_SET(sync_vnodeop_opv_desc);
3413
3414/*
3415 * Create a new filesystem syncer vnode for the specified mount point.
3416 */
3417int
3418vfs_allocate_syncvnode(mp)
3419	struct mount *mp;
3420{
3421	struct vnode *vp;
3422	static long start, incr, next;
3423	int error;
3424
3425	/* Allocate a new vnode */
3426	if ((error = getnewvnode("syncer", mp, sync_vnodeop_p, &vp)) != 0) {
3427		mp->mnt_syncer = NULL;
3428		return (error);
3429	}
3430	vp->v_type = VNON;
3431	/*
3432	 * Place the vnode onto the syncer worklist. We attempt to
3433	 * scatter them about on the list so that they will go off
3434	 * at evenly distributed times even if all the filesystems
3435	 * are mounted at once.
3436	 */
3437	next += incr;
3438	if (next == 0 || next > syncer_maxdelay) {
3439		start /= 2;
3440		incr /= 2;
3441		if (start == 0) {
3442			start = syncer_maxdelay / 2;
3443			incr = syncer_maxdelay;
3444		}
3445		next = start;
3446	}
3447	VI_LOCK(vp);
3448	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
3449	VI_UNLOCK(vp);
3450	mp->mnt_syncer = vp;
3451	return (0);
3452}
3453
3454/*
3455 * Do a lazy sync of the filesystem.
3456 */
3457static int
3458sync_fsync(ap)
3459	struct vop_fsync_args /* {
3460		struct vnode *a_vp;
3461		struct ucred *a_cred;
3462		int a_waitfor;
3463		struct thread *a_td;
3464	} */ *ap;
3465{
3466	struct vnode *syncvp = ap->a_vp;
3467	struct mount *mp = syncvp->v_mount;
3468	struct thread *td = ap->a_td;
3469	int error, asyncflag;
3470
3471	/*
3472	 * We only need to do something if this is a lazy evaluation.
3473	 */
3474	if (ap->a_waitfor != MNT_LAZY)
3475		return (0);
3476
3477	/*
3478	 * Move ourselves to the back of the sync list.
3479	 */
3480	VI_LOCK(syncvp);
3481	vn_syncer_add_to_worklist(syncvp, syncdelay);
3482	VI_UNLOCK(syncvp);
3483
3484	/*
3485	 * Walk the list of vnodes pushing all that are dirty and
3486	 * not already on the sync list.
3487	 */
3488	mtx_lock(&mountlist_mtx);
3489	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
3490		mtx_unlock(&mountlist_mtx);
3491		return (0);
3492	}
3493	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3494		vfs_unbusy(mp, td);
3495		return (0);
3496	}
3497	asyncflag = mp->mnt_flag & MNT_ASYNC;
3498	mp->mnt_flag &= ~MNT_ASYNC;
3499	vfs_msync(mp, MNT_NOWAIT);
3500	error = VFS_SYNC(mp, MNT_LAZY, ap->a_cred, td);
3501	if (asyncflag)
3502		mp->mnt_flag |= MNT_ASYNC;
3503	vn_finished_write(mp);
3504	vfs_unbusy(mp, td);
3505	return (error);
3506}
3507
3508/*
3509 * The syncer vnode is no referenced.
3510 */
3511static int
3512sync_inactive(ap)
3513	struct vop_inactive_args /* {
3514		struct vnode *a_vp;
3515		struct thread *a_td;
3516	} */ *ap;
3517{
3518
3519	VOP_UNLOCK(ap->a_vp, 0, ap->a_td);
3520	vgone(ap->a_vp);
3521	return (0);
3522}
3523
3524/*
3525 * The syncer vnode is no longer needed and is being decommissioned.
3526 *
3527 * Modifications to the worklist must be protected at splbio().
3528 */
3529static int
3530sync_reclaim(ap)
3531	struct vop_reclaim_args /* {
3532		struct vnode *a_vp;
3533	} */ *ap;
3534{
3535	struct vnode *vp = ap->a_vp;
3536	int s;
3537
3538	s = splbio();
3539	vp->v_mount->mnt_syncer = NULL;
3540	VI_LOCK(vp);
3541	if (vp->v_iflag & VI_ONWORKLST) {
3542		mtx_lock(&sync_mtx);
3543		LIST_REMOVE(vp, v_synclist);
3544		mtx_unlock(&sync_mtx);
3545		vp->v_iflag &= ~VI_ONWORKLST;
3546	}
3547	VI_UNLOCK(vp);
3548	splx(s);
3549
3550	return (0);
3551}
3552
3553/*
3554 * extract the dev_t from a VCHR
3555 */
3556dev_t
3557vn_todev(vp)
3558	struct vnode *vp;
3559{
3560	if (vp->v_type != VCHR)
3561		return (NODEV);
3562	return (vp->v_rdev);
3563}
3564
3565/*
3566 * Check if vnode represents a disk device
3567 */
3568int
3569vn_isdisk(vp, errp)
3570	struct vnode *vp;
3571	int *errp;
3572{
3573	struct cdevsw *cdevsw;
3574
3575	if (vp->v_type != VCHR) {
3576		if (errp != NULL)
3577			*errp = ENOTBLK;
3578		return (0);
3579	}
3580	if (vp->v_rdev == NULL) {
3581		if (errp != NULL)
3582			*errp = ENXIO;
3583		return (0);
3584	}
3585	cdevsw = devsw(vp->v_rdev);
3586	if (cdevsw == NULL) {
3587		if (errp != NULL)
3588			*errp = ENXIO;
3589		return (0);
3590	}
3591	if (!(cdevsw->d_flags & D_DISK)) {
3592		if (errp != NULL)
3593			*errp = ENOTBLK;
3594		return (0);
3595	}
3596	if (errp != NULL)
3597		*errp = 0;
3598	return (1);
3599}
3600
3601/*
3602 * Free data allocated by namei(); see namei(9) for details.
3603 */
3604void
3605NDFREE(ndp, flags)
3606     struct nameidata *ndp;
3607     const uint flags;
3608{
3609	if (!(flags & NDF_NO_FREE_PNBUF) &&
3610	    (ndp->ni_cnd.cn_flags & HASBUF)) {
3611		uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
3612		ndp->ni_cnd.cn_flags &= ~HASBUF;
3613	}
3614	if (!(flags & NDF_NO_DVP_UNLOCK) &&
3615	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
3616	    ndp->ni_dvp != ndp->ni_vp)
3617		VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread);
3618	if (!(flags & NDF_NO_DVP_RELE) &&
3619	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
3620		vrele(ndp->ni_dvp);
3621		ndp->ni_dvp = NULL;
3622	}
3623	if (!(flags & NDF_NO_VP_UNLOCK) &&
3624	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
3625		VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread);
3626	if (!(flags & NDF_NO_VP_RELE) &&
3627	    ndp->ni_vp) {
3628		vrele(ndp->ni_vp);
3629		ndp->ni_vp = NULL;
3630	}
3631	if (!(flags & NDF_NO_STARTDIR_RELE) &&
3632	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
3633		vrele(ndp->ni_startdir);
3634		ndp->ni_startdir = NULL;
3635	}
3636}
3637
3638/*
3639 * Common filesystem object access control check routine.  Accepts a
3640 * vnode's type, "mode", uid and gid, requested access mode, credentials,
3641 * and optional call-by-reference privused argument allowing vaccess()
3642 * to indicate to the caller whether privilege was used to satisfy the
3643 * request (obsoleted).  Returns 0 on success, or an errno on failure.
3644 */
3645int
3646vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
3647	enum vtype type;
3648	mode_t file_mode;
3649	uid_t file_uid;
3650	gid_t file_gid;
3651	mode_t acc_mode;
3652	struct ucred *cred;
3653	int *privused;
3654{
3655	mode_t dac_granted;
3656#ifdef CAPABILITIES
3657	mode_t cap_granted;
3658#endif
3659
3660	/*
3661	 * Look for a normal, non-privileged way to access the file/directory
3662	 * as requested.  If it exists, go with that.
3663	 */
3664
3665	if (privused != NULL)
3666		*privused = 0;
3667
3668	dac_granted = 0;
3669
3670	/* Check the owner. */
3671	if (cred->cr_uid == file_uid) {
3672		dac_granted |= VADMIN;
3673		if (file_mode & S_IXUSR)
3674			dac_granted |= VEXEC;
3675		if (file_mode & S_IRUSR)
3676			dac_granted |= VREAD;
3677		if (file_mode & S_IWUSR)
3678			dac_granted |= (VWRITE | VAPPEND);
3679
3680		if ((acc_mode & dac_granted) == acc_mode)
3681			return (0);
3682
3683		goto privcheck;
3684	}
3685
3686	/* Otherwise, check the groups (first match) */
3687	if (groupmember(file_gid, cred)) {
3688		if (file_mode & S_IXGRP)
3689			dac_granted |= VEXEC;
3690		if (file_mode & S_IRGRP)
3691			dac_granted |= VREAD;
3692		if (file_mode & S_IWGRP)
3693			dac_granted |= (VWRITE | VAPPEND);
3694
3695		if ((acc_mode & dac_granted) == acc_mode)
3696			return (0);
3697
3698		goto privcheck;
3699	}
3700
3701	/* Otherwise, check everyone else. */
3702	if (file_mode & S_IXOTH)
3703		dac_granted |= VEXEC;
3704	if (file_mode & S_IROTH)
3705		dac_granted |= VREAD;
3706	if (file_mode & S_IWOTH)
3707		dac_granted |= (VWRITE | VAPPEND);
3708	if ((acc_mode & dac_granted) == acc_mode)
3709		return (0);
3710
3711privcheck:
3712	if (!suser_cred(cred, PRISON_ROOT)) {
3713		/* XXX audit: privilege used */
3714		if (privused != NULL)
3715			*privused = 1;
3716		return (0);
3717	}
3718
3719#ifdef CAPABILITIES
3720	/*
3721	 * Build a capability mask to determine if the set of capabilities
3722	 * satisfies the requirements when combined with the granted mask
3723	 * from above.
3724	 * For each capability, if the capability is required, bitwise
3725	 * or the request type onto the cap_granted mask.
3726	 */
3727	cap_granted = 0;
3728
3729	if (type == VDIR) {
3730		/*
3731		 * For directories, use CAP_DAC_READ_SEARCH to satisfy
3732		 * VEXEC requests, instead of CAP_DAC_EXECUTE.
3733		 */
3734		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3735		    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3736			cap_granted |= VEXEC;
3737	} else {
3738		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3739		    !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT))
3740			cap_granted |= VEXEC;
3741	}
3742
3743	if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
3744	    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3745		cap_granted |= VREAD;
3746
3747	if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3748	    !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT))
3749		cap_granted |= (VWRITE | VAPPEND);
3750
3751	if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3752	    !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT))
3753		cap_granted |= VADMIN;
3754
3755	if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
3756		/* XXX audit: privilege used */
3757		if (privused != NULL)
3758			*privused = 1;
3759		return (0);
3760	}
3761#endif
3762
3763	return ((acc_mode & VADMIN) ? EPERM : EACCES);
3764}
3765
3766/*
3767 * Credential check based on process requesting service, and per-attribute
3768 * permissions.
3769 */
3770int
3771extattr_check_cred(struct vnode *vp, int attrnamespace,
3772    struct ucred *cred, struct thread *td, int access)
3773{
3774
3775	/*
3776	 * Kernel-invoked always succeeds.
3777	 */
3778	if (cred == NOCRED)
3779		return (0);
3780
3781	/*
3782	 * Do not allow privileged processes in jail to directly
3783	 * manipulate system attributes.
3784	 *
3785	 * XXX What capability should apply here?
3786	 * Probably CAP_SYS_SETFFLAG.
3787	 */
3788	switch (attrnamespace) {
3789	case EXTATTR_NAMESPACE_SYSTEM:
3790		/* Potentially should be: return (EPERM); */
3791		return (suser_cred(cred, 0));
3792	case EXTATTR_NAMESPACE_USER:
3793		return (VOP_ACCESS(vp, access, cred, td));
3794	default:
3795		return (EPERM);
3796	}
3797}
3798