vfs_subr.c revision 103986
1/*
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
39 * $FreeBSD: head/sys/kern/vfs_subr.c 103986 2002-09-26 04:48:44Z jeff $
40 */
41
42/*
43 * External virtual filesystem routines
44 */
45#include "opt_ddb.h"
46#include "opt_mac.h"
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/bio.h>
51#include <sys/buf.h>
52#include <sys/conf.h>
53#include <sys/eventhandler.h>
54#include <sys/extattr.h>
55#include <sys/fcntl.h>
56#include <sys/kernel.h>
57#include <sys/kthread.h>
58#include <sys/mac.h>
59#include <sys/malloc.h>
60#include <sys/mount.h>
61#include <sys/namei.h>
62#include <sys/stat.h>
63#include <sys/sysctl.h>
64#include <sys/syslog.h>
65#include <sys/vmmeter.h>
66#include <sys/vnode.h>
67
68#include <vm/vm.h>
69#include <vm/vm_object.h>
70#include <vm/vm_extern.h>
71#include <vm/pmap.h>
72#include <vm/vm_map.h>
73#include <vm/vm_page.h>
74#include <vm/uma.h>
75
76static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
77
78static void	addalias(struct vnode *vp, dev_t nvp_rdev);
79static void	insmntque(struct vnode *vp, struct mount *mp);
80static void	vclean(struct vnode *vp, int flags, struct thread *td);
81static void	vlruvp(struct vnode *vp);
82static int	flushbuflist(struct buf *blist, int flags, struct vnode *vp,
83		    int slpflag, int slptimeo, int *errorp);
84static int	vcanrecycle(struct vnode *vp);
85
86
87/*
88 * Number of vnodes in existence.  Increased whenever getnewvnode()
89 * allocates a new vnode, never decreased.
90 */
91static unsigned long	numvnodes;
92
93SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
94
95/*
96 * Conversion tables for conversion from vnode types to inode formats
97 * and back.
98 */
99enum vtype iftovt_tab[16] = {
100	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
101	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
102};
103int vttoif_tab[9] = {
104	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
105	S_IFSOCK, S_IFIFO, S_IFMT,
106};
107
108/*
109 * List of vnodes that are ready for recycling.
110 */
111static TAILQ_HEAD(freelst, vnode) vnode_free_list;
112
113/*
114 * Minimum number of free vnodes.  If there are fewer free vnodes than this,
115 * getnewvnode() will return a newly allocated vnode.
116 */
117static u_long wantfreevnodes = 25;
118SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
119/* Number of vnodes in the free list. */
120static u_long freevnodes;
121SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
122
123/*
124 * Various variables used for debugging the new implementation of
125 * reassignbuf().
126 * XXX these are probably of (very) limited utility now.
127 */
128static int reassignbufcalls;
129SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
130static int nameileafonly;
131SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");
132
133#ifdef ENABLE_VFS_IOOPT
134/* See NOTES for a description of this setting. */
135int vfs_ioopt;
136SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
137#endif
138
139/*
140 * Cache for the mount type id assigned to NFS.  This is used for
141 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
142 */
143int	nfs_mount_type = -1;
144
145/* To keep more than one thread at a time from running vfs_getnewfsid */
146static struct mtx mntid_mtx;
147
148/*
149 * Lock for any access to the following:
150 *	vnode_free_list
151 *	numvnodes
152 *	freevnodes
153 */
154static struct mtx vnode_free_list_mtx;
155
156/*
157 * For any iteration/modification of dev->si_hlist (linked through
158 * v_specnext)
159 */
160static struct mtx spechash_mtx;
161
162/* Publicly exported FS */
163struct nfs_public nfs_pub;
164
165/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
166static uma_zone_t vnode_zone;
167static uma_zone_t vnodepoll_zone;
168
169/* Set to 1 to print out reclaim of active vnodes */
170int	prtactive;
171
172/*
173 * The workitem queue.
174 *
175 * It is useful to delay writes of file data and filesystem metadata
176 * for tens of seconds so that quickly created and deleted files need
177 * not waste disk bandwidth being created and removed. To realize this,
178 * we append vnodes to a "workitem" queue. When running with a soft
179 * updates implementation, most pending metadata dependencies should
180 * not wait for more than a few seconds. Thus, filesystem metadata written to
181 * block devices is delayed only about half the time that file data is delayed.
182 * Similarly, directory updates are more critical, so they are delayed only
183 * about a third of the time that file data is delayed. Thus, there are
184 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
185 * one each second (driven off the filesystem syncer process). The
186 * syncer_delayno variable indicates the next queue that is to be processed.
187 * Items that need to be processed soon are placed in this queue:
188 *
189 *	syncer_workitem_pending[syncer_delayno]
190 *
191 * A delay of fifteen seconds is done by placing the request fifteen
192 * entries later in the queue:
193 *
194 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
195 *
196 */
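
/*
 * An illustrative example of the arithmetic above (the numbers are
 * hypothetical): assuming the default SYNCER_MAXDELAY of 32, the mask
 * computed by hashinit() in vntblinit() is 31, so if syncer_delayno is
 * currently 20, a fifteen second delay lands in slot
 * (20 + 15) & 31 == 3; the index simply wraps around the ring of
 * delay queues.
 */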
197static int syncer_delayno;
198static long syncer_mask;
199LIST_HEAD(synclist, vnode);
200static struct synclist *syncer_workitem_pending;
201/*
202 * The sync_mtx protects:
203 *	vp->v_synclist
204 *	syncer_delayno
205 *	syncer_workitem_pending
206 *	rushjob
207 */
208static struct mtx sync_mtx;
209
210#define SYNCER_MAXDELAY		32
211static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
212static int syncdelay = 30;		/* max time to delay syncing data */
213static int filedelay = 30;		/* time to delay syncing files */
214SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
215static int dirdelay = 29;		/* time to delay syncing directories */
216SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
217static int metadelay = 28;		/* time to delay syncing metadata */
218SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
219static int rushjob;		/* number of slots to run ASAP */
220static int stat_rush_requests;	/* number of times I/O speeded up */
221SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
222
223/*
224 * Number of vnodes we want to exist at any one time.  This is mostly used
225 * to size hash tables in vnode-related code.  It is normally not used in
226 * getnewvnode(), as wantfreevnodes is normally nonzero.
227 *
228 * XXX desiredvnodes is historical cruft and should not exist.
229 */
230int desiredvnodes;
231SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
232    &desiredvnodes, 0, "Maximum number of vnodes");
233static int minvnodes;
234SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
235    &minvnodes, 0, "Minimum number of vnodes");
236static int vnlru_nowhere;
237SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0,
238    "Number of times the vnlru process ran without success");
239
240/* Hook for calling soft updates */
241int (*softdep_process_worklist_hook)(struct mount *);
242
243/*
244 * This only exists to suppress warnings from unlocked specfs accesses.  It is
245 * no longer ok to have an unlocked VFS.
246 */
247#define IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD)
248
249/* Print lock violations */
250int vfs_badlock_print = 1;
251
252/* Panic on violation */
253int vfs_badlock_panic = 1;
254
255/* Check for interlock across VOPs */
256int vfs_badlock_mutex = 1;
257
258static void
259vfs_badlock(char *msg, char *str, struct vnode *vp)
260{
261	if (vfs_badlock_print)
262		printf("%s: %p %s\n", str, vp, msg);
263	if (vfs_badlock_panic)
264		Debugger("Lock violation.\n");
265}
266
267void
268assert_vi_unlocked(struct vnode *vp, char *str)
269{
270	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
271		vfs_badlock("interlock is locked but should not be", str, vp);
272}
273
274void
275assert_vi_locked(struct vnode *vp, char *str)
276{
277	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
278		vfs_badlock("interlock is not locked but should be", str, vp);
279}
280
281void
282assert_vop_locked(struct vnode *vp, char *str)
283{
284	if (vp && !IGNORE_LOCK(vp) && !VOP_ISLOCKED(vp, NULL))
285		vfs_badlock("is not locked but should be", str, vp);
286}
287
288void
289assert_vop_unlocked(struct vnode *vp, char *str)
290{
291	if (vp && !IGNORE_LOCK(vp) &&
292	    VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE)
293		vfs_badlock("is locked but should not be", str, vp);
294}
295
296void
297assert_vop_elocked(struct vnode *vp, char *str)
298{
299	if (vp && !IGNORE_LOCK(vp) &&
300	    VOP_ISLOCKED(vp, curthread) != LK_EXCLUSIVE)
301		vfs_badlock("is not exclusive locked but should be", str, vp);
302}
303
304void
305assert_vop_elocked_other(struct vnode *vp, char *str)
306{
307	if (vp && !IGNORE_LOCK(vp) &&
308	    VOP_ISLOCKED(vp, curthread) != LK_EXCLOTHER)
309		vfs_badlock("is not exclusive locked by another thread",
310		    str, vp);
311}
312
313void
314assert_vop_slocked(struct vnode *vp, char *str)
315{
316	if (vp && !IGNORE_LOCK(vp) &&
317	    VOP_ISLOCKED(vp, curthread) != LK_SHARED)
318		vfs_badlock("is not locked shared but should be", str, vp);
319}
320
321void
322vop_rename_pre(void *ap)
323{
324	struct vop_rename_args *a = ap;
325
326	if (a->a_tvp)
327		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
328	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
329	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
330	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
331
332	/* Check the source (from) */
333	if (a->a_tdvp != a->a_fdvp)
334		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked.\n");
335	if (a->a_tvp != a->a_fvp)
336		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked.\n");
337
338	/* Check the target */
339	if (a->a_tvp)
340		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked.\n");
341
342	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked.\n");
343}
344
345void
346vop_strategy_pre(void *ap)
347{
348	struct vop_strategy_args *a = ap;
349	struct buf *bp;
350
351	bp = a->a_bp;
352
353	/*
354	 * Cluster ops lock their component buffers but not the IO container.
355	 */
356	if ((bp->b_flags & B_CLUSTER) != 0)
357		return;
358
359	if (BUF_REFCNT(bp) < 1) {
360		if (vfs_badlock_print)
361			printf("VOP_STRATEGY: bp is not locked but should be.\n");
362		if (vfs_badlock_panic)
363			Debugger("Lock violation.\n");
364	}
365}
366
367void
368vop_lookup_pre(void *ap)
369{
370	struct vop_lookup_args *a = ap;
371	struct vnode *dvp;
372
373	dvp = a->a_dvp;
374
375	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
376	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
377}
378
379void
380vop_lookup_post(void *ap, int rc)
381{
382	struct vop_lookup_args *a = ap;
383	struct componentname *cnp;
384	struct vnode *dvp;
385	struct vnode *vp;
386	int flags;
387
388	dvp = a->a_dvp;
389	cnp = a->a_cnp;
390	vp = *(a->a_vpp);
391	flags = cnp->cn_flags;
392
393
394	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
395	/*
396	 * If this is the last path component for this lookup and LOCKPARENT
397	 * is set, or if there is an error, the directory has to be locked.
398	 */
399	if ((flags & LOCKPARENT) && (flags & ISLASTCN))
400		ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (LOCKPARENT)");
401	else if (rc != 0)
402		ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (error)");
403	else if (dvp != vp)
404		ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (dvp)");
405
406	if (flags & PDIRUNLOCK)
407		ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (PDIRUNLOCK)");
408}
409
410void
411vop_unlock_pre(void *ap)
412{
413	struct vop_unlock_args *a = ap;
414
415	if (a->a_flags & LK_INTERLOCK)
416		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
417
418	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
419}
420
421void
422vop_unlock_post(void *ap, int rc)
423{
424	struct vop_unlock_args *a = ap;
425
426	if (a->a_flags & LK_INTERLOCK)
427		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
428}
429
430void
431vop_lock_pre(void *ap)
432{
433	struct vop_lock_args *a = ap;
434
435	if ((a->a_flags & LK_INTERLOCK) == 0)
436		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
437	else
438		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
439}
440
441void
442vop_lock_post(void *ap, int rc)
443{
444	struct vop_lock_args *a;
445
446	a = ap;
447
448	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
449	ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
450}
451
452void
453v_addpollinfo(struct vnode *vp)
454{
455	vp->v_pollinfo = uma_zalloc(vnodepoll_zone, M_WAITOK);
456	mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
457}
458
459/*
460 * Initialize the vnode management data structures.
461 */
462static void
463vntblinit(void *dummy __unused)
464{
465
466	desiredvnodes = maxproc + cnt.v_page_count / 4;
467	minvnodes = desiredvnodes / 4;
468	mtx_init(&mountlist_mtx, "mountlist", NULL, MTX_DEF);
469	mtx_init(&mntvnode_mtx, "mntvnode", NULL, MTX_DEF);
470	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
471	mtx_init(&spechash_mtx, "spechash", NULL, MTX_DEF);
472	TAILQ_INIT(&vnode_free_list);
473	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
474	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
475	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
476	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
477	      NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
478	/*
479	 * Initialize the filesystem syncer.
480	 */
481	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
482		&syncer_mask);
483	syncer_maxdelay = syncer_mask + 1;
484	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
485}
486SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)
487
488
489/*
490 * Mark a mount point as busy. Used to synchronize access and to delay
491 * unmounting. Interlock is not released on failure.
492 */
493int
494vfs_busy(mp, flags, interlkp, td)
495	struct mount *mp;
496	int flags;
497	struct mtx *interlkp;
498	struct thread *td;
499{
500	int lkflags;
501
502	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
503		if (flags & LK_NOWAIT)
504			return (ENOENT);
505		mp->mnt_kern_flag |= MNTK_MWAIT;
506		/*
507		 * Since all busy locks are shared except the exclusive
508		 * lock granted when unmounting, the only place that a
509		 * wakeup needs to be done is at the release of the
510		 * exclusive lock at the end of dounmount.
511		 */
512		msleep(mp, interlkp, PVFS, "vfs_busy", 0);
513		return (ENOENT);
514	}
515	lkflags = LK_SHARED | LK_NOPAUSE;
516	if (interlkp)
517		lkflags |= LK_INTERLOCK;
518	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td))
519		panic("vfs_busy: unexpected lock failure");
520	return (0);
521}
522
523/*
524 * Free a busy filesystem.
525 */
526void
527vfs_unbusy(mp, td)
528	struct mount *mp;
529	struct thread *td;
530{
531
532	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
533}
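
/*
 * Illustrative only (adapted from the vnlru_proc() loop later in this
 * file): the usual way to use vfs_busy()/vfs_unbusy() is a mountlist
 * walk such as
 *
 *	mtx_lock(&mountlist_mtx);
 *	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 *		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
 *			nmp = TAILQ_NEXT(mp, mnt_list);
 *			continue;
 *		}
 *		... operate on the busied mount ...
 *		mtx_lock(&mountlist_mtx);
 *		nmp = TAILQ_NEXT(mp, mnt_list);
 *		vfs_unbusy(mp, td);
 *	}
 *	mtx_unlock(&mountlist_mtx);
 *
 * On success vfs_busy() drops the interlock passed to it; on failure
 * with LK_NOWAIT it returns with the interlock still held, which is why
 * the failure path above can step to the next mount directly.
 */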
534
535/*
536 * Lookup a mount point by filesystem identifier.
537 */
538struct mount *
539vfs_getvfs(fsid)
540	fsid_t *fsid;
541{
542	register struct mount *mp;
543
544	mtx_lock(&mountlist_mtx);
545	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
546		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
547		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
548			mtx_unlock(&mountlist_mtx);
549			return (mp);
550	    }
551	}
552	mtx_unlock(&mountlist_mtx);
553	return ((struct mount *) 0);
554}
555
556/*
557 * Get a new unique fsid.  Try to make its val[0] unique, since this value
558 * will be used to create fake device numbers for stat().  Also try (but
559 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
560 * support 16-bit device numbers.  We end up with unique val[0]'s for the
561 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
562 *
563 * Keep in mind that several mounts may be running in parallel.  Starting
564 * the search one past where the previous search terminated is both a
565 * micro-optimization and a defense against returning the same fsid to
566 * different mounts.
567 */
568void
569vfs_getnewfsid(mp)
570	struct mount *mp;
571{
572	static u_int16_t mntid_base;
573	fsid_t tfsid;
574	int mtype;
575
576	mtx_lock(&mntid_mtx);
577	mtype = mp->mnt_vfc->vfc_typenum;
578	tfsid.val[1] = mtype;
579	mtype = (mtype & 0xFF) << 24;
580	for (;;) {
581		tfsid.val[0] = makeudev(255,
582		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
583		mntid_base++;
584		if (vfs_getvfs(&tfsid) == NULL)
585			break;
586	}
587	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
588	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
589	mtx_unlock(&mntid_mtx);
590}
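
/*
 * Worked example of the packing above (hypothetical values): for a
 * filesystem type with vfc_typenum 1 and mntid_base 0x0102 the minor
 * number handed to makeudev() is
 *
 *	(1 << 24) | ((0x0102 & 0xFF00) << 8) | (0x0102 & 0xFF) == 0x01010002
 *
 * Only the low byte of mntid_base ends up in the low 16 bits, which is
 * why val[0] stays unique mod 2^16 only for the first 2^8 calls.
 */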
591
592/*
593 * Knob to control the precision of file timestamps:
594 *
595 *   0 = seconds only; nanoseconds zeroed.
596 *   1 = seconds and nanoseconds, accurate within 1/HZ.
597 *   2 = seconds and nanoseconds, truncated to microseconds.
598 * >=3 = seconds and nanoseconds, maximum precision.
599 */
600enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
601
602static int timestamp_precision = TSP_SEC;
603SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
604    &timestamp_precision, 0, "");
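
/*
 * Example (runtime tuning sketch, not part of the original code): setting
 * "sysctl vfs.timestamp_precision=3" selects the full-precision
 * nanotime() path in vfs_timestamp() below, while the default of 0
 * (TSP_SEC) keeps second-only timestamps.
 */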
605
606/*
607 * Get a current timestamp.
608 */
609void
610vfs_timestamp(tsp)
611	struct timespec *tsp;
612{
613	struct timeval tv;
614
615	switch (timestamp_precision) {
616	case TSP_SEC:
617		tsp->tv_sec = time_second;
618		tsp->tv_nsec = 0;
619		break;
620	case TSP_HZ:
621		getnanotime(tsp);
622		break;
623	case TSP_USEC:
624		microtime(&tv);
625		TIMEVAL_TO_TIMESPEC(&tv, tsp);
626		break;
627	case TSP_NSEC:
628	default:
629		nanotime(tsp);
630		break;
631	}
632}
633
634/*
635 * Set vnode attributes to VNOVAL
636 */
637void
638vattr_null(vap)
639	register struct vattr *vap;
640{
641
642	vap->va_type = VNON;
643	vap->va_size = VNOVAL;
644	vap->va_bytes = VNOVAL;
645	vap->va_mode = VNOVAL;
646	vap->va_nlink = VNOVAL;
647	vap->va_uid = VNOVAL;
648	vap->va_gid = VNOVAL;
649	vap->va_fsid = VNOVAL;
650	vap->va_fileid = VNOVAL;
651	vap->va_blocksize = VNOVAL;
652	vap->va_rdev = VNOVAL;
653	vap->va_atime.tv_sec = VNOVAL;
654	vap->va_atime.tv_nsec = VNOVAL;
655	vap->va_mtime.tv_sec = VNOVAL;
656	vap->va_mtime.tv_nsec = VNOVAL;
657	vap->va_ctime.tv_sec = VNOVAL;
658	vap->va_ctime.tv_nsec = VNOVAL;
659	vap->va_birthtime.tv_sec = VNOVAL;
660	vap->va_birthtime.tv_nsec = VNOVAL;
661	vap->va_flags = VNOVAL;
662	vap->va_gen = VNOVAL;
663	vap->va_vaflags = 0;
664}
665
666/*
667 * This routine is called when we have too many vnodes.  It attempts
668 * to free <count> vnodes and will potentially free vnodes that still
669 * have VM backing store (VM backing store is typically the cause
670 * of a vnode blowout so we want to do this).  Therefore, this operation
671 * is not considered cheap.
672 *
673 * A number of conditions may prevent a vnode from being reclaimed:
674 * the buffer cache may have references on the vnode, a directory
675 * vnode may still have references due to the namei cache representing
676 * underlying files, or the vnode may be in active use.   It is not
677 * desirable to reuse such vnodes.  These conditions may cause the
678 * number of vnodes to reach some minimum value regardless of what
679 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
680 */
681static int
682vlrureclaim(struct mount *mp, int count)
683{
684	struct vnode *vp;
685	int done;
686	int trigger;
687	int usevnodes;
688
689	/*
690	 * Calculate the trigger point; don't allow user
691	 * screwups to blow us up.   This prevents us from
692	 * recycling vnodes with lots of resident pages.  We
693	 * aren't trying to free memory, we are trying to
694	 * free vnodes.
695	 */
696	usevnodes = desiredvnodes;
697	if (usevnodes <= 0)
698		usevnodes = 1;
699	trigger = cnt.v_page_count * 2 / usevnodes;
700
701	done = 0;
702	mtx_lock(&mntvnode_mtx);
703	while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
704		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
705		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
706
707		if (vp->v_type != VNON &&
708		    vp->v_type != VBAD &&
709		    VI_TRYLOCK(vp)) {
710			if (VMIGHTFREE(vp) &&           /* critical path opt */
711			    (vp->v_object == NULL ||
712			    vp->v_object->resident_page_count < trigger)) {
713				mtx_unlock(&mntvnode_mtx);
714				vgonel(vp, curthread);
715				done++;
716				mtx_lock(&mntvnode_mtx);
717			} else
718				VI_UNLOCK(vp);
719		}
720		--count;
721	}
722	mtx_unlock(&mntvnode_mtx);
723	return done;
724}
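
/*
 * Worked example for the trigger computed in vlrureclaim() (hypothetical
 * sizes): with 262144 physical pages and desiredvnodes == 32768 the
 * trigger is 262144 * 2 / 32768 == 16, so any vnode caching 16 or more
 * resident pages is passed over rather than recycled.
 */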
725
726/*
727 * Attempt to recycle vnodes in a context that is always safe to block.
728 * Calling vlrureclaim() from the bowels of filesystem code has some
729 * interesting deadlock problems.
730 */
731static struct proc *vnlruproc;
732static int vnlruproc_sig;
733
734static void
735vnlru_proc(void)
736{
737	struct mount *mp, *nmp;
738	int s;
739	int done;
740	struct proc *p = vnlruproc;
741	struct thread *td = FIRST_THREAD_IN_PROC(p);	/* XXXKSE */
742
743	mtx_lock(&Giant);
744
745	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
746	    SHUTDOWN_PRI_FIRST);
747
748	s = splbio();
749	for (;;) {
750		kthread_suspend_check(p);
751		mtx_lock(&vnode_free_list_mtx);
752		if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
753			mtx_unlock(&vnode_free_list_mtx);
754			vnlruproc_sig = 0;
755			tsleep(vnlruproc, PVFS, "vlruwt", 0);
756			continue;
757		}
758		mtx_unlock(&vnode_free_list_mtx);
759		done = 0;
760		mtx_lock(&mountlist_mtx);
761		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
762			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
763				nmp = TAILQ_NEXT(mp, mnt_list);
764				continue;
765			}
766			done += vlrureclaim(mp, 10);
767			mtx_lock(&mountlist_mtx);
768			nmp = TAILQ_NEXT(mp, mnt_list);
769			vfs_unbusy(mp, td);
770		}
771		mtx_unlock(&mountlist_mtx);
772		if (done == 0) {
773#if 0
774			/* These messages are temporary debugging aids */
775			if (vnlru_nowhere < 5)
776				printf("vnlru process getting nowhere..\n");
777			else if (vnlru_nowhere == 5)
778				printf("vnlru process messages stopped.\n");
779#endif
780			vnlru_nowhere++;
781			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
782		}
783	}
784	splx(s);
785}
786
787static struct kproc_desc vnlru_kp = {
788	"vnlru",
789	vnlru_proc,
790	&vnlruproc
791};
792SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
793
794
795/*
796 * Routines having to do with the management of the vnode table.
797 */
798
799/*
800 * Check to see if a free vnode can be recycled.  If it can, return it locked
801 * with the vnode lock but not the interlock.  Otherwise indicate the error.
802 */
803static int
804vcanrecycle(struct vnode *vp)
805{
806	struct thread *td = curthread;
807	vm_object_t object;
808	int error;
809
810	/* Don't recycle if we can't get the interlock */
811	if (!VI_TRYLOCK(vp))
812		return (EWOULDBLOCK);
813
814	/* We should be able to immediately acquire this */
815	/* XXX This looks like it should panic if it fails */
816	if (vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td) != 0)
817		return (EWOULDBLOCK);
818
819	/*
820	 * Don't recycle if we still have cached pages.
821	 */
822	if (VOP_GETVOBJECT(vp, &object) == 0 &&
823	     (object->resident_page_count ||
824	      object->ref_count)) {
825		error = EBUSY;
826		goto done;
827	}
828	if (LIST_FIRST(&vp->v_cache_src)) {
829		/*
830		 * note: nameileafonly sysctl is temporary,
831		 * for debugging only, and will eventually be
832		 * removed.
833		 */
834		if (nameileafonly > 0) {
835			/*
836			 * Do not reuse namei-cached directory
837			 * vnodes that have cached
838			 * subdirectories.
839			 */
840			if (cache_leaf_test(vp) < 0) {
841				error = EISDIR;
842				goto done;
843			}
844		} else if (nameileafonly < 0 ||
845			    vmiodirenable == 0) {
846			/*
847			 * Do not reuse namei-cached directory
848			 * vnodes if nameileafonly is -1 or
849			 * if VMIO backing for directories is
850			 * turned off (otherwise we reuse them
851			 * too quickly).
852			 */
853			error = EBUSY;
854			goto done;
855		}
856	}
857	return (0);
858done:
859	VOP_UNLOCK(vp, 0, td);
860	return (error);
861}
862
863/*
864 * Return the next vnode from the free list.
865 */
866int
867getnewvnode(tag, mp, vops, vpp)
868	const char *tag;
869	struct mount *mp;
870	vop_t **vops;
871	struct vnode **vpp;
872{
873	int s;
874	struct thread *td = curthread;	/* XXX */
875	struct vnode *vp = NULL;
876	struct mount *vnmp;
877
878	s = splbio();
879	mtx_lock(&vnode_free_list_mtx);
880
881	/*
882	 * Try to reuse vnodes if we hit the max.  This situation only
883	 * occurs in certain large-memory (2G+) situations.  We cannot
884	 * attempt to directly reclaim vnodes due to nasty recursion
885	 * problems.
886	 */
887	if (vnlruproc_sig == 0 && numvnodes - freevnodes > desiredvnodes) {
888		vnlruproc_sig = 1;      /* avoid unnecessary wakeups */
889		wakeup(vnlruproc);
890	}
891
892	/*
893	 * Attempt to reuse a vnode already on the free list, allocating
894	 * a new vnode if we can't find one or if we have not reached a
895	 * good minimum for good LRU performance.
896	 */
897
898	if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
899		int error;
900		int count;
901
902		for (count = 0; count < freevnodes; count++) {
903			vp = TAILQ_FIRST(&vnode_free_list);
904
905			KASSERT(vp->v_usecount == 0,
906			    ("getnewvnode: free vnode isn't"));
907
908			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
909			/*
910			 * We have to drop the free list mtx to avoid lock
911			 * order reversals with interlock.
912			 */
913			mtx_unlock(&vnode_free_list_mtx);
914			error = vcanrecycle(vp);
915			/*
916			 * Skip over it if its filesystem is being suspended.
917			 */
918			if (error == 0 &&
919			    vn_start_write(vp, &vnmp, V_NOWAIT) != 0)
920				error = EBUSY;
921
922			mtx_lock(&vnode_free_list_mtx);
923			if (error != 0)
924				TAILQ_INSERT_TAIL(&vnode_free_list, vp,
925				    v_freelist);
926			else
927				break;
928		}
929	}
930	if (vp) {
931		freevnodes--;
932		mtx_unlock(&vnode_free_list_mtx);
933
934		cache_purge(vp);
935		VI_LOCK(vp);
936		vp->v_iflag |= VI_DOOMED;
937		vp->v_iflag &= ~VI_FREE;
938		if (vp->v_type != VBAD) {
939			VOP_UNLOCK(vp, 0, td);
940			vgonel(vp, td);
941			VI_LOCK(vp);
942		} else {
943			VOP_UNLOCK(vp, 0, td);
944		}
945		vn_finished_write(vnmp);
946
947#ifdef INVARIANTS
948		{
949			if (vp->v_data)
950				panic("cleaned vnode isn't");
951			if (vp->v_numoutput)
952				panic("Clean vnode has pending I/O's");
953			if (vp->v_writecount != 0)
954				panic("Non-zero write count");
955		}
956#endif
957		if (vp->v_pollinfo) {
958			mtx_destroy(&vp->v_pollinfo->vpi_lock);
959			uma_zfree(vnodepoll_zone, vp->v_pollinfo);
960		}
961		vp->v_pollinfo = NULL;
962#ifdef MAC
963		mac_destroy_vnode(vp);
964#endif
965		vp->v_iflag = 0;
966		vp->v_vflag = 0;
967		vp->v_lastw = 0;
968		vp->v_lasta = 0;
969		vp->v_cstart = 0;
970		vp->v_clen = 0;
971		vp->v_socket = 0;
972		KASSERT(vp->v_cleanblkroot == NULL, ("cleanblkroot not NULL"));
973		KASSERT(vp->v_dirtyblkroot == NULL, ("dirtyblkroot not NULL"));
974	} else {
975		numvnodes++;
976		mtx_unlock(&vnode_free_list_mtx);
977
978		vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
979		mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
980		VI_LOCK(vp);
981		vp->v_dd = vp;
982		cache_purge(vp);
983		LIST_INIT(&vp->v_cache_src);
984		TAILQ_INIT(&vp->v_cache_dst);
985	}
986
987	TAILQ_INIT(&vp->v_cleanblkhd);
988	TAILQ_INIT(&vp->v_dirtyblkhd);
989	vp->v_type = VNON;
990	vp->v_tag = tag;
991	vp->v_op = vops;
992	lockinit(&vp->v_lock, PVFS, "vnlock", VLKTIMEOUT, LK_NOPAUSE);
993#ifdef MAC
994	mac_init_vnode(vp);
995#endif
996	*vpp = vp;
997	vp->v_usecount = 1;
998	vp->v_data = 0;
999	vp->v_cachedid = -1;
1000	VI_UNLOCK(vp);
1001	insmntque(vp, mp);
1002
1003	return (0);
1004}
1005
1006/*
1007 * Move a vnode from one mount queue to another.
1008 */
1009static void
1010insmntque(vp, mp)
1011	register struct vnode *vp;
1012	register struct mount *mp;
1013{
1014
1015	mtx_lock(&mntvnode_mtx);
1016	/*
1017	 * Delete from old mount point vnode list, if on one.
1018	 */
1019	if (vp->v_mount != NULL)
1020		TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes);
1021	/*
1022	 * Insert into list of vnodes for the new mount point, if available.
1023	 */
1024	if ((vp->v_mount = mp) == NULL) {
1025		mtx_unlock(&mntvnode_mtx);
1026		return;
1027	}
1028	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1029	mtx_unlock(&mntvnode_mtx);
1030}
1031
1032/*
1033 * Update outstanding I/O count and do wakeup if requested.
1034 */
1035void
1036vwakeup(bp)
1037	register struct buf *bp;
1038{
1039	register struct vnode *vp;
1040
1041	bp->b_flags &= ~B_WRITEINPROG;
1042	if ((vp = bp->b_vp)) {
1043		VI_LOCK(vp);
1044		vp->v_numoutput--;
1045		if (vp->v_numoutput < 0)
1046			panic("vwakeup: neg numoutput");
1047		if ((vp->v_numoutput == 0) && (vp->v_iflag & VI_BWAIT)) {
1048			vp->v_iflag &= ~VI_BWAIT;
1049			wakeup(&vp->v_numoutput);
1050		}
1051		VI_UNLOCK(vp);
1052	}
1053}
1054
1055/*
1056 * Flush out and invalidate all buffers associated with a vnode.
1057 * Called with the underlying object locked.
1058 */
1059int
1060vinvalbuf(vp, flags, cred, td, slpflag, slptimeo)
1061	struct vnode *vp;
1062	int flags;
1063	struct ucred *cred;
1064	struct thread *td;
1065	int slpflag, slptimeo;
1066{
1067	struct buf *blist;
1068	int s, error;
1069	vm_object_t object;
1070
1071	GIANT_REQUIRED;
1072
1073	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
1074
1075	VI_LOCK(vp);
1076	if (flags & V_SAVE) {
1077		s = splbio();
1078		while (vp->v_numoutput) {
1079			vp->v_iflag |= VI_BWAIT;
1080			error = msleep(&vp->v_numoutput, VI_MTX(vp),
1081			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
1082			if (error) {
1083				VI_UNLOCK(vp);
1084				splx(s);
1085				return (error);
1086			}
1087		}
1088		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
1089			splx(s);
1090			VI_UNLOCK(vp);
1091			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, td)) != 0)
1092				return (error);
1093			/*
1094			 * XXX We could save a lock/unlock if this was only
1095			 * enabled under INVARIANTS
1096			 */
1097			VI_LOCK(vp);
1098			s = splbio();
1099			if (vp->v_numoutput > 0 ||
1100			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
1101				panic("vinvalbuf: dirty bufs");
1102		}
1103		splx(s);
1104	}
1105	s = splbio();
1106	/*
1107	 * If you alter this loop please notice that interlock is dropped and
1108	 * reacquired in flushbuflist.  Special care is needed to ensure that
1109	 * no race conditions occur from this.
1110	 */
1111	for (error = 0;;) {
1112		if ((blist = TAILQ_FIRST(&vp->v_cleanblkhd)) != 0 &&
1113		    flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
1114			if (error)
1115				break;
1116			continue;
1117		}
1118		if ((blist = TAILQ_FIRST(&vp->v_dirtyblkhd)) != 0 &&
1119		    flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
1120			if (error)
1121				break;
1122			continue;
1123		}
1124		break;
1125	}
1126	if (error) {
1127		splx(s);
1128		VI_UNLOCK(vp);
1129		return (error);
1130	}
1131
1132	/*
1133	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
1134	 * have write I/O in-progress but if there is a VM object then the
1135	 * VM object can also have read-I/O in-progress.
1136	 */
1137	do {
1138		while (vp->v_numoutput > 0) {
1139			vp->v_iflag |= VI_BWAIT;
1140			msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vnvlbv", 0);
1141		}
1142		VI_UNLOCK(vp);
1143		if (VOP_GETVOBJECT(vp, &object) == 0) {
1144			while (object->paging_in_progress)
1145				vm_object_pip_sleep(object, "vnvlbx");
1146		}
1147		VI_LOCK(vp);
1148	} while (vp->v_numoutput > 0);
1149	VI_UNLOCK(vp);
1150
1151	splx(s);
1152
1153	/*
1154	 * Destroy the copy in the VM cache, too.
1155	 */
1156	if (VOP_GETVOBJECT(vp, &object) == 0) {
1157		vm_object_page_remove(object, 0, 0,
1158			(flags & V_SAVE) ? TRUE : FALSE);
1159	}
1160
1161#ifdef INVARIANTS
1162	VI_LOCK(vp);
1163	if ((flags & (V_ALT | V_NORMAL)) == 0 &&
1164	    (!TAILQ_EMPTY(&vp->v_dirtyblkhd) ||
1165	     !TAILQ_EMPTY(&vp->v_cleanblkhd)))
1166		panic("vinvalbuf: flush failed");
1167	VI_UNLOCK(vp);
1168#endif
1169	return (0);
1170}
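
/*
 * Typical invocation (an illustrative sketch only):
 *
 *	error = vinvalbuf(vp, V_SAVE, cred, td, 0, 0);
 *
 * V_SAVE writes dirty buffers back before the invalidation; calling with
 * flags == 0 instead discards clean and dirty buffers alike without
 * writing them.
 */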
1171
1172/*
1173 * Flush out buffers on the specified list.
1174 *
1175 */
1176static int
1177flushbuflist(blist, flags, vp, slpflag, slptimeo, errorp)
1178	struct buf *blist;
1179	int flags;
1180	struct vnode *vp;
1181	int slpflag, slptimeo;
1182	int *errorp;
1183{
1184	struct buf *bp, *nbp;
1185	int found, error;
1186
1187	ASSERT_VI_LOCKED(vp, "flushbuflist");
1188
1189	for (found = 0, bp = blist; bp; bp = nbp) {
1190		nbp = TAILQ_NEXT(bp, b_vnbufs);
1191		VI_UNLOCK(vp);
1192		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1193		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1194			VI_LOCK(vp);
1195			continue;
1196		}
1197		found += 1;
1198		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
1199			error = BUF_TIMELOCK(bp,
1200			    LK_EXCLUSIVE | LK_SLEEPFAIL,
1201			    "flushbuf", slpflag, slptimeo);
1202			if (error != ENOLCK)
1203				*errorp = error;
1204			goto done;
1205		}
1206		/*
1207		 * XXX Since there are no node locks for NFS, I
1208		 * believe there is a slight chance that a delayed
1209		 * write will occur while sleeping just above, so
1210		 * check for it.  Note that vfs_bio_awrite expects
1211		 * buffers to reside on a queue, while BUF_WRITE and
1212		 * brelse do not.
1213		 */
1214		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1215			(flags & V_SAVE)) {
1216
1217			if (bp->b_vp == vp) {
1218				if (bp->b_flags & B_CLUSTEROK) {
1219					BUF_UNLOCK(bp);
1220					vfs_bio_awrite(bp);
1221				} else {
1222					bremfree(bp);
1223					bp->b_flags |= B_ASYNC;
1224					BUF_WRITE(bp);
1225				}
1226			} else {
1227				bremfree(bp);
1228				(void) BUF_WRITE(bp);
1229			}
1230			goto done;
1231		}
1232		bremfree(bp);
1233		bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
1234		bp->b_flags &= ~B_ASYNC;
1235		brelse(bp);
1236		VI_LOCK(vp);
1237	}
1238	return (found);
1239done:
1240	VI_LOCK(vp);
1241	return (found);
1242}
1243
1244/*
1245 * Truncate a file's buffers and pages to a specified length.  This
1246 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1247 * sync activity.
1248 */
1249int
1250vtruncbuf(vp, cred, td, length, blksize)
1251	register struct vnode *vp;
1252	struct ucred *cred;
1253	struct thread *td;
1254	off_t length;
1255	int blksize;
1256{
1257	register struct buf *bp;
1258	struct buf *nbp;
1259	int s, anyfreed;
1260	int trunclbn;
1261
1262	/*
1263	 * Round up to the *next* lbn.
1264	 */
1265	trunclbn = (length + blksize - 1) / blksize;
1266
1267	s = splbio();
1268	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1269restart:
1270	VI_LOCK(vp);
1271	anyfreed = 1;
1272	for (;anyfreed;) {
1273		anyfreed = 0;
1274		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
1275			nbp = TAILQ_NEXT(bp, b_vnbufs);
1276			VI_UNLOCK(vp);
1277			if (bp->b_lblkno >= trunclbn) {
1278				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
1279					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
1280					goto restart;
1281				} else {
1282					bremfree(bp);
1283					bp->b_flags |= (B_INVAL | B_RELBUF);
1284					bp->b_flags &= ~B_ASYNC;
1285					brelse(bp);
1286					anyfreed = 1;
1287				}
1288				if (nbp &&
1289				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1290				    (nbp->b_vp != vp) ||
1291				    (nbp->b_flags & B_DELWRI))) {
1292					goto restart;
1293				}
1294			}
1295			VI_LOCK(vp);
1296		}
1297
1298		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1299			nbp = TAILQ_NEXT(bp, b_vnbufs);
1300			VI_UNLOCK(vp);
1301			if (bp->b_lblkno >= trunclbn) {
1302				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
1303					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
1304					goto restart;
1305				} else {
1306					bremfree(bp);
1307					bp->b_flags |= (B_INVAL | B_RELBUF);
1308					bp->b_flags &= ~B_ASYNC;
1309					brelse(bp);
1310					anyfreed = 1;
1311				}
1312				if (nbp &&
1313				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1314				    (nbp->b_vp != vp) ||
1315				    (nbp->b_flags & B_DELWRI) == 0)) {
1316					goto restart;
1317				}
1318			}
1319			VI_LOCK(vp);
1320		}
1321	}
1322
1323	if (length > 0) {
1324restartsync:
1325		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1326			nbp = TAILQ_NEXT(bp, b_vnbufs);
1327			VI_UNLOCK(vp);
1328			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
1329				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
1330					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
1331					goto restart;
1332				} else {
1333					bremfree(bp);
1334					if (bp->b_vp == vp) {
1335						bp->b_flags |= B_ASYNC;
1336					} else {
1337						bp->b_flags &= ~B_ASYNC;
1338					}
1339					BUF_WRITE(bp);
1340				}
1341				VI_LOCK(vp);
1342				goto restartsync;
1343			}
1344			VI_LOCK(vp);
1345		}
1346	}
1347
1348	while (vp->v_numoutput > 0) {
1349		vp->v_iflag |= VI_BWAIT;
1350		msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vbtrunc", 0);
1351	}
1352	VI_UNLOCK(vp);
1353	splx(s);
1354
1355	vnode_pager_setsize(vp, length);
1356
1357	return (0);
1358}
1359
1360/*
1361 * buf_splay() - splay tree core for the clean/dirty list of buffers in
1362 * 		 a vnode.
1363 *
1364 *	NOTE: We have to deal with the special case of a background bitmap
1365 *	buffer, a situation where two buffers will have the same logical
1366 *	block offset.  We want (1) only the foreground buffer to be accessed
1367 *	in a lookup and (2) must differentiate between the foreground and
1368 *	background buffer in the splay tree algorithm because the splay
1369 *	tree cannot normally handle multiple entities with the same 'index'.
1370 *	We accomplish this by adding differentiating flags to the splay tree's
1371 *	numerical domain.
1372 */
1373static
1374struct buf *
1375buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
1376{
1377	struct buf dummy;
1378	struct buf *lefttreemax, *righttreemin, *y;
1379
1380	if (root == NULL)
1381		return (NULL);
1382	lefttreemax = righttreemin = &dummy;
1383	for (;;) {
1384		if (lblkno < root->b_lblkno ||
1385		    (lblkno == root->b_lblkno &&
1386		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1387			if ((y = root->b_left) == NULL)
1388				break;
1389			if (lblkno < y->b_lblkno) {
1390				/* Rotate right. */
1391				root->b_left = y->b_right;
1392				y->b_right = root;
1393				root = y;
1394				if ((y = root->b_left) == NULL)
1395					break;
1396			}
1397			/* Link into the new root's right tree. */
1398			righttreemin->b_left = root;
1399			righttreemin = root;
1400		} else if (lblkno > root->b_lblkno ||
1401		    (lblkno == root->b_lblkno &&
1402		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
1403			if ((y = root->b_right) == NULL)
1404				break;
1405			if (lblkno > y->b_lblkno) {
1406				/* Rotate left. */
1407				root->b_right = y->b_left;
1408				y->b_left = root;
1409				root = y;
1410				if ((y = root->b_right) == NULL)
1411					break;
1412			}
1413			/* Link into the new root's left tree. */
1414			lefttreemax->b_right = root;
1415			lefttreemax = root;
1416		} else {
1417			break;
1418		}
1419		root = y;
1420	}
1421	/* Assemble the new root. */
1422	lefttreemax->b_right = root->b_left;
1423	righttreemin->b_left = root->b_right;
1424	root->b_left = dummy.b_right;
1425	root->b_right = dummy.b_left;
1426	return (root);
1427}
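
/*
 * Note on the BX_BKGRDMARKER handling above: lookups such as gbincore()
 * below pass 0 for xflags, so a foreground buffer (marker clear)
 * compares equal and is splayed to the root, while the background
 * marker buffer for the same lblkno sorts just after it and is skipped
 * by ordinary lookups.
 */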
1428
1429static
1430void
1431buf_vlist_remove(struct buf *bp)
1432{
1433	struct vnode *vp = bp->b_vp;
1434	struct buf *root;
1435
1436	ASSERT_VI_LOCKED(vp, "buf_vlist_remove");
1437	if (bp->b_xflags & BX_VNDIRTY) {
1438		if (bp != vp->v_dirtyblkroot) {
1439			root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_dirtyblkroot);
1440			KASSERT(root == bp, ("splay lookup failed during dirty remove"));
1441		}
1442		if (bp->b_left == NULL) {
1443			root = bp->b_right;
1444		} else {
1445			root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
1446			root->b_right = bp->b_right;
1447		}
1448		vp->v_dirtyblkroot = root;
1449		TAILQ_REMOVE(&vp->v_dirtyblkhd, bp, b_vnbufs);
1450	} else {
1451		/* KASSERT(bp->b_xflags & BX_VNCLEAN, ("bp wasn't clean")); */
1452		if (bp != vp->v_cleanblkroot) {
1453			root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_cleanblkroot);
1454			KASSERT(root == bp, ("splay lookup failed during clean remove"));
1455		}
1456		if (bp->b_left == NULL) {
1457			root = bp->b_right;
1458		} else {
1459			root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
1460			root->b_right = bp->b_right;
1461		}
1462		vp->v_cleanblkroot = root;
1463		TAILQ_REMOVE(&vp->v_cleanblkhd, bp, b_vnbufs);
1464	}
1465	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1466}
1467
1468/*
1469 * Add the buffer to the sorted clean or dirty block list using a
1470 * splay tree algorithm.
1471 *
1472 * NOTE: xflags is passed as a constant, optimizing this inline function!
1473 */
1474static
1475void
1476buf_vlist_add(struct buf *bp, struct vnode *vp, b_xflags_t xflags)
1477{
1478	struct buf *root;
1479
1480	ASSERT_VI_LOCKED(vp, "buf_vlist_add");
1481	bp->b_xflags |= xflags;
1482	if (xflags & BX_VNDIRTY) {
1483		root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_dirtyblkroot);
1484		if (root == NULL) {
1485			bp->b_left = NULL;
1486			bp->b_right = NULL;
1487			TAILQ_INSERT_TAIL(&vp->v_dirtyblkhd, bp, b_vnbufs);
1488		} else if (bp->b_lblkno < root->b_lblkno ||
1489		    (bp->b_lblkno == root->b_lblkno &&
1490		    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1491			bp->b_left = root->b_left;
1492			bp->b_right = root;
1493			root->b_left = NULL;
1494			TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
1495		} else {
1496			bp->b_right = root->b_right;
1497			bp->b_left = root;
1498			root->b_right = NULL;
1499			TAILQ_INSERT_AFTER(&vp->v_dirtyblkhd,
1500			    root, bp, b_vnbufs);
1501		}
1502		vp->v_dirtyblkroot = bp;
1503	} else {
1504		/* KASSERT(xflags & BX_VNCLEAN, ("xflags not clean")); */
1505		root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_cleanblkroot);
1506		if (root == NULL) {
1507			bp->b_left = NULL;
1508			bp->b_right = NULL;
1509			TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
1510		} else if (bp->b_lblkno < root->b_lblkno ||
1511		    (bp->b_lblkno == root->b_lblkno &&
1512		    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1513			bp->b_left = root->b_left;
1514			bp->b_right = root;
1515			root->b_left = NULL;
1516			TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
1517		} else {
1518			bp->b_right = root->b_right;
1519			bp->b_left = root;
1520			root->b_right = NULL;
1521			TAILQ_INSERT_AFTER(&vp->v_cleanblkhd,
1522			    root, bp, b_vnbufs);
1523		}
1524		vp->v_cleanblkroot = bp;
1525	}
1526}
1527
1528#ifndef USE_BUFHASH
1529
1530/*
1531 * Lookup a buffer using the splay tree.  Note that we specifically avoid
1532 * shadow buffers used in background bitmap writes.
1533 *
1534 * This code isn't quite as efficient as it could be because we are maintaining
1535 * two sorted lists and do not know which list the block resides in.
1536 */
1537struct buf *
1538gbincore(struct vnode *vp, daddr_t lblkno)
1539{
1540	struct buf *bp;
1541
1542	GIANT_REQUIRED;
1543
1544	ASSERT_VI_LOCKED(vp, "gbincore");
1545	bp = vp->v_cleanblkroot = buf_splay(lblkno, 0, vp->v_cleanblkroot);
1546	if (bp && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1547		return(bp);
1548	bp = vp->v_dirtyblkroot = buf_splay(lblkno, 0, vp->v_dirtyblkroot);
1549	if (bp && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1550		return(bp);
1551	return(NULL);
1552}
1553
1554#endif
1555
1556/*
1557 * Associate a buffer with a vnode.
1558 */
1559void
1560bgetvp(vp, bp)
1561	register struct vnode *vp;
1562	register struct buf *bp;
1563{
1564	int s;
1565
1566	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
1567
1568	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1569	    ("bgetvp: bp already attached! %p", bp));
1570
1571	VI_LOCK(vp);
1572	vholdl(vp);
1573	bp->b_vp = vp;
1574	bp->b_dev = vn_todev(vp);
1575	/*
1576	 * Insert onto list for new vnode.
1577	 */
1578	s = splbio();
1579	buf_vlist_add(bp, vp, BX_VNCLEAN);
1580	splx(s);
1581	VI_UNLOCK(vp);
1582}
1583
1584/*
1585 * Disassociate a buffer from a vnode.
1586 */
1587void
1588brelvp(bp)
1589	register struct buf *bp;
1590{
1591	struct vnode *vp;
1592	int s;
1593
1594	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1595
1596	/*
1597	 * Delete from old vnode list, if on one.
1598	 */
1599	vp = bp->b_vp;
1600	s = splbio();
1601	VI_LOCK(vp);
1602	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1603		buf_vlist_remove(bp);
1604	if ((vp->v_iflag & VI_ONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
1605		vp->v_iflag &= ~VI_ONWORKLST;
1606		mtx_lock(&sync_mtx);
1607		LIST_REMOVE(vp, v_synclist);
1608		mtx_unlock(&sync_mtx);
1609	}
1610	vdropl(vp);
1611	VI_UNLOCK(vp);
1612	bp->b_vp = (struct vnode *) 0;
1613	if (bp->b_object)
1614		bp->b_object = NULL;
1615	splx(s);
1616}
1617
1618/*
1619 * Add an item to the syncer work queue.
1620 */
1621static void
1622vn_syncer_add_to_worklist(struct vnode *vp, int delay)
1623{
1624	int s, slot;
1625
1626	s = splbio();
1627	ASSERT_VI_LOCKED(vp, "vn_syncer_add_to_worklist");
1628
1629	mtx_lock(&sync_mtx);
1630	if (vp->v_iflag & VI_ONWORKLST)
1631		LIST_REMOVE(vp, v_synclist);
1632	else
1633		vp->v_iflag |= VI_ONWORKLST;
1634
1635	if (delay > syncer_maxdelay - 2)
1636		delay = syncer_maxdelay - 2;
1637	slot = (syncer_delayno + delay) & syncer_mask;
1638
1639	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
1640	mtx_unlock(&sync_mtx);
1641
1642	splx(s);
1643}
1644
1645struct  proc *updateproc;
1646static void sched_sync(void);
1647static struct kproc_desc up_kp = {
1648	"syncer",
1649	sched_sync,
1650	&updateproc
1651};
1652SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
1653
1654/*
1655 * System filesystem synchronizer daemon.
1656 */
1657void
1658sched_sync(void)
1659{
1660	struct synclist *slp;
1661	struct vnode *vp;
1662	struct mount *mp;
1663	long starttime;
1664	int s;
1665	struct thread *td = FIRST_THREAD_IN_PROC(updateproc);  /* XXXKSE */
1666
1667	mtx_lock(&Giant);
1668
1669	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, td->td_proc,
1670	    SHUTDOWN_PRI_LAST);
1671
1672	for (;;) {
1673		kthread_suspend_check(td->td_proc);
1674
1675		starttime = time_second;
1676
1677		/*
1678		 * Push files whose dirty time has expired.  Be careful
1679		 * of interrupt race on slp queue.
1680		 */
1681		s = splbio();
1682		mtx_lock(&sync_mtx);
1683		slp = &syncer_workitem_pending[syncer_delayno];
1684		syncer_delayno += 1;
1685		if (syncer_delayno == syncer_maxdelay)
1686			syncer_delayno = 0;
1687		splx(s);
1688
1689		while ((vp = LIST_FIRST(slp)) != NULL) {
1690			mtx_unlock(&sync_mtx);
1691			if (VOP_ISLOCKED(vp, NULL) == 0 &&
1692			    vn_start_write(vp, &mp, V_NOWAIT) == 0) {
1693				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1694				(void) VOP_FSYNC(vp, td->td_ucred, MNT_LAZY, td);
1695				VOP_UNLOCK(vp, 0, td);
1696				vn_finished_write(mp);
1697			}
1698			s = splbio();
1699			mtx_lock(&sync_mtx);
1700			if (LIST_FIRST(slp) == vp) {
1701				mtx_unlock(&sync_mtx);
1702				/*
1703				 * Note: VFS vnodes can remain on the
1704				 * worklist too with no dirty blocks, but
1705				 * since sync_fsync() moves them to a different
1706				 * slot we are safe.
1707				 */
1708				VI_LOCK(vp);
1709				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
1710				    !vn_isdisk(vp, NULL)) {
1711					panic("sched_sync: fsync failed "
1712					      "vp %p tag %s", vp, vp->v_tag);
1713				}
1714				/*
1715				 * Put us back on the worklist.  The worklist
1716				 * routine will remove us from our current
1717				 * position and then add us back in at a later
1718				 * position.
1719				 */
1720				vn_syncer_add_to_worklist(vp, syncdelay);
1721				VI_UNLOCK(vp);
1722				mtx_lock(&sync_mtx);
1723			}
1724			splx(s);
1725		}
1726		mtx_unlock(&sync_mtx);
1727
1728		/*
1729		 * Do soft update processing.
1730		 */
1731		if (softdep_process_worklist_hook != NULL)
1732			(*softdep_process_worklist_hook)(NULL);
1733
1734		/*
1735		 * The variable rushjob allows the kernel to speed up the
1736		 * processing of the filesystem syncer process. A rushjob
1737		 * value of N tells the filesystem syncer to process the next
1738		 * N seconds worth of work on its queue ASAP. Currently rushjob
1739		 * is used by the soft update code to speed up the filesystem
1740		 * syncer process when the incore state is getting so far
1741		 * ahead of the disk that the kernel memory pool is being
1742		 * threatened with exhaustion.
1743		 */
1744		mtx_lock(&sync_mtx);
1745		if (rushjob > 0) {
1746			rushjob -= 1;
1747			mtx_unlock(&sync_mtx);
1748			continue;
1749		}
1750		mtx_unlock(&sync_mtx);
1751		/*
1752		 * If it has taken us less than a second to process the
1753		 * current work, then wait. Otherwise start right over
1754		 * again. We can still lose time if any single round
1755		 * takes more than two seconds, but it does not really
1756		 * matter as we are just trying to generally pace the
1757		 * filesystem activity.
1758		 */
1759		if (time_second == starttime)
1760			tsleep(&lbolt, PPAUSE, "syncer", 0);
1761	}
1762}
1763
1764/*
1765 * Request the syncer daemon to speed up its work.
1766 * We never push it to speed up more than half of its
1767 * normal turn time, otherwise it could take over the cpu.
1768 * XXXKSE  only one update?
1769 */
1770int
1771speedup_syncer()
1772{
1773	struct thread *td;
1774	int ret = 0;
1775
1776	td = FIRST_THREAD_IN_PROC(updateproc);
1777	mtx_lock_spin(&sched_lock);
1778	if (td->td_wchan == &lbolt) {
1779		unsleep(td);
1780		TD_CLR_SLEEPING(td);
1781		setrunnable(td);
1782	}
1783	mtx_unlock_spin(&sched_lock);
1784	mtx_lock(&sync_mtx);
1785	if (rushjob < syncdelay / 2) {
1786		rushjob += 1;
1787		stat_rush_requests += 1;
1788		ret = 1;
1789	}
1790	mtx_unlock(&sync_mtx);
1791	return (ret);
1792}
1793
1794/*
1795 * Associate a p-buffer with a vnode.
1796 *
1797 * Also sets B_PAGING flag to indicate that vnode is not fully associated
1798 * with the buffer.  i.e. the bp has not been linked into the vnode or
1799 * ref-counted.
1800 */
1801void
1802pbgetvp(vp, bp)
1803	register struct vnode *vp;
1804	register struct buf *bp;
1805{
1806
1807	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1808
1809	bp->b_vp = vp;
1810	bp->b_flags |= B_PAGING;
1811	bp->b_dev = vn_todev(vp);
1812}
1813
1814/*
1815 * Disassociate a p-buffer from a vnode.
1816 */
1817void
1818pbrelvp(bp)
1819	register struct buf *bp;
1820{
1821
1822	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1823
1824	/* XXX REMOVE ME */
1825	VI_LOCK(bp->b_vp);
1826	if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
1827		panic(
1828		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1829		    bp,
1830		    (int)bp->b_flags
1831		);
1832	}
1833	VI_UNLOCK(bp->b_vp);
1834	bp->b_vp = (struct vnode *) 0;
1835	bp->b_flags &= ~B_PAGING;
1836}
1837
1838/*
1839 * Reassign a buffer from one vnode to another.
1840 * Used to assign file specific control information
1841 * (indirect blocks) to the vnode to which they belong.
1842 */
1843void
1844reassignbuf(bp, newvp)
1845	register struct buf *bp;
1846	register struct vnode *newvp;
1847{
1848	int delay;
1849	int s;
1850
1851	if (newvp == NULL) {
1852		printf("reassignbuf: NULL");
1853		return;
1854	}
1855	++reassignbufcalls;
1856
1857	/*
1858	 * B_PAGING flagged buffers cannot be reassigned because their vp
1859	 * is not fully linked in.
1860	 */
1861	if (bp->b_flags & B_PAGING)
1862		panic("cannot reassign paging buffer");
1863
1864	s = splbio();
1865	/*
1866	 * Delete from old vnode list, if on one.
1867	 */
1868	VI_LOCK(bp->b_vp);
1869	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
1870		buf_vlist_remove(bp);
1871		if (bp->b_vp != newvp) {
1872			vdropl(bp->b_vp);
1873			bp->b_vp = NULL;	/* for clarification */
1874		}
1875	}
1876	VI_UNLOCK(bp->b_vp);
1877	/*
1878	 * If dirty, put on list of dirty buffers; otherwise insert onto list
1879	 * of clean buffers.
1880	 */
1881	VI_LOCK(newvp);
1882	if (bp->b_flags & B_DELWRI) {
1883		if ((newvp->v_iflag & VI_ONWORKLST) == 0) {
1884			switch (newvp->v_type) {
1885			case VDIR:
1886				delay = dirdelay;
1887				break;
1888			case VCHR:
1889				if (newvp->v_rdev->si_mountpoint != NULL) {
1890					delay = metadelay;
1891					break;
1892				}
1893				/* FALLTHROUGH */
1894			default:
1895				delay = filedelay;
1896			}
1897			vn_syncer_add_to_worklist(newvp, delay);
1898		}
1899		buf_vlist_add(bp, newvp, BX_VNDIRTY);
1900	} else {
1901		buf_vlist_add(bp, newvp, BX_VNCLEAN);
1902
1903		if ((newvp->v_iflag & VI_ONWORKLST) &&
1904		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1905			mtx_lock(&sync_mtx);
1906			LIST_REMOVE(newvp, v_synclist);
1907			mtx_unlock(&sync_mtx);
1908			newvp->v_iflag &= ~VI_ONWORKLST;
1909		}
1910	}
1911	if (bp->b_vp != newvp) {
1912		bp->b_vp = newvp;
1913		vholdl(bp->b_vp);
1914	}
1915	VI_UNLOCK(newvp);
1916	splx(s);
1917}
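
/*
 * For example (values from the tunables above): a delayed-write buffer
 * reassigned to a directory vnode is scheduled dirdelay (29) seconds
 * out, one on a device vnode backing a mounted filesystem metadelay
 * (28) seconds out, and anything else filedelay (30) seconds out.
 */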
1918
1919/*
1920 * Create a vnode for a device.
1921 * Used for mounting the root filesystem.
1922 */
1923int
1924bdevvp(dev, vpp)
1925	dev_t dev;
1926	struct vnode **vpp;
1927{
1928	register struct vnode *vp;
1929	struct vnode *nvp;
1930	int error;
1931
1932	if (dev == NODEV) {
1933		*vpp = NULLVP;
1934		return (ENXIO);
1935	}
1936	if (vfinddev(dev, VCHR, vpp))
1937		return (0);
1938	error = getnewvnode("none", (struct mount *)0, spec_vnodeop_p, &nvp);
1939	if (error) {
1940		*vpp = NULLVP;
1941		return (error);
1942	}
1943	vp = nvp;
1944	vp->v_type = VCHR;
1945	addalias(vp, dev);
1946	*vpp = vp;
1947	return (0);
1948}
1949
1950/*
1951 * Add vnode to the alias list hung off the dev_t.
1952 *
1953 * The reason for this gunk is that multiple vnodes can reference
1954 * the same physical device, so checking vp->v_usecount to see
1955 * how many users there are is inadequate; the v_usecount values for
1956 * the vnodes need to be accumulated.  vcount() does that.
1957 */
1958struct vnode *
1959addaliasu(nvp, nvp_rdev)
1960	struct vnode *nvp;
1961	udev_t nvp_rdev;
1962{
1963	struct vnode *ovp;
1964	vop_t **ops;
1965	dev_t dev;
1966
1967	if (nvp->v_type == VBLK)
1968		return (nvp);
1969	if (nvp->v_type != VCHR)
1970		panic("addaliasu on non-special vnode");
1971	dev = udev2dev(nvp_rdev, 0);
1972	/*
1973	 * Check to see if we have a bdevvp vnode with no associated
1974	 * filesystem. If so, we want to associate the filesystem of
1975 * the newly instigated vnode with the bdevvp vnode and
1976	 * discard the newly created vnode rather than leaving the
1977	 * bdevvp vnode lying around with no associated filesystem.
1978	 */
1979	if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
1980		addalias(nvp, dev);
1981		return (nvp);
1982	}
1983	/*
1984	 * Discard unneeded vnode, but save its node specific data.
1985	 * Note that if there is a lock, it is carried over in the
1986	 * node specific data to the replacement vnode.
1987	 */
1988	vref(ovp);
1989	ovp->v_data = nvp->v_data;
1990	ovp->v_tag = nvp->v_tag;
1991	nvp->v_data = NULL;
1992	lockinit(&ovp->v_lock, PVFS, nvp->v_lock.lk_wmesg,
1993	    nvp->v_lock.lk_timo, nvp->v_lock.lk_flags & LK_EXTFLG_MASK);
1994	if (nvp->v_vnlock)
1995		ovp->v_vnlock = &ovp->v_lock;
1996	ops = ovp->v_op;
1997	ovp->v_op = nvp->v_op;
1998	if (VOP_ISLOCKED(nvp, curthread)) {
1999		VOP_UNLOCK(nvp, 0, curthread);
2000		vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curthread);
2001	}
2002	nvp->v_op = ops;
2003	insmntque(ovp, nvp->v_mount);
2004	vrele(nvp);
2005	vgone(nvp);
2006	return (ovp);
2007}
2008
2009/* This is a local helper function that does the same as addaliasu, but for
2010 * a dev_t instead of a udev_t. */
2011static void
2012addalias(nvp, dev)
2013	struct vnode *nvp;
2014	dev_t dev;
2015{
2016
2017	KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
2018	nvp->v_rdev = dev;
2019	mtx_lock(&spechash_mtx);
2020	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
2021	mtx_unlock(&spechash_mtx);
2022}
2023
2024/*
2025 * Grab a particular vnode from the free list, increment its
2026 * reference count and lock it. The vnode lock bit is set if the
2027 * vnode is being eliminated in vgone. The process is awakened
2028 * when the transition is completed, and an error returned to
2029 * indicate that the vnode is no longer usable (possibly having
2030 * been changed to a new filesystem type).
2031 */
2032int
2033vget(vp, flags, td)
2034	register struct vnode *vp;
2035	int flags;
2036	struct thread *td;
2037{
2038	int error;
2039
2040	/*
2041	 * If the vnode is in the process of being cleaned out for
2042	 * another use, we wait for the cleaning to finish and then
2043	 * return failure. Cleaning is determined by checking that
2044	 * the VI_XLOCK flag is set.
2045	 */
2046	if ((flags & LK_INTERLOCK) == 0)
2047		VI_LOCK(vp);
2048	if (vp->v_iflag & VI_XLOCK && vp->v_vxproc != curthread) {
2049		vp->v_iflag |= VI_XWANT;
2050		msleep(vp, VI_MTX(vp), PINOD | PDROP, "vget", 0);
2051		return (ENOENT);
2052	}
2053
2054	vp->v_usecount++;
2055
2056	if (VSHOULDBUSY(vp))
2057		vbusy(vp);
2058	if (flags & LK_TYPE_MASK) {
2059		if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) {
2060			/*
2061			 * We must expand vrele() inline here because we do not want
2062			 * to call VOP_INACTIVE if the reference count
2063			 * drops back to zero since it was never really
2064			 * active. We must remove it from the free list
2065			 * before sleeping so that multiple processes do
2066			 * not try to recycle it.
2067			 */
2068			VI_LOCK(vp);
2069			vp->v_usecount--;
2070			if (VSHOULDFREE(vp))
2071				vfree(vp);
2072			else
2073				vlruvp(vp);
2074			VI_UNLOCK(vp);
2075		}
2076		return (error);
2077	}
2078	VI_UNLOCK(vp);
2079	return (0);
2080}
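
/*
 * Illustrative sketch (not part of the original source): the common
 * vget()/vput() pairing, assuming the caller found vp on some list and
 * has the current thread in td.  A non-zero return means the vnode
 * could not be locked (for instance because it is being recycled) and
 * must not be used.
 *
 *	if (vget(vp, LK_EXCLUSIVE, td) == 0) {
 *		... use the referenced, exclusively locked vnode ...
 *		vput(vp);	(drops both the lock and the reference)
 *	}
 */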
2081
2082/*
2083 * Increase the reference count of a vnode.
2084 */
2085void
2086vref(struct vnode *vp)
2087{
2088	VI_LOCK(vp);
2089	vp->v_usecount++;
2090	VI_UNLOCK(vp);
2091}
2092
2093/*
2094 * Return reference count of a vnode.
2095 *
2096 * The results of this call are only guaranteed when some mechanism other
2097 * than the VI lock is used to stop other processes from gaining references
2098 * to the vnode.  This may be the case if the caller holds the only reference.
2099 * This is also useful when stale data is acceptable as race conditions may
2100 * be accounted for by some other means.
2101 */
2102int
2103vrefcnt(struct vnode *vp)
2104{
2105	int usecnt;
2106
2107	VI_LOCK(vp);
2108	usecnt = vp->v_usecount;
2109	VI_UNLOCK(vp);
2110
2111	return (usecnt);
2112}
2113
2114
2115/*
2116 * Vnode put/release.
2117 * If count drops to zero, call inactive routine and return to freelist.
2118 */
2119void
2120vrele(vp)
2121	struct vnode *vp;
2122{
2123	struct thread *td = curthread;	/* XXX */
2124
2125	KASSERT(vp != NULL, ("vrele: null vp"));
2126
2127	VI_LOCK(vp);
2128
2129	/* Skip this v_writecount check if we're going to panic below. */
2130	KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
2131	    ("vrele: missed vn_close"));
2132
2133	if (vp->v_usecount > 1) {
2134
2135		vp->v_usecount--;
2136		VI_UNLOCK(vp);
2137
2138		return;
2139	}
2140
2141	if (vp->v_usecount == 1) {
2142		vp->v_usecount--;
2143		/*
2144		 * We must call VOP_INACTIVE with the node locked.
2145		 * If we are doing a vput, the node is already locked,
2146		 * but, in the case of vrele, we must explicitly lock
2147		 * the vnode before calling VOP_INACTIVE.
2148		 */
2149		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0)
2150			VOP_INACTIVE(vp, td);
2151		VI_LOCK(vp);
2152		if (VSHOULDFREE(vp))
2153			vfree(vp);
2154		else
2155			vlruvp(vp);
2156		VI_UNLOCK(vp);
2157
2158	} else {
2159#ifdef DIAGNOSTIC
2160		vprint("vrele: negative ref count", vp);
2161#endif
2162		VI_UNLOCK(vp);
2163		panic("vrele: negative ref cnt");
2164	}
2165}
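
/*
 * Illustrative sketch (not part of the original source): vref()/vrele()
 * bracket any stretch of code that must keep an unlocked vnode from
 * being recycled, for example across a sleep.
 *
 *	vref(vp);
 *	... vp cannot be reused for another file here ...
 *	vrele(vp);
 */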
2166
2167/*
2168 * Release an already locked vnode.  This gives the same effect as
2169 * unlock+vrele(), but takes less time and avoids releasing and
2170 * re-acquiring the lock (as vrele() acquires the lock internally).
2171 */
2172void
2173vput(vp)
2174	struct vnode *vp;
2175{
2176	struct thread *td = curthread;	/* XXX */
2177
2178	GIANT_REQUIRED;
2179
2180	KASSERT(vp != NULL, ("vput: null vp"));
2181	VI_LOCK(vp);
2182	/* Skip this v_writecount check if we're going to panic below. */
2183	KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
2184	    ("vput: missed vn_close"));
2185
2186	if (vp->v_usecount > 1) {
2187		vp->v_usecount--;
2188		VOP_UNLOCK(vp, LK_INTERLOCK, td);
2189		return;
2190	}
2191
2192	if (vp->v_usecount == 1) {
2193		vp->v_usecount--;
2194		/*
2195		 * We must call VOP_INACTIVE with the node locked.
2196		 * If we are doing a vput, the node is already locked,
2197		 * so we just need to release the vnode mutex.
2198		 */
2199		VI_UNLOCK(vp);
2200		VOP_INACTIVE(vp, td);
2201		VI_LOCK(vp);
2202		if (VSHOULDFREE(vp))
2203			vfree(vp);
2204		else
2205			vlruvp(vp);
2206		VI_UNLOCK(vp);
2207
2208	} else {
2209#ifdef DIAGNOSTIC
2210		vprint("vput: negative ref count", vp);
2211#endif
2212		panic("vput: negative ref cnt");
2213	}
2214}
2215
2216/*
2217 * Somebody doesn't want the vnode recycled.
2218 */
2219void
2220vhold(struct vnode *vp)
2221{
2222	VI_LOCK(vp);
2223	vholdl(vp);
2224	VI_UNLOCK(vp);
2225}
2226
2227void
2228vholdl(vp)
2229	register struct vnode *vp;
2230{
2231	int s;
2232
2233	s = splbio();
2234	vp->v_holdcnt++;
2235	if (VSHOULDBUSY(vp))
2236		vbusy(vp);
2237	splx(s);
2238}
2239
2240/*
2241 * Note that there is one less who cares about this vnode.  vdrop() is the
2242 * opposite of vhold().
2243 */
2244void
2245vdrop(struct vnode *vp)
2246{
2247	VI_LOCK(vp);
2248	vdropl(vp);
2249	VI_UNLOCK(vp);
2250}
2251
2252void
2253vdropl(vp)
2254	register struct vnode *vp;
2255{
2256	int s;
2257
2258	s = splbio();
2259	if (vp->v_holdcnt <= 0)
2260		panic("vdrop: holdcnt");
2261	vp->v_holdcnt--;
2262	if (VSHOULDFREE(vp))
2263		vfree(vp);
2264	else
2265		vlruvp(vp);
2266	splx(s);
2267}
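
/*
 * Illustrative sketch (not part of the original source): vhold()/vdrop()
 * pin a vnode without adding a use reference, much as reassignbuf()
 * above holds the vnode that backs a dirty buffer.
 *
 *	vhold(vp);
 *	... vp is kept off the free list while the hold is in place ...
 *	vdrop(vp);
 */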
2268
2269/*
2270 * Remove any vnodes in the vnode table belonging to mount point mp.
2271 *
2272 * If FORCECLOSE is not specified, there should not be any active vnodes;
2273 * an error is returned if any are found (nb: this is a user error, not a
2274 * system error). If FORCECLOSE is specified, detach any active vnodes
2275 * that are found.
2276 *
2277 * If WRITECLOSE is set, only flush out regular file vnodes open for
2278 * writing.
2279 *
2280 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2281 *
2282 * `rootrefs' specifies the base reference count for the root vnode
2283 * of this filesystem. The root vnode is considered busy if its
2284 * v_usecount exceeds this value. On a successful return, vflush()
2285 * will call vrele() on the root vnode exactly rootrefs times.
2286 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2287 * be zero.
2288 */
2289#ifdef DIAGNOSTIC
2290static int busyprt = 0;		/* print out busy vnodes */
2291SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
2292#endif
2293
2294int
2295vflush(mp, rootrefs, flags)
2296	struct mount *mp;
2297	int rootrefs;
2298	int flags;
2299{
2300	struct thread *td = curthread;	/* XXX */
2301	struct vnode *vp, *nvp, *rootvp = NULL;
2302	struct vattr vattr;
2303	int busy = 0, error;
2304
2305	if (rootrefs > 0) {
2306		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2307		    ("vflush: bad args"));
2308		/*
2309		 * Get the filesystem root vnode. We can vput() it
2310		 * immediately, since with rootrefs > 0, it won't go away.
2311		 */
2312		if ((error = VFS_ROOT(mp, &rootvp)) != 0)
2313			return (error);
2314		vput(rootvp);
2315
2316	}
2317	mtx_lock(&mntvnode_mtx);
2318loop:
2319	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp; vp = nvp) {
2320		/*
2321		 * Make sure this vnode wasn't reclaimed in getnewvnode().
2322		 * Start over if it was (it won't be on the list anymore).
2323		 */
2324		if (vp->v_mount != mp)
2325			goto loop;
2326		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
2327
2328		VI_LOCK(vp);
2329		mtx_unlock(&mntvnode_mtx);
2330		vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
2331		/*
2332		 * Skip over a vnodes marked VV_SYSTEM.
2333		 * Skip over vnodes marked VV_SYSTEM.
2334		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2335			VOP_UNLOCK(vp, 0, td);
2336			mtx_lock(&mntvnode_mtx);
2337			continue;
2338		}
2339		/*
2340		 * If WRITECLOSE is set, flush out unlinked but still open
2341		 * files (even if open only for reading) and regular file
2342		 * vnodes open for writing.
2343		 */
2344		if (flags & WRITECLOSE) {
2345			error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
2346			VI_LOCK(vp);
2347
2348			if ((vp->v_type == VNON ||
2349			    (error == 0 && vattr.va_nlink > 0)) &&
2350			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
2351				VOP_UNLOCK(vp, LK_INTERLOCK, td);
2352				mtx_lock(&mntvnode_mtx);
2353				continue;
2354			}
2355		} else
2356			VI_LOCK(vp);
2357
2358		VOP_UNLOCK(vp, 0, td);
2359
2360		/*
2361		 * With v_usecount == 0, all we need to do is clear out the
2362		 * vnode data structures and we are done.
2363		 */
2364		if (vp->v_usecount == 0) {
2365			vgonel(vp, td);
2366			mtx_lock(&mntvnode_mtx);
2367			continue;
2368		}
2369
2370		/*
2371		 * If FORCECLOSE is set, forcibly close the vnode. For block
2372		 * or character devices, revert to an anonymous device. For
2373		 * all other files, just kill them.
2374		 */
2375		if (flags & FORCECLOSE) {
2376			if (vp->v_type != VCHR) {
2377				vgonel(vp, td);
2378			} else {
2379				vclean(vp, 0, td);
2380				VI_UNLOCK(vp);
2381				vp->v_op = spec_vnodeop_p;
2382				insmntque(vp, (struct mount *) 0);
2383			}
2384			mtx_lock(&mntvnode_mtx);
2385			continue;
2386		}
2387#ifdef DIAGNOSTIC
2388		if (busyprt)
2389			vprint("vflush: busy vnode", vp);
2390#endif
2391		VI_UNLOCK(vp);
2392		mtx_lock(&mntvnode_mtx);
2393		busy++;
2394	}
2395	mtx_unlock(&mntvnode_mtx);
2396	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2397		/*
2398		 * If just the root vnode is busy, and if its refcount
2399		 * is equal to `rootrefs', then go ahead and kill it.
2400		 */
2401		VI_LOCK(rootvp);
2402		KASSERT(busy > 0, ("vflush: not busy"));
2403		KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
2404		if (busy == 1 && rootvp->v_usecount == rootrefs) {
2405			vgonel(rootvp, td);
2406			busy = 0;
2407		} else
2408			VI_UNLOCK(rootvp);
2409	}
2410	if (busy)
2411		return (EBUSY);
2412	for (; rootrefs > 0; rootrefs--)
2413		vrele(rootvp);
2414	return (0);
2415}
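
/*
 * Illustrative sketch (not part of the original source): a filesystem's
 * VFS_UNMOUNT routine commonly maps MNT_FORCE onto FORCECLOSE and lets
 * vflush() sweep its vnode list.  rootrefs is 0 here on the assumption
 * that this hypothetical filesystem keeps no long-lived reference on
 * its root vnode.
 *
 *	flags = (mntflags & MNT_FORCE) ? FORCECLOSE : 0;
 *	if ((error = vflush(mp, 0, flags)) != 0)
 *		return (error);
 */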
2416
2417/*
2418 * This moves a now (likely recyclable) vnode to the end of its
2419 * mount's vnode list.  XXX It is temporarily disabled until we can
2420 * clean up ffs_sync() and friends, which have loop-restart conditions
2421 * that this code causes to operate in O(N^2) time.
2422 */
2423static void
2424vlruvp(struct vnode *vp)
2425{
2426#if 0
2427	struct mount *mp;
2428
2429	if ((mp = vp->v_mount) != NULL) {
2430		mtx_lock(&mntvnode_mtx);
2431		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
2432		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
2433		mtx_unlock(&mntvnode_mtx);
2434	}
2435#endif
2436}
2437
2438/*
2439 * Disassociate the underlying filesystem from a vnode.
2440 */
2441static void
2442vclean(vp, flags, td)
2443	struct vnode *vp;
2444	int flags;
2445	struct thread *td;
2446{
2447	int active;
2448
2449	ASSERT_VI_LOCKED(vp, "vclean");
2450	/*
2451	 * Check to see if the vnode is in use. If so we have to reference it
2452	 * before we clean it out so that its count cannot fall to zero and
2453	 * generate a race against ourselves to recycle it.
2454	 */
2455	if ((active = vp->v_usecount))
2456		vp->v_usecount++;
2457
2458	/*
2459	 * Prevent the vnode from being recycled or brought into use while we
2460	 * clean it out.
2461	 */
2462	if (vp->v_iflag & VI_XLOCK)
2463		panic("vclean: deadlock");
2464	vp->v_iflag |= VI_XLOCK;
2465	vp->v_vxproc = curthread;
2466	/*
2467	 * Even if the count is zero, the VOP_INACTIVE routine may still
2468	 * have the object locked while it cleans it out. The VOP_LOCK
2469	 * ensures that the VOP_INACTIVE routine is done with its work.
2470	 * For active vnodes, it ensures that no other activity can
2471	 * occur while the underlying object is being cleaned out.
2472	 */
2473	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
2474
2475	/*
2476	 * Clean out any buffers associated with the vnode.
2477	 * If the flush fails, just toss the buffers.
2478	 */
2479	if (flags & DOCLOSE) {
2480		struct buf *bp;
2481		VI_LOCK(vp);
2482		bp = TAILQ_FIRST(&vp->v_dirtyblkhd);
2483		VI_UNLOCK(vp);
2484		if (bp != NULL)
2485			(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
2486		if (vinvalbuf(vp, V_SAVE, NOCRED, td, 0, 0) != 0)
2487			vinvalbuf(vp, 0, NOCRED, td, 0, 0);
2488	}
2489
2490	VOP_DESTROYVOBJECT(vp);
2491
2492	/*
2493	 * Any other processes trying to obtain this lock must first
2494	 * wait for VI_XLOCK to clear, then call the new lock operation.
2495	 */
2496	VOP_UNLOCK(vp, 0, td);
2497
2498	/*
2499	 * If purging an active vnode, it must be closed and
2500	 * deactivated before being reclaimed. Note that the
2501	 * VOP_INACTIVE will unlock the vnode.
2502	 */
2503	if (active) {
2504		if (flags & DOCLOSE)
2505			VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2506		if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
2507			panic("vclean: cannot relock.");
2508		VOP_INACTIVE(vp, td);
2509	}
2510
2511	/*
2512	 * Reclaim the vnode.
2513	 */
2514	if (VOP_RECLAIM(vp, td))
2515		panic("vclean: cannot reclaim");
2516
2517	if (active) {
2518		/*
2519		 * Inline copy of vrele() since VOP_INACTIVE
2520		 * has already been called.
2521		 */
2522		VI_LOCK(vp);
2523		if (--vp->v_usecount <= 0) {
2524#ifdef DIAGNOSTIC
2525			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
2526				vprint("vclean: bad ref count", vp);
2527				panic("vclean: ref cnt");
2528			}
2529#endif
2530			vfree(vp);
2531		}
2532		VI_UNLOCK(vp);
2533	}
2534
2535	cache_purge(vp);
2536	vp->v_vnlock = NULL;
2537	lockdestroy(&vp->v_lock);
2538
2539	VI_LOCK(vp);
2540	if (VSHOULDFREE(vp))
2541		vfree(vp);
2542
2543	/*
2544	 * Done with purge, notify sleepers of the grim news.
2545	 */
2546	vp->v_op = dead_vnodeop_p;
2547	if (vp->v_pollinfo != NULL)
2548		vn_pollgone(vp);
2549	vp->v_tag = "none";
2550	vp->v_iflag &= ~VI_XLOCK;
2551	vp->v_vxproc = NULL;
2552	if (vp->v_iflag & VI_XWANT) {
2553		vp->v_iflag &= ~VI_XWANT;
2554		wakeup(vp);
2555	}
2556}
2557
2558/*
2559 * Eliminate all activity associated with the requested vnode
2560 * and with all vnodes aliased to the requested vnode.
2561 */
2562int
2563vop_revoke(ap)
2564	struct vop_revoke_args /* {
2565		struct vnode *a_vp;
2566		int a_flags;
2567	} */ *ap;
2568{
2569	struct vnode *vp, *vq;
2570	dev_t dev;
2571
2572	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
2573
2574	vp = ap->a_vp;
2575	VI_LOCK(vp);
2576	/*
2577	 * If a vgone (or vclean) is already in progress,
2578	 * wait until it is done and return.
2579	 */
2580	if (vp->v_iflag & VI_XLOCK) {
2581		vp->v_iflag |= VI_XWANT;
2582		msleep(vp, VI_MTX(vp), PINOD | PDROP,
2583		    "vop_revokeall", 0);
2584		return (0);
2585	}
2586	VI_UNLOCK(vp);
2587	dev = vp->v_rdev;
2588	for (;;) {
2589		mtx_lock(&spechash_mtx);
2590		vq = SLIST_FIRST(&dev->si_hlist);
2591		mtx_unlock(&spechash_mtx);
2592		if (!vq)
2593			break;
2594		vgone(vq);
2595	}
2596	return (0);
2597}
2598
2599/*
2600 * Recycle an unused vnode to the front of the free list.
2601 * Release the passed interlock if the vnode will be recycled.
2602 */
2603int
2604vrecycle(vp, inter_lkp, td)
2605	struct vnode *vp;
2606	struct mtx *inter_lkp;
2607	struct thread *td;
2608{
2609
2610	VI_LOCK(vp);
2611	if (vp->v_usecount == 0) {
2612		if (inter_lkp) {
2613			mtx_unlock(inter_lkp);
2614		}
2615		vgonel(vp, td);
2616		return (1);
2617	}
2618	VI_UNLOCK(vp);
2619	return (0);
2620}
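
/*
 * Illustrative sketch (not part of the original source): a filesystem's
 * VOP_INACTIVE routine might recycle the vnode at once when it knows
 * the underlying object is gone; "ip" and "i_mode" are hypothetical
 * per-filesystem names, and the vnode is unlocked first, as
 * VOP_INACTIVE routines are expected to do before returning.
 *
 *	VOP_UNLOCK(vp, 0, td);
 *	if (ip->i_mode == 0)
 *		vrecycle(vp, NULL, td);
 *	return (0);
 */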
2621
2622/*
2623 * Eliminate all activity associated with a vnode
2624 * in preparation for reuse.
2625 */
2626void
2627vgone(vp)
2628	register struct vnode *vp;
2629{
2630	struct thread *td = curthread;	/* XXX */
2631
2632	VI_LOCK(vp);
2633	vgonel(vp, td);
2634}
2635
2636/*
2637 * vgone, with the vp interlock held.
2638 */
2639void
2640vgonel(vp, td)
2641	struct vnode *vp;
2642	struct thread *td;
2643{
2644	int s;
2645
2646	/*
2647	 * If a vgone (or vclean) is already in progress,
2648	 * wait until it is done and return.
2649	 */
2650	ASSERT_VI_LOCKED(vp, "vgonel");
2651	if (vp->v_iflag & VI_XLOCK) {
2652		vp->v_iflag |= VI_XWANT;
2653		msleep(vp, VI_MTX(vp), PINOD | PDROP, "vgone", 0);
2654		return;
2655	}
2656
2657	/*
2658	 * Clean out the filesystem specific data.
2659	 */
2660	vclean(vp, DOCLOSE, td);
2661	VI_UNLOCK(vp);
2662
2663	/*
2664	 * Delete from old mount point vnode list, if on one.
2665	 */
2666	if (vp->v_mount != NULL)
2667		insmntque(vp, (struct mount *)0);
2668	/*
2669	 * If special device, remove it from special device alias list
2670	 * if it is on one.
2671	 */
2672	if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) {
2673		mtx_lock(&spechash_mtx);
2674		SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
2675		freedev(vp->v_rdev);
2676		mtx_unlock(&spechash_mtx);
2677		vp->v_rdev = NULL;
2678	}
2679
2680	/*
2681	 * If it is on the freelist and not already at the head,
2682	 * move it to the head of the list. The test of the
2683	 * VI_DOOMED flag and the reference count of zero is because
2684	 * it will be removed from the free list by getnewvnode,
2685	 * but will not have its reference count incremented until
2686	 * after calling vgone. If the reference count were
2687	 * incremented first, vgone would (incorrectly) try to
2688	 * close the previous instance of the underlying object.
2689	 */
2690	VI_LOCK(vp);
2691	if (vp->v_usecount == 0 && !(vp->v_iflag & VI_DOOMED)) {
2692		s = splbio();
2693		mtx_lock(&vnode_free_list_mtx);
2694		if (vp->v_iflag & VI_FREE) {
2695			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2696		} else {
2697			vp->v_iflag |= VI_FREE;
2698			freevnodes++;
2699		}
2700		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2701		mtx_unlock(&vnode_free_list_mtx);
2702		splx(s);
2703	}
2704
2705	vp->v_type = VBAD;
2706	VI_UNLOCK(vp);
2707}
2708
2709/*
2710 * Lookup a vnode by device number.
2711 */
2712int
2713vfinddev(dev, type, vpp)
2714	dev_t dev;
2715	enum vtype type;
2716	struct vnode **vpp;
2717{
2718	struct vnode *vp;
2719
2720	mtx_lock(&spechash_mtx);
2721	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
2722		if (type == vp->v_type) {
2723			*vpp = vp;
2724			mtx_unlock(&spechash_mtx);
2725			return (1);
2726		}
2727	}
2728	mtx_unlock(&spechash_mtx);
2729	return (0);
2730}
2731
2732/*
2733 * Calculate the total number of references to a special device.
2734 */
2735int
2736vcount(vp)
2737	struct vnode *vp;
2738{
2739	struct vnode *vq;
2740	int count;
2741
2742	count = 0;
2743	mtx_lock(&spechash_mtx);
2744	SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext) {
2745		if (vq != vp)
2746			VI_LOCK(vq);
2747		count += vq->v_usecount;
2748		if (vq != vp)
2749			VI_UNLOCK(vq);
2750	}
2751	mtx_unlock(&spechash_mtx);
2752	return (count);
2753}
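
/*
 * Illustrative sketch (not part of the original source): since several
 * vnodes may alias one device, a device close routine should consult
 * vcount() rather than v_usecount to decide whether this is really the
 * last close.
 *
 *	if (vcount(vp) > 1)
 *		return (0);	(some alias still has the device open)
 *	... do last-close processing ...
 */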
2754
2755/*
2756 * Same as above, but using the dev_t as argument
2757 */
2758int
2759count_dev(dev)
2760	dev_t dev;
2761{
2762	struct vnode *vp;
2763
2764	vp = SLIST_FIRST(&dev->si_hlist);
2765	if (vp == NULL)
2766		return (0);
2767	return(vcount(vp));
2768}
2769
2770/*
2771 * Print out a description of a vnode.
2772 */
2773static char *typename[] =
2774{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
2775
2776void
2777vprint(label, vp)
2778	char *label;
2779	struct vnode *vp;
2780{
2781	char buf[96];
2782
2783	if (label != NULL)
2784		printf("%s: %p: ", label, (void *)vp);
2785	else
2786		printf("%p: ", (void *)vp);
2787	printf("tag %s, type %s, usecount %d, writecount %d, refcount %d,",
2788	    vp->v_tag, typename[vp->v_type], vp->v_usecount,
2789	    vp->v_writecount, vp->v_holdcnt);
2790	buf[0] = '\0';
2791	if (vp->v_vflag & VV_ROOT)
2792		strcat(buf, "|VV_ROOT");
2793	if (vp->v_vflag & VV_TEXT)
2794		strcat(buf, "|VV_TEXT");
2795	if (vp->v_vflag & VV_SYSTEM)
2796		strcat(buf, "|VV_SYSTEM");
2797	if (vp->v_iflag & VI_XLOCK)
2798		strcat(buf, "|VI_XLOCK");
2799	if (vp->v_iflag & VI_XWANT)
2800		strcat(buf, "|VI_XWANT");
2801	if (vp->v_iflag & VI_BWAIT)
2802		strcat(buf, "|VI_BWAIT");
2803	if (vp->v_iflag & VI_DOOMED)
2804		strcat(buf, "|VI_DOOMED");
2805	if (vp->v_iflag & VI_FREE)
2806		strcat(buf, "|VI_FREE");
2807	if (vp->v_vflag & VV_OBJBUF)
2808		strcat(buf, "|VV_OBJBUF");
2809	if (buf[0] != '\0')
2810		printf(" flags (%s),", &buf[1]);
2811	lockmgr_printinfo(&vp->v_lock);
2812	printf("\n");
2813	if (vp->v_data != NULL) {
2814		printf("\t");
2815		VOP_PRINT(vp);
2816	}
2817}
2818
2819#ifdef DDB
2820#include <ddb/ddb.h>
2821/*
2822 * List all of the locked vnodes in the system.
2823 * Called when debugging the kernel.
2824 */
2825DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
2826{
2827	struct thread *td = curthread;	/* XXX */
2828	struct mount *mp, *nmp;
2829	struct vnode *vp;
2830
2831	printf("Locked vnodes\n");
2832	mtx_lock(&mountlist_mtx);
2833	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2834		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
2835			nmp = TAILQ_NEXT(mp, mnt_list);
2836			continue;
2837		}
2838		mtx_lock(&mntvnode_mtx);
2839		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2840			if (VOP_ISLOCKED(vp, NULL))
2841				vprint((char *)0, vp);
2842		}
2843		mtx_unlock(&mntvnode_mtx);
2844		mtx_lock(&mountlist_mtx);
2845		nmp = TAILQ_NEXT(mp, mnt_list);
2846		vfs_unbusy(mp, td);
2847	}
2848	mtx_unlock(&mountlist_mtx);
2849}
2850#endif
2851
2852/*
2853 * Fill in a struct xvfsconf based on a struct vfsconf.
2854 */
2855static void
2856vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
2857{
2858
2859	strcpy(xvfsp->vfc_name, vfsp->vfc_name);
2860	xvfsp->vfc_typenum = vfsp->vfc_typenum;
2861	xvfsp->vfc_refcount = vfsp->vfc_refcount;
2862	xvfsp->vfc_flags = vfsp->vfc_flags;
2863	/*
2864	 * These are unused in userland; we keep them
2865	 * so as not to break binary compatibility.
2866	 */
2867	xvfsp->vfc_vfsops = NULL;
2868	xvfsp->vfc_next = NULL;
2869}
2870
2871static int
2872sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
2873{
2874	struct vfsconf *vfsp;
2875	struct xvfsconf *xvfsp;
2876	int cnt, error, i;
2877
2878	cnt = 0;
2879	for (vfsp = vfsconf; vfsp != NULL; vfsp = vfsp->vfc_next)
2880		cnt++;
2881	xvfsp = malloc(sizeof(struct xvfsconf) * cnt, M_TEMP, M_WAITOK);
2882	/*
2883	 * Handle the race we will have here once struct vfsconf is locked
2884	 * down, by using both cnt and a check of vfc_next against NULL to
2885	 * determine the end of the loop.  The race will
2886	 * happen because we will have to unlock before calling malloc().
2887	 * We are protected by Giant for now.
2888	 */
2889	i = 0;
2890	for (vfsp = vfsconf; vfsp != NULL && i < cnt; vfsp = vfsp->vfc_next) {
2891		vfsconf2x(vfsp, xvfsp + i);
2892		i++;
2893	}
2894	error = SYSCTL_OUT(req, xvfsp, sizeof(struct xvfsconf) * i);
2895	free(xvfsp, M_TEMP);
2896	return (error);
2897}
2898
2899SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist,
2900    "S,xvfsconf", "List of all configured filesystems");
2901
2902/*
2903 * Top level filesystem related information gathering.
2904 */
2905static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
2906
2907static int
2908vfs_sysctl(SYSCTL_HANDLER_ARGS)
2909{
2910	int *name = (int *)arg1 - 1;	/* XXX */
2911	u_int namelen = arg2 + 1;	/* XXX */
2912	struct vfsconf *vfsp;
2913	struct xvfsconf xvfsp;
2914
2915	printf("WARNING: userland calling deprecated sysctl, "
2916	    "please rebuild world\n");
2917
2918#if 1 || defined(COMPAT_PRELITE2)
2919	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2920	if (namelen == 1)
2921		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2922#endif
2923
2924	switch (name[1]) {
2925	case VFS_MAXTYPENUM:
2926		if (namelen != 2)
2927			return (ENOTDIR);
2928		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2929	case VFS_CONF:
2930		if (namelen != 3)
2931			return (ENOTDIR);	/* overloaded */
2932		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2933			if (vfsp->vfc_typenum == name[2])
2934				break;
2935		if (vfsp == NULL)
2936			return (EOPNOTSUPP);
2937		vfsconf2x(vfsp, &xvfsp);
2938		return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
2939	}
2940	return (EOPNOTSUPP);
2941}
2942
2943SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, vfs_sysctl,
2944	"Generic filesystem");
2945
2946#if 1 || defined(COMPAT_PRELITE2)
2947
2948static int
2949sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
2950{
2951	int error;
2952	struct vfsconf *vfsp;
2953	struct ovfsconf ovfs;
2954
2955	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
2956		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
2957		strcpy(ovfs.vfc_name, vfsp->vfc_name);
2958		ovfs.vfc_index = vfsp->vfc_typenum;
2959		ovfs.vfc_refcount = vfsp->vfc_refcount;
2960		ovfs.vfc_flags = vfsp->vfc_flags;
2961		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2962		if (error)
2963			return error;
2964	}
2965	return 0;
2966}
2967
2968#endif /* 1 || COMPAT_PRELITE2 */
2969
2970#define KINFO_VNODESLOP		10
2971/*
2972 * Dump vnode list (via sysctl).
2973 */
2974/* ARGSUSED */
2975static int
2976sysctl_vnode(SYSCTL_HANDLER_ARGS)
2977{
2978	struct xvnode *xvn;
2979	struct thread *td = req->td;
2980	struct mount *mp;
2981	struct vnode *vp;
2982	int error, len, n;
2983
2984	/*
2985	 * Stale numvnodes access is not fatal here.
2986	 */
2987	req->lock = 0;
2988	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
2989	if (!req->oldptr)
2990		/* Make an estimate */
2991		return (SYSCTL_OUT(req, 0, len));
2992
2993	sysctl_wire_old_buffer(req, 0);
2994	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
2995	n = 0;
2996	mtx_lock(&mountlist_mtx);
2997	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2998		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
2999			continue;
3000		mtx_lock(&mntvnode_mtx);
3001		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3002			if (n == len)
3003				break;
3004			vref(vp);
3005			xvn[n].xv_size = sizeof *xvn;
3006			xvn[n].xv_vnode = vp;
3007#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
3008			XV_COPY(usecount);
3009			XV_COPY(writecount);
3010			XV_COPY(holdcnt);
3011			XV_COPY(id);
3012			XV_COPY(mount);
3013			XV_COPY(numoutput);
3014			XV_COPY(type);
3015#undef XV_COPY
3016			xvn[n].xv_flag = vp->v_vflag;
3017
3018			switch (vp->v_type) {
3019			case VREG:
3020			case VDIR:
3021			case VLNK:
3022				xvn[n].xv_dev = vp->v_cachedfs;
3023				xvn[n].xv_ino = vp->v_cachedid;
3024				break;
3025			case VBLK:
3026			case VCHR:
3027				if (vp->v_rdev == NULL) {
3028					vrele(vp);
3029					continue;
3030				}
3031				xvn[n].xv_dev = dev2udev(vp->v_rdev);
3032				break;
3033			case VSOCK:
3034				xvn[n].xv_socket = vp->v_socket;
3035				break;
3036			case VFIFO:
3037				xvn[n].xv_fifo = vp->v_fifoinfo;
3038				break;
3039			case VNON:
3040			case VBAD:
3041			default:
3042				/* shouldn't happen? */
3043				vrele(vp);
3044				continue;
3045			}
3046			vrele(vp);
3047			++n;
3048		}
3049		mtx_unlock(&mntvnode_mtx);
3050		mtx_lock(&mountlist_mtx);
3051		vfs_unbusy(mp, td);
3052		if (n == len)
3053			break;
3054	}
3055	mtx_unlock(&mountlist_mtx);
3056
3057	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
3058	free(xvn, M_TEMP);
3059	return (error);
3060}
3061
3062SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
3063	0, 0, sysctl_vnode, "S,xvnode", "");
3064
3065/*
3066 * Check to see if a filesystem is mounted on a block device.
3067 */
3068int
3069vfs_mountedon(vp)
3070	struct vnode *vp;
3071{
3072
3073	if (vp->v_rdev->si_mountpoint != NULL)
3074		return (EBUSY);
3075	return (0);
3076}
3077
3078/*
3079 * Unmount all filesystems. The list is traversed in reverse order
3080 * of mounting to avoid dependencies.
3081 */
3082void
3083vfs_unmountall()
3084{
3085	struct mount *mp;
3086	struct thread *td;
3087	int error;
3088
3089	if (curthread != NULL)
3090		td = curthread;
3091	else
3092		td = FIRST_THREAD_IN_PROC(initproc); /* XXX XXX proc0? */
3093	/*
3094	 * Since this only runs when rebooting, it is not interlocked.
3095	 */
3096	while(!TAILQ_EMPTY(&mountlist)) {
3097		mp = TAILQ_LAST(&mountlist, mntlist);
3098		error = dounmount(mp, MNT_FORCE, td);
3099		if (error) {
3100			TAILQ_REMOVE(&mountlist, mp, mnt_list);
3101			printf("unmount of %s failed (",
3102			    mp->mnt_stat.f_mntonname);
3103			if (error == EBUSY)
3104				printf("BUSY)\n");
3105			else
3106				printf("%d)\n", error);
3107		} else {
3108			/* The unmount has removed mp from the mountlist */
3109		}
3110	}
3111}
3112
3113/*
3114 * Perform msync on all vnodes under a mount point.
3115 * The mount point must be locked.
3116 */
3117void
3118vfs_msync(struct mount *mp, int flags)
3119{
3120	struct vnode *vp, *nvp;
3121	struct vm_object *obj;
3122	int tries;
3123
3124	GIANT_REQUIRED;
3125
3126	tries = 5;
3127	mtx_lock(&mntvnode_mtx);
3128loop:
3129	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) {
3130		if (vp->v_mount != mp) {
3131			if (--tries > 0)
3132				goto loop;
3133			break;
3134		}
3135		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
3136
3137		VI_LOCK(vp);
3138		if (vp->v_iflag & VI_XLOCK) {	/* XXX: what if MNT_WAIT? */
3139			VI_UNLOCK(vp);
3140			continue;
3141		}
3142
3143		if ((vp->v_iflag & VI_OBJDIRTY) &&
3144		    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
3145			mtx_unlock(&mntvnode_mtx);
3146			if (!vget(vp,
3147			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3148			    curthread)) {
3149				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
3150					vput(vp);
3151					continue;
3152				}
3153
3154				if (VOP_GETVOBJECT(vp, &obj) == 0) {
3155					vm_object_page_clean(obj, 0, 0,
3156					    flags == MNT_WAIT ?
3157					    OBJPC_SYNC : OBJPC_NOSYNC);
3158				}
3159				vput(vp);
3160			}
3161			mtx_lock(&mntvnode_mtx);
3162			if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
3163				if (--tries > 0)
3164					goto loop;
3165				break;
3166			}
3167		} else
3168			VI_UNLOCK(vp);
3169	}
3170	mtx_unlock(&mntvnode_mtx);
3171}
3172
3173/*
3174 * Create the VM object needed for VMIO and mmap support.  This
3175 * is done for all VREG files in the system.  Some filesystems may
3176 * also take advantage of the additional metadata buffering that the
3177 * VMIO code offers by making their device nodes VMIO mode as well.
3178 *
3179 * vp must be locked when vfs_object_create is called.
3180 */
3181int
3182vfs_object_create(vp, td, cred)
3183	struct vnode *vp;
3184	struct thread *td;
3185	struct ucred *cred;
3186{
3187	GIANT_REQUIRED;
3188	return (VOP_CREATEVOBJECT(vp, cred, td));
3189}
3190
3191/*
3192 * Mark a vnode as free, putting it up for recycling.
3193 */
3194void
3195vfree(vp)
3196	struct vnode *vp;
3197{
3198	int s;
3199
3200	ASSERT_VI_LOCKED(vp, "vfree");
3201	s = splbio();
3202	mtx_lock(&vnode_free_list_mtx);
3203	KASSERT((vp->v_iflag & VI_FREE) == 0, ("vnode already free"));
3204	if (vp->v_iflag & VI_AGE) {
3205		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
3206	} else {
3207		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
3208	}
3209	freevnodes++;
3210	mtx_unlock(&vnode_free_list_mtx);
3211	vp->v_iflag &= ~VI_AGE;
3212	vp->v_iflag |= VI_FREE;
3213	splx(s);
3214}
3215
3216/*
3217 * Opposite of vfree() - mark a vnode as in use.
3218 */
3219void
3220vbusy(vp)
3221	struct vnode *vp;
3222{
3223	int s;
3224
3225	s = splbio();
3226	ASSERT_VI_LOCKED(vp, "vbusy");
3227	KASSERT((vp->v_iflag & VI_FREE) != 0, ("vnode not free"));
3228
3229	mtx_lock(&vnode_free_list_mtx);
3230	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
3231	freevnodes--;
3232	mtx_unlock(&vnode_free_list_mtx);
3233
3234	vp->v_iflag &= ~(VI_FREE|VI_AGE);
3235	splx(s);
3236}
3237
3238/*
3239 * Record a process's interest in events which might happen to
3240 * a vnode.  Because poll uses the historic select-style interface
3241 * internally, this routine serves as both the ``check for any
3242 * pending events'' and the ``record my interest in future events''
3243 * functions.  (These are done together, while the lock is held,
3244 * to avoid race conditions.)
3245 */
3246int
3247vn_pollrecord(vp, td, events)
3248	struct vnode *vp;
3249	struct thread *td;
3250	short events;
3251{
3252
3253	if (vp->v_pollinfo == NULL)
3254		v_addpollinfo(vp);
3255	mtx_lock(&vp->v_pollinfo->vpi_lock);
3256	if (vp->v_pollinfo->vpi_revents & events) {
3257		/*
3258		 * This leaves events we are not interested
3259		 * in available for the other process which
3260		 * presumably had requested them
3261		 * (otherwise they would never have been
3262		 * recorded).
3263		 */
3264		events &= vp->v_pollinfo->vpi_revents;
3265		vp->v_pollinfo->vpi_revents &= ~events;
3266
3267		mtx_unlock(&vp->v_pollinfo->vpi_lock);
3268		return events;
3269	}
3270	vp->v_pollinfo->vpi_events |= events;
3271	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3272	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3273	return 0;
3274}
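
/*
 * Illustrative sketch (not part of the original source): a VOP_POLL
 * implementation with nothing pending hands the request to
 * vn_pollrecord() so that a later vn_pollevent() wakes the poller;
 * "ready" is a hypothetical mask of events the filesystem can satisfy
 * right now.
 *
 *	if ((ready & ap->a_events) == 0)
 *		return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
 *	return (ready & ap->a_events);
 */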
3275
3276/*
3277 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
3278 * it is possible for us to miss an event due to race conditions, but
3279 * that condition is expected to be rare, so for the moment it is the
3280 * preferred interface.
3281 */
3282void
3283vn_pollevent(vp, events)
3284	struct vnode *vp;
3285	short events;
3286{
3287
3288	if (vp->v_pollinfo == NULL)
3289		v_addpollinfo(vp);
3290	mtx_lock(&vp->v_pollinfo->vpi_lock);
3291	if (vp->v_pollinfo->vpi_events & events) {
3292		/*
3293		 * We clear vpi_events so that we don't
3294		 * call selwakeup() twice if two events are
3295		 * posted before the polling process(es) is
3296		 * awakened.  This also ensures that we take at
3297		 * most one selwakeup() if the polling process
3298		 * is no longer interested.  However, it does
3299		 * mean that only one event can be noticed at
3300		 * a time.  (Perhaps we should only clear those
3301		 * event bits which we note?) XXX
3302		 */
3303		vp->v_pollinfo->vpi_events = 0;	/* &= ~events ??? */
3304		vp->v_pollinfo->vpi_revents |= events;
3305		selwakeup(&vp->v_pollinfo->vpi_selinfo);
3306	}
3307	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3308}
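
/*
 * Illustrative sketch (not part of the original source): code that has
 * just made data available on a vnode notifies pollers through the
 * VN_POLLEVENT macro mentioned above, which avoids the function call
 * when nobody has registered interest.
 *
 *	VN_POLLEVENT(vp, POLLIN | POLLRDNORM);
 */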
3309
3310/*
3311 * Wake up anyone polling on vp because it is being revoked.
3312 * This depends on dead_poll() returning POLLHUP for correct
3313 * behavior.
3314 */
3315void
3316vn_pollgone(vp)
3317	struct vnode *vp;
3318{
3319
3320	mtx_lock(&vp->v_pollinfo->vpi_lock);
3321	VN_KNOTE(vp, NOTE_REVOKE);
3322	if (vp->v_pollinfo->vpi_events) {
3323		vp->v_pollinfo->vpi_events = 0;
3324		selwakeup(&vp->v_pollinfo->vpi_selinfo);
3325	}
3326	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3327}
3328
3329
3330
3331/*
3332 * Routine to create and manage a filesystem syncer vnode.
3333 */
3334#define sync_close ((int (*)(struct  vop_close_args *))nullop)
3335static int	sync_fsync(struct  vop_fsync_args *);
3336static int	sync_inactive(struct  vop_inactive_args *);
3337static int	sync_reclaim(struct  vop_reclaim_args *);
3338static int	sync_print(struct vop_print_args *);
3339
3340static vop_t **sync_vnodeop_p;
3341static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
3342	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
3343	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
3344	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
3345	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
3346	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
3347	{ &vop_lock_desc,	(vop_t *) vop_stdlock },	/* lock */
3348	{ &vop_unlock_desc,	(vop_t *) vop_stdunlock },	/* unlock */
3349	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
3350	{ &vop_islocked_desc,	(vop_t *) vop_stdislocked },	/* islocked */
3351	{ NULL, NULL }
3352};
3353static struct vnodeopv_desc sync_vnodeop_opv_desc =
3354	{ &sync_vnodeop_p, sync_vnodeop_entries };
3355
3356VNODEOP_SET(sync_vnodeop_opv_desc);
3357
3358/*
3359 * Create a new filesystem syncer vnode for the specified mount point.
3360 */
3361int
3362vfs_allocate_syncvnode(mp)
3363	struct mount *mp;
3364{
3365	struct vnode *vp;
3366	static long start, incr, next;
3367	int error;
3368
3369	/* Allocate a new vnode */
3370	if ((error = getnewvnode("vfs", mp, sync_vnodeop_p, &vp)) != 0) {
3371		mp->mnt_syncer = NULL;
3372		return (error);
3373	}
3374	vp->v_type = VNON;
3375	/*
3376	 * Place the vnode onto the syncer worklist. We attempt to
3377	 * scatter them about on the list so that they will go off
3378	 * at evenly distributed times even if all the filesystems
3379	 * are mounted at once.
3380	 */
3381	next += incr;
3382	if (next == 0 || next > syncer_maxdelay) {
3383		start /= 2;
3384		incr /= 2;
3385		if (start == 0) {
3386			start = syncer_maxdelay / 2;
3387			incr = syncer_maxdelay;
3388		}
3389		next = start;
3390	}
3391	VI_LOCK(vp);
3392	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
3393	VI_UNLOCK(vp);
3394	mp->mnt_syncer = vp;
3395	return (0);
3396}
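
/*
 * Illustrative sketch (not part of the original source): the mount path
 * is expected to create the syncer vnode once a filesystem is mounted
 * read-write, roughly like this (error handling elided):
 *
 *	if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL)
 *		error = vfs_allocate_syncvnode(mp);
 */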
3397
3398/*
3399 * Do a lazy sync of the filesystem.
3400 */
3401static int
3402sync_fsync(ap)
3403	struct vop_fsync_args /* {
3404		struct vnode *a_vp;
3405		struct ucred *a_cred;
3406		int a_waitfor;
3407		struct thread *a_td;
3408	} */ *ap;
3409{
3410	struct vnode *syncvp = ap->a_vp;
3411	struct mount *mp = syncvp->v_mount;
3412	struct thread *td = ap->a_td;
3413	int asyncflag;
3414
3415	/*
3416	 * We only need to do something if this is a lazy evaluation.
3417	 */
3418	if (ap->a_waitfor != MNT_LAZY)
3419		return (0);
3420
3421	/*
3422	 * Move ourselves to the back of the sync list.
3423	 */
3424	VI_LOCK(syncvp);
3425	vn_syncer_add_to_worklist(syncvp, syncdelay);
3426	VI_UNLOCK(syncvp);
3427
3428	/*
3429	 * Walk the list of vnodes pushing all that are dirty and
3430	 * not already on the sync list.
3431	 */
3432	mtx_lock(&mountlist_mtx);
3433	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
3434		mtx_unlock(&mountlist_mtx);
3435		return (0);
3436	}
3437	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3438		vfs_unbusy(mp, td);
3439		return (0);
3440	}
3441	asyncflag = mp->mnt_flag & MNT_ASYNC;
3442	mp->mnt_flag &= ~MNT_ASYNC;
3443	vfs_msync(mp, MNT_NOWAIT);
3444	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, td);
3445	if (asyncflag)
3446		mp->mnt_flag |= MNT_ASYNC;
3447	vn_finished_write(mp);
3448	vfs_unbusy(mp, td);
3449	return (0);
3450}
3451
3452/*
3453 * The syncer vnode is no longer referenced.
3454 */
3455static int
3456sync_inactive(ap)
3457	struct vop_inactive_args /* {
3458		struct vnode *a_vp;
3459		struct thread *a_td;
3460	} */ *ap;
3461{
3462
3463	VOP_UNLOCK(ap->a_vp, 0, ap->a_td);
3464	vgone(ap->a_vp);
3465	return (0);
3466}
3467
3468/*
3469 * The syncer vnode is no longer needed and is being decommissioned.
3470 *
3471 * Modifications to the worklist must be protected at splbio().
3472 */
3473static int
3474sync_reclaim(ap)
3475	struct vop_reclaim_args /* {
3476		struct vnode *a_vp;
3477	} */ *ap;
3478{
3479	struct vnode *vp = ap->a_vp;
3480	int s;
3481
3482	s = splbio();
3483	vp->v_mount->mnt_syncer = NULL;
3484	VI_LOCK(vp);
3485	if (vp->v_iflag & VI_ONWORKLST) {
3486		mtx_lock(&sync_mtx);
3487		LIST_REMOVE(vp, v_synclist);
3488		mtx_unlock(&sync_mtx);
3489		vp->v_iflag &= ~VI_ONWORKLST;
3490	}
3491	VI_UNLOCK(vp);
3492	splx(s);
3493
3494	return (0);
3495}
3496
3497/*
3498 * Print out a syncer vnode.
3499 */
3500static int
3501sync_print(ap)
3502	struct vop_print_args /* {
3503		struct vnode *a_vp;
3504	} */ *ap;
3505{
3506	struct vnode *vp = ap->a_vp;
3507
3508	printf("syncer vnode");
3509	if (vp->v_vnlock != NULL)
3510		lockmgr_printinfo(vp->v_vnlock);
3511	printf("\n");
3512	return (0);
3513}
3514
3515/*
3516 * Extract the dev_t from a VCHR vnode.
3517 */
3518dev_t
3519vn_todev(vp)
3520	struct vnode *vp;
3521{
3522	if (vp->v_type != VCHR)
3523		return (NODEV);
3524	return (vp->v_rdev);
3525}
3526
3527/*
3528 * Check whether a vnode represents a disk device.
3529 */
3530int
3531vn_isdisk(vp, errp)
3532	struct vnode *vp;
3533	int *errp;
3534{
3535	struct cdevsw *cdevsw;
3536
3537	if (vp->v_type != VCHR) {
3538		if (errp != NULL)
3539			*errp = ENOTBLK;
3540		return (0);
3541	}
3542	if (vp->v_rdev == NULL) {
3543		if (errp != NULL)
3544			*errp = ENXIO;
3545		return (0);
3546	}
3547	cdevsw = devsw(vp->v_rdev);
3548	if (cdevsw == NULL) {
3549		if (errp != NULL)
3550			*errp = ENXIO;
3551		return (0);
3552	}
3553	if (!(cdevsw->d_flags & D_DISK)) {
3554		if (errp != NULL)
3555			*errp = ENOTBLK;
3556		return (0);
3557	}
3558	if (errp != NULL)
3559		*errp = 0;
3560	return (1);
3561}
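
/*
 * Illustrative sketch (not part of the original source): a disk-based
 * filesystem's mount routine typically validates the device vnode this
 * way; "devvp" stands for whatever vnode namei() returned for the
 * special file being mounted.
 *
 *	if (!vn_isdisk(devvp, &error))
 *		return (error);
 */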
3562
3563/*
3564 * Free data allocated by namei(); see namei(9) for details.
3565 */
3566void
3567NDFREE(ndp, flags)
3568	struct nameidata *ndp;
3569	const uint flags;
3570{
3571	if (!(flags & NDF_NO_FREE_PNBUF) &&
3572	    (ndp->ni_cnd.cn_flags & HASBUF)) {
3573		uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
3574		ndp->ni_cnd.cn_flags &= ~HASBUF;
3575	}
3576	if (!(flags & NDF_NO_DVP_UNLOCK) &&
3577	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
3578	    ndp->ni_dvp != ndp->ni_vp)
3579		VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread);
3580	if (!(flags & NDF_NO_DVP_RELE) &&
3581	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
3582		vrele(ndp->ni_dvp);
3583		ndp->ni_dvp = NULL;
3584	}
3585	if (!(flags & NDF_NO_VP_UNLOCK) &&
3586	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
3587		VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread);
3588	if (!(flags & NDF_NO_VP_RELE) &&
3589	    ndp->ni_vp) {
3590		vrele(ndp->ni_vp);
3591		ndp->ni_vp = NULL;
3592	}
3593	if (!(flags & NDF_NO_STARTDIR_RELE) &&
3594	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
3595		vrele(ndp->ni_startdir);
3596		ndp->ni_startdir = NULL;
3597	}
3598}
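
/*
 * Illustrative sketch (not part of the original source): a typical
 * lookup frees the pathname buffer as soon as it is done with it and
 * releases the vnode separately; NDF_ONLY_PNBUF is assumed here to be
 * the usual namei(9) convenience mask meaning "free only the path
 * buffer".
 *
 *	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, path, td);
 *	if ((error = namei(&nd)) != 0)
 *		return (error);
 *	NDFREE(&nd, NDF_ONLY_PNBUF);
 *	... use the locked, referenced nd.ni_vp ...
 *	vput(nd.ni_vp);
 */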
3599
3600/*
3601 * Common filesystem object access control check routine.  Accepts a
3602 * vnode's type, "mode", uid and gid, requested access mode, credentials,
3603 * and optional call-by-reference privused argument allowing vaccess()
3604 * to indicate to the caller whether privilege was used to satisfy the
3605 * request (obsoleted).  Returns 0 on success, or an errno on failure.
3606 */
3607int
3608vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
3609	enum vtype type;
3610	mode_t file_mode;
3611	uid_t file_uid;
3612	gid_t file_gid;
3613	mode_t acc_mode;
3614	struct ucred *cred;
3615	int *privused;
3616{
3617	mode_t dac_granted;
3618#ifdef CAPABILITIES
3619	mode_t cap_granted;
3620#endif
3621
3622	/*
3623	 * Look for a normal, non-privileged way to access the file/directory
3624	 * as requested.  If it exists, go with that.
3625	 */
3626
3627	if (privused != NULL)
3628		*privused = 0;
3629
3630	dac_granted = 0;
3631
3632	/* Check the owner. */
3633	if (cred->cr_uid == file_uid) {
3634		dac_granted |= VADMIN;
3635		if (file_mode & S_IXUSR)
3636			dac_granted |= VEXEC;
3637		if (file_mode & S_IRUSR)
3638			dac_granted |= VREAD;
3639		if (file_mode & S_IWUSR)
3640			dac_granted |= (VWRITE | VAPPEND);
3641
3642		if ((acc_mode & dac_granted) == acc_mode)
3643			return (0);
3644
3645		goto privcheck;
3646	}
3647
3648	/* Otherwise, check the groups (first match) */
3649	if (groupmember(file_gid, cred)) {
3650		if (file_mode & S_IXGRP)
3651			dac_granted |= VEXEC;
3652		if (file_mode & S_IRGRP)
3653			dac_granted |= VREAD;
3654		if (file_mode & S_IWGRP)
3655			dac_granted |= (VWRITE | VAPPEND);
3656
3657		if ((acc_mode & dac_granted) == acc_mode)
3658			return (0);
3659
3660		goto privcheck;
3661	}
3662
3663	/* Otherwise, check everyone else. */
3664	if (file_mode & S_IXOTH)
3665		dac_granted |= VEXEC;
3666	if (file_mode & S_IROTH)
3667		dac_granted |= VREAD;
3668	if (file_mode & S_IWOTH)
3669		dac_granted |= (VWRITE | VAPPEND);
3670	if ((acc_mode & dac_granted) == acc_mode)
3671		return (0);
3672
3673privcheck:
3674	if (!suser_cred(cred, PRISON_ROOT)) {
3675		/* XXX audit: privilege used */
3676		if (privused != NULL)
3677			*privused = 1;
3678		return (0);
3679	}
3680
3681#ifdef CAPABILITIES
3682	/*
3683	 * Build a capability mask to determine if the set of capabilities
3684	 * satisfies the requirements when combined with the granted mask
3685	 * from above.
3686	 * For each capability, if the capability is required, bitwise
3687	 * or the request type onto the cap_granted mask.
3688	 */
3689	cap_granted = 0;
3690
3691	if (type == VDIR) {
3692		/*
3693		 * For directories, use CAP_DAC_READ_SEARCH to satisfy
3694		 * VEXEC requests, instead of CAP_DAC_EXECUTE.
3695		 */
3696		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3697		    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3698			cap_granted |= VEXEC;
3699	} else {
3700		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3701		    !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT))
3702			cap_granted |= VEXEC;
3703	}
3704
3705	if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
3706	    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3707		cap_granted |= VREAD;
3708
3709	if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3710	    !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT))
3711		cap_granted |= (VWRITE | VAPPEND);
3712
3713	if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3714	    !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT))
3715		cap_granted |= VADMIN;
3716
3717	if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
3718		/* XXX audit: privilege used */
3719		if (privused != NULL)
3720			*privused = 1;
3721		return (0);
3722	}
3723#endif
3724
3725	return ((acc_mode & VADMIN) ? EPERM : EACCES);
3726}
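
/*
 * Illustrative sketch (not part of the original source): a filesystem's
 * access routine usually ends by handing its on-disk ownership and mode
 * bits to vaccess(); "ip" and its fields are hypothetical
 * per-filesystem names.
 *
 *	return (vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
 *	    ap->a_mode, ap->a_cred, NULL));
 */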
3727
3728/*
3729 * Credential check based on process requesting service, and per-attribute
3730 * permissions.
3731 */
3732int
3733extattr_check_cred(struct vnode *vp, int attrnamespace,
3734    struct ucred *cred, struct thread *td, int access)
3735{
3736
3737	/*
3738	 * Kernel-invoked requests always succeed.
3739	 */
3740	if (cred == NOCRED)
3741		return (0);
3742
3743	/*
3744	 * Do not allow privileged processes in jail to directly
3745	 * manipulate system attributes.
3746	 *
3747	 * XXX What capability should apply here?
3748	 * Probably CAP_SYS_SETFFLAG.
3749	 */
3750	switch (attrnamespace) {
3751	case EXTATTR_NAMESPACE_SYSTEM:
3752		/* Potentially should be: return (EPERM); */
3753		return (suser_cred(cred, 0));
3754	case EXTATTR_NAMESPACE_USER:
3755		return (VOP_ACCESS(vp, access, cred, td));
3756	default:
3757		return (EPERM);
3758	}
3759}
3760