vfs_subr.c revision 103216
1/*
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
39 * $FreeBSD: head/sys/kern/vfs_subr.c 103216 2002-09-11 08:13:56Z julian $
40 */
41
42/*
43 * External virtual filesystem routines
44 */
45#include "opt_ddb.h"
46#include "opt_mac.h"
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/bio.h>
51#include <sys/buf.h>
52#include <sys/conf.h>
53#include <sys/eventhandler.h>
54#include <sys/extattr.h>
55#include <sys/fcntl.h>
56#include <sys/kernel.h>
57#include <sys/kthread.h>
58#include <sys/mac.h>
59#include <sys/malloc.h>
60#include <sys/mount.h>
61#include <sys/namei.h>
62#include <sys/stat.h>
63#include <sys/sysctl.h>
64#include <sys/syslog.h>
65#include <sys/vmmeter.h>
66#include <sys/vnode.h>
67
68#include <vm/vm.h>
69#include <vm/vm_object.h>
70#include <vm/vm_extern.h>
71#include <vm/pmap.h>
72#include <vm/vm_map.h>
73#include <vm/vm_page.h>
74#include <vm/uma.h>
75
76static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
77
78static void	addalias(struct vnode *vp, dev_t nvp_rdev);
79static void	insmntque(struct vnode *vp, struct mount *mp);
80static void	vclean(struct vnode *vp, int flags, struct thread *td);
81static void	vlruvp(struct vnode *vp);
82static int	flushbuflist(struct buf *blist, int flags, struct vnode *vp,
83		    int slpflag, int slptimeo, int *errorp);
84static int	vcanrecycle(struct vnode *vp);
85
86
87/*
88 * Number of vnodes in existence.  Increased whenever getnewvnode()
89 * allocates a new vnode, never decreased.
90 */
91static unsigned long	numvnodes;
92
93SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
94
95/*
96 * Conversion tables for conversion from vnode types to inode formats
97 * and back.
98 */
99enum vtype iftovt_tab[16] = {
100	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
101	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
102};
103int vttoif_tab[9] = {
104	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
105	S_IFSOCK, S_IFIFO, S_IFMT,
106};
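
/*
 * For illustration (a sketch, assuming the usual IFTOVT()/VTTOIF()
 * macro definitions in sys/vnode.h that index these tables):
 *
 *	IFTOVT(S_IFDIR)	-> iftovt_tab[(S_IFDIR & S_IFMT) >> 12] == VDIR
 *	VTTOIF(VREG)	-> vttoif_tab[VREG] == S_IFREG
 *
 * i.e. the file-type bits of an inode mode select a vtype, and a
 * vtype selects the corresponding S_IF* bits for the reverse mapping.
 */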
107
108/*
109 * List of vnodes that are ready for recycling.
110 */
111static TAILQ_HEAD(freelst, vnode) vnode_free_list;
112
113/*
114 * Minimum number of free vnodes.  If there are fewer free vnodes than this,
115 * getnewvnode() will return a newly allocated vnode.
116 */
117static u_long wantfreevnodes = 25;
118SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
119/* Number of vnodes in the free list. */
120static u_long freevnodes;
121SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
122
123/*
124 * Various variables used for debugging the new implementation of
125 * reassignbuf().
126 * XXX these are probably of (very) limited utility now.
127 */
128static int reassignbufcalls;
129SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
130static int nameileafonly;
131SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");
132
133#ifdef ENABLE_VFS_IOOPT
134/* See NOTES for a description of this setting. */
135int vfs_ioopt;
136SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
137#endif
138
139/*
140 * Cache for the mount type id assigned to NFS.  This is used for
141 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
142 */
143int	nfs_mount_type = -1;
144
145/* To keep more than one thread at a time from running vfs_getnewfsid */
146static struct mtx mntid_mtx;
147
148/*
149 * Lock for any access to the following:
150 *	vnode_free_list
151 *	numvnodes
152 *	freevnodes
153 */
154static struct mtx vnode_free_list_mtx;
155
156/*
157 * For any iteration/modification of dev->si_hlist (linked through
158 * v_specnext)
159 */
160static struct mtx spechash_mtx;
161
162/* Publicly exported FS */
163struct nfs_public nfs_pub;
164
165/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
166static uma_zone_t vnode_zone;
167static uma_zone_t vnodepoll_zone;
168
169/* Set to 1 to print out reclaim of active vnodes */
170int	prtactive;
171
172/*
173 * The workitem queue.
174 *
175 * It is useful to delay writes of file data and filesystem metadata
176 * for tens of seconds so that quickly created and deleted files need
177 * not waste disk bandwidth being created and removed. To realize this,
178 * we append vnodes to a "workitem" queue. When running with a soft
179 * updates implementation, most pending metadata dependencies should
180 * not wait for more than a few seconds. Thus, filesystems mounted on block
181 * devices are delayed only about half the time that file data is delayed.
182 * Similarly, directory updates are more critical, so are only delayed
183 * about a third the time that file data is delayed. Thus, there are
184 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
185 * one each second (driven off the filesystem syncer process). The
186 * syncer_delayno variable indicates the next queue that is to be processed.
187 * Items that need to be processed soon are placed in this queue:
188 *
189 *	syncer_workitem_pending[syncer_delayno]
190 *
191 * A delay of fifteen seconds is done by placing the request fifteen
192 * entries later in the queue:
193 *
194 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
195 *
196 */
197static int syncer_delayno;
198static long syncer_mask;
199LIST_HEAD(synclist, vnode);
200static struct synclist *syncer_workitem_pending;
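
/*
 * A minimal sketch of how a delay maps onto one of these queues; this
 * mirrors the computation done in vn_syncer_add_to_worklist() below:
 *
 *	if (delay > syncer_maxdelay - 2)
 *		delay = syncer_maxdelay - 2;
 *	slot = (syncer_delayno + delay) & syncer_mask;
 *	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
 *
 * syncer_mask is one less than the (power of two) table size returned
 * by hashinit(), so the '&' wraps the slot index around the ring of
 * queues that the syncer drains one per second.
 */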
201
202#define SYNCER_MAXDELAY		32
203static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
204static int syncdelay = 30;		/* max time to delay syncing data */
205static int filedelay = 30;		/* time to delay syncing files */
206SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
207static int dirdelay = 29;		/* time to delay syncing directories */
208SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
209static int metadelay = 28;		/* time to delay syncing metadata */
210SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
211static int rushjob;		/* number of slots to run ASAP */
212static int stat_rush_requests;	/* number of times I/O speeded up */
213SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
214
215/*
216 * Number of vnodes we want to exist at any one time.  This is mostly used
217 * to size hash tables in vnode-related code.  It is normally not used in
218 * getnewvnode(), since wantfreevnodes is usually nonzero.
219 *
220 * XXX desiredvnodes is historical cruft and should not exist.
221 */
222int desiredvnodes;
223SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
224    &desiredvnodes, 0, "Maximum number of vnodes");
225static int minvnodes;
226SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
227    &minvnodes, 0, "Minimum number of vnodes");
228static int vnlru_nowhere;
229SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0,
230    "Number of times the vnlru process ran without success");
231
232/* Hook for calling soft updates */
233int (*softdep_process_worklist_hook)(struct mount *);
234
235#ifdef DEBUG_VFS_LOCKS
236/* Print lock violations */
237int vfs_badlock_print = 1;
238
239/* Panic on violation */
240int vfs_badlock_panic = 1;
241
242/* Check for interlock across VOPs */
243int vfs_badlock_mutex = 0;
244
245void
246vop_rename_pre(void *ap)
247{
248	struct vop_rename_args *a = ap;
249
250	if (a->a_tvp)
251		ASSERT_VI_UNLOCKED(a->a_tvp);
252	ASSERT_VI_UNLOCKED(a->a_tdvp);
253	ASSERT_VI_UNLOCKED(a->a_fvp);
254	ASSERT_VI_UNLOCKED(a->a_fdvp);
255
256	/* Check the source (from) */
257	if (a->a_tdvp != a->a_fdvp)
258		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked.\n");
259	if (a->a_tvp != a->a_fvp)
260		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked.\n");
261
262	/* Check the target */
263	if (a->a_tvp)
264		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked.\n");
265
266	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked.\n");
267}
268
269void
270vop_strategy_pre(void *ap)
271{
272	struct vop_strategy_args *a = ap;
273	struct buf *bp;
274
275	bp = a->a_bp;
276
277	/*
278	 * Cluster ops lock their component buffers but not the IO container.
279	 */
280	if ((bp->b_flags & B_CLUSTER) != 0)
281		return;
282
283	if (BUF_REFCNT(bp) < 1) {
284		if (vfs_badlock_print)
285			printf("VOP_STRATEGY: bp is not locked but should be.\n");
286		if (vfs_badlock_panic)
287			Debugger("Lock violation.\n");
288	}
289}
290
291void
292vop_lookup_pre(void *ap)
293{
294	struct vop_lookup_args *a = ap;
295	struct vnode *dvp;
296
297	dvp = a->a_dvp;
298
299	ASSERT_VI_UNLOCKED(dvp);
300	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
301}
302
303void
304vop_lookup_post(void *ap, int rc)
305{
306	struct vop_lookup_args *a = ap;
307	struct componentname *cnp;
308	struct vnode *dvp;
309	struct vnode *vp;
310	int flags;
311
312	dvp = a->a_dvp;
313	cnp = a->a_cnp;
314	vp = *(a->a_vpp);
315	flags = cnp->cn_flags;
316
317
318	ASSERT_VI_UNLOCKED(dvp);
319	/*
320	 * If this is the last path component for this lookup and LOCKPARENT
321	 * is set, or if there is an error, the directory has to be locked.
322	 */
323	if ((flags & LOCKPARENT) && (flags & ISLASTCN))
324		ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (LOCKPARENT)");
325	else if (rc != 0)
326		ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP (error)");
327	else if (dvp != vp)
328		ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (dvp)");
329
330	if (flags & PDIRUNLOCK)
331		ASSERT_VOP_UNLOCKED(dvp, "VOP_LOOKUP (PDIRUNLOCK)");
332
333	if (rc == 0) {
334		ASSERT_VI_UNLOCKED(vp);
335		ASSERT_VOP_LOCKED(vp, "VOP_LOOKUP (vpp)");
336	}
337}
338
339void
340vop_unlock_pre(void *ap)
341{
342	struct vop_unlock_args *a = ap;
343
344	if ((a->a_flags & LK_INTERLOCK) == 0)
345		ASSERT_VI_UNLOCKED(a->a_vp);
346	else
347		ASSERT_VI_LOCKED(a->a_vp);
348}
349
350void
351vop_unlock_post(void *ap, int rc)
352{
353	struct vop_unlock_args *a = ap;
354
355	ASSERT_VI_UNLOCKED(a->a_vp);
356}
357
358void
359vop_lock_pre(void *ap)
360{
361	struct vop_lock_args *a = ap;
362
363	if ((a->a_flags & LK_INTERLOCK) == 0)
364		ASSERT_VI_UNLOCKED(a->a_vp);
365	else
366		ASSERT_VI_LOCKED(a->a_vp);
367}
368
369void
370vop_lock_post(void *ap, int rc)
371{
372	struct vop_lock_args *a = ap;
373
374	ASSERT_VI_UNLOCKED(a->a_vp);
375	ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
376}
377
378#endif	/* DEBUG_VFS_LOCKS */
379
380void
381v_addpollinfo(struct vnode *vp)
382{
383	vp->v_pollinfo = uma_zalloc(vnodepoll_zone, M_WAITOK);
384	mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
385}
386
387/*
388 * Initialize the vnode management data structures.
389 */
390static void
391vntblinit(void *dummy __unused)
392{
393
394	desiredvnodes = maxproc + cnt.v_page_count / 4;
395	minvnodes = desiredvnodes / 4;
396	mtx_init(&mountlist_mtx, "mountlist", NULL, MTX_DEF);
397	mtx_init(&mntvnode_mtx, "mntvnode", NULL, MTX_DEF);
398	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
399	mtx_init(&spechash_mtx, "spechash", NULL, MTX_DEF);
400	TAILQ_INIT(&vnode_free_list);
401	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
402	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
403	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
404	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
405	      NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
406	/*
407	 * Initialize the filesystem syncer.
408	 */
409	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
410		&syncer_mask);
411	syncer_maxdelay = syncer_mask + 1;
412}
413SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)
414
415
416/*
417 * Mark a mount point as busy. Used to synchronize access and to delay
418 * unmounting. Interlock is not released on failure.
419 */
420int
421vfs_busy(mp, flags, interlkp, td)
422	struct mount *mp;
423	int flags;
424	struct mtx *interlkp;
425	struct thread *td;
426{
427	int lkflags;
428
429	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
430		if (flags & LK_NOWAIT)
431			return (ENOENT);
432		mp->mnt_kern_flag |= MNTK_MWAIT;
433		/*
434		 * Since all busy locks are shared except the exclusive
435		 * lock granted when unmounting, the only place that a
436		 * wakeup needs to be done is at the release of the
437		 * exclusive lock at the end of dounmount.
438		 */
439		msleep(mp, interlkp, PVFS, "vfs_busy", 0);
440		return (ENOENT);
441	}
442	lkflags = LK_SHARED | LK_NOPAUSE;
443	if (interlkp)
444		lkflags |= LK_INTERLOCK;
445	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, td))
446		panic("vfs_busy: unexpected lock failure");
447	return (0);
448}
449
450/*
451 * Free a busy filesystem.
452 */
453void
454vfs_unbusy(mp, td)
455	struct mount *mp;
456	struct thread *td;
457{
458
459	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
460}
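
/*
 * Typical usage, sketched; vnlru_proc() below walks the mount list in
 * essentially this way:
 *
 *	mtx_lock(&mountlist_mtx);
 *	if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td) == 0) {
 *		... the mount cannot be unmounted out from under us ...
 *		vfs_unbusy(mp, td);
 *	}
 *
 * On success the interlock has been dropped by lockmgr(); on a
 * LK_NOWAIT failure it is still held, as noted above vfs_busy().
 */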
461
462/*
463 * Lookup a mount point by filesystem identifier.
464 */
465struct mount *
466vfs_getvfs(fsid)
467	fsid_t *fsid;
468{
469	register struct mount *mp;
470
471	mtx_lock(&mountlist_mtx);
472	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
473		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
474		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
475			mtx_unlock(&mountlist_mtx);
476			return (mp);
477	    }
478	}
479	mtx_unlock(&mountlist_mtx);
480	return ((struct mount *) 0);
481}
482
483/*
484 * Get a new unique fsid.  Try to make its val[0] unique, since this value
485 * will be used to create fake device numbers for stat().  Also try (but
486 * not so hard) make its val[0] unique mod 2^16, since some emulators only
487 * support 16-bit device numbers.  We end up with unique val[0]'s for the
488 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
489 *
490 * Keep in mind that several mounts may be running in parallel.  Starting
491 * the search one past where the previous search terminated is both a
492 * micro-optimization and a defense against returning the same fsid to
493 * different mounts.
494 */
495void
496vfs_getnewfsid(mp)
497	struct mount *mp;
498{
499	static u_int16_t mntid_base;
500	fsid_t tfsid;
501	int mtype;
502
503	mtx_lock(&mntid_mtx);
504	mtype = mp->mnt_vfc->vfc_typenum;
505	tfsid.val[1] = mtype;
506	mtype = (mtype & 0xFF) << 24;
507	for (;;) {
508		tfsid.val[0] = makeudev(255,
509		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
510		mntid_base++;
511		if (vfs_getvfs(&tfsid) == NULL)
512			break;
513	}
514	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
515	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
516	mtx_unlock(&mntid_mtx);
517}
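
/*
 * A rough worked example of the loop above (illustrative only; the
 * final bit layout also depends on makeudev()): with vfc_typenum 1
 * and mntid_base 0x1234, the minor argument becomes
 *
 *	(0x01 << 24) | (0x12 << 16) | 0x34
 *
 * so successive mounts differ in the mntid_base-derived bits, val[1]
 * carries the raw vfc_typenum, and the vfs_getvfs() probe rejects any
 * fsid that is already in use before it is handed to the mount.
 */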
518
519/*
520 * Knob to control the precision of file timestamps:
521 *
522 *   0 = seconds only; nanoseconds zeroed.
523 *   1 = seconds and nanoseconds, accurate within 1/HZ.
524 *   2 = seconds and nanoseconds, truncated to microseconds.
525 * >=3 = seconds and nanoseconds, maximum precision.
526 */
527enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
528
529static int timestamp_precision = TSP_SEC;
530SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
531    &timestamp_precision, 0, "");
532
533/*
534 * Get a current timestamp.
535 */
536void
537vfs_timestamp(tsp)
538	struct timespec *tsp;
539{
540	struct timeval tv;
541
542	switch (timestamp_precision) {
543	case TSP_SEC:
544		tsp->tv_sec = time_second;
545		tsp->tv_nsec = 0;
546		break;
547	case TSP_HZ:
548		getnanotime(tsp);
549		break;
550	case TSP_USEC:
551		microtime(&tv);
552		TIMEVAL_TO_TIMESPEC(&tv, tsp);
553		break;
554	case TSP_NSEC:
555	default:
556		nanotime(tsp);
557		break;
558	}
559}
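
/*
 * The knob above is exported as the sysctl vfs.timestamp_precision,
 * so (for example) full-precision timestamps can be requested from
 * userland with:
 *
 *	sysctl vfs.timestamp_precision=3	(TSP_NSEC)
 *
 * The default of 0 (TSP_SEC) keeps the cheap seconds-only behaviour
 * handled first in the switch above.
 */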
560
561/*
562 * Set vnode attributes to VNOVAL
563 */
564void
565vattr_null(vap)
566	register struct vattr *vap;
567{
568
569	vap->va_type = VNON;
570	vap->va_size = VNOVAL;
571	vap->va_bytes = VNOVAL;
572	vap->va_mode = VNOVAL;
573	vap->va_nlink = VNOVAL;
574	vap->va_uid = VNOVAL;
575	vap->va_gid = VNOVAL;
576	vap->va_fsid = VNOVAL;
577	vap->va_fileid = VNOVAL;
578	vap->va_blocksize = VNOVAL;
579	vap->va_rdev = VNOVAL;
580	vap->va_atime.tv_sec = VNOVAL;
581	vap->va_atime.tv_nsec = VNOVAL;
582	vap->va_mtime.tv_sec = VNOVAL;
583	vap->va_mtime.tv_nsec = VNOVAL;
584	vap->va_ctime.tv_sec = VNOVAL;
585	vap->va_ctime.tv_nsec = VNOVAL;
586	vap->va_birthtime.tv_sec = VNOVAL;
587	vap->va_birthtime.tv_nsec = VNOVAL;
588	vap->va_flags = VNOVAL;
589	vap->va_gen = VNOVAL;
590	vap->va_vaflags = 0;
591}
592
593/*
594 * This routine is called when we have too many vnodes.  It attempts
595 * to free <count> vnodes and will potentially free vnodes that still
596 * have VM backing store (VM backing store is typically the cause
597 * of a vnode blowout so we want to do this).  Therefore, this operation
598 * is not considered cheap.
599 *
600 * A number of conditions may prevent a vnode from being reclaimed.
601 * The buffer cache may have references on the vnode, a directory
602 * vnode may still have references due to the namei cache representing
603 * underlying files, or the vnode may be in active use.  It is not
604 * desirable to reuse such vnodes.  These conditions may cause the
605 * number of vnodes to reach some minimum value regardless of what
606 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
607 */
608static int
609vlrureclaim(struct mount *mp, int count)
610{
611	struct vnode *vp;
612	int done;
613	int trigger;
614	int usevnodes;
615
616	/*
617	 * Calculate the trigger point; don't allow user
618	 * screwups to blow us up.  This prevents us from
619	 * recycling vnodes with lots of resident pages.  We
620	 * aren't trying to free memory, we are trying to
621	 * free vnodes.
622	 */
623	usevnodes = desiredvnodes;
624	if (usevnodes <= 0)
625		usevnodes = 1;
626	trigger = cnt.v_page_count * 2 / usevnodes;
627
628	done = 0;
629	mtx_lock(&mntvnode_mtx);
630	while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
631		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
632		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
633
634		if (vp->v_type != VNON &&
635		    vp->v_type != VBAD &&
636		    VI_TRYLOCK(vp)) {
637			if (VMIGHTFREE(vp) &&           /* critical path opt */
638			    (vp->v_object == NULL ||
639			    vp->v_object->resident_page_count < trigger)) {
640				mtx_unlock(&mntvnode_mtx);
641				vgonel(vp, curthread);
642				done++;
643				mtx_lock(&mntvnode_mtx);
644			} else
645				VI_UNLOCK(vp);
646		}
647		--count;
648	}
649	mtx_unlock(&mntvnode_mtx);
650	return done;
651}
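
/*
 * Rough numbers, purely for illustration: with cnt.v_page_count at
 * 262144 (1GB of 4K pages) and desiredvnodes at 32768, the trigger
 * above is 262144 * 2 / 32768 = 16, so only vnodes with fewer than
 * 16 resident pages (or no VM object at all) are recycled here.
 */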
652
653/*
654 * Attempt to recycle vnodes in a context that is always safe to block.
655 * Calling vlrureclaim() from the bowels of filesystem code has some
656 * interesting deadlock problems.
657 */
658static struct proc *vnlruproc;
659static int vnlruproc_sig;
660
661static void
662vnlru_proc(void)
663{
664	struct mount *mp, *nmp;
665	int s;
666	int done;
667	struct proc *p = vnlruproc;
668	struct thread *td = FIRST_THREAD_IN_PROC(p);	/* XXXKSE */
669
670	mtx_lock(&Giant);
671
672	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
673	    SHUTDOWN_PRI_FIRST);
674
675	s = splbio();
676	for (;;) {
677		kthread_suspend_check(p);
678		mtx_lock(&vnode_free_list_mtx);
679		if (numvnodes - freevnodes <= desiredvnodes * 9 / 10) {
680			mtx_unlock(&vnode_free_list_mtx);
681			vnlruproc_sig = 0;
682			tsleep(vnlruproc, PVFS, "vlruwt", 0);
683			continue;
684		}
685		mtx_unlock(&vnode_free_list_mtx);
686		done = 0;
687		mtx_lock(&mountlist_mtx);
688		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
689			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
690				nmp = TAILQ_NEXT(mp, mnt_list);
691				continue;
692			}
693			done += vlrureclaim(mp, 10);
694			mtx_lock(&mountlist_mtx);
695			nmp = TAILQ_NEXT(mp, mnt_list);
696			vfs_unbusy(mp, td);
697		}
698		mtx_unlock(&mountlist_mtx);
699		if (done == 0) {
700#if 0
701			/* These messages are temporary debugging aids */
702			if (vnlru_nowhere < 5)
703				printf("vnlru process getting nowhere..\n");
704			else if (vnlru_nowhere == 5)
705				printf("vnlru process messages stopped.\n");
706#endif
707			vnlru_nowhere++;
708			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
709		}
710	}
711	splx(s);
712}
713
714static struct kproc_desc vnlru_kp = {
715	"vnlru",
716	vnlru_proc,
717	&vnlruproc
718};
719SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
720
721
722/*
723 * Routines having to do with the management of the vnode table.
724 */
725
726/*
727 * Check to see if a free vnode can be recycled.  If it can, return it locked
728 * with the vnode lock held, but not the interlock.  Otherwise indicate the error.
729 */
730static int
731vcanrecycle(struct vnode *vp)
732{
733	struct thread *td = curthread;
734	vm_object_t object;
735	int error;
736
737	/* Don't recycle if we can't get the interlock */
738	if (!mtx_trylock(&vp->v_interlock))
739		return (EWOULDBLOCK);
740
741	/* We should be able to immediately acquire this */
742	/* XXX This looks like it should panic if it fails */
743	if (vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td) != 0)
744		return (EWOULDBLOCK);
745
746	/*
747	 * Don't recycle if we still have cached pages.
748	 */
749	if (VOP_GETVOBJECT(vp, &object) == 0 &&
750	     (object->resident_page_count ||
751	      object->ref_count)) {
752		error = EBUSY;
753		goto done;
754	}
755	if (LIST_FIRST(&vp->v_cache_src)) {
756		/*
757		 * note: nameileafonly sysctl is temporary,
758		 * for debugging only, and will eventually be
759		 * removed.
760		 */
761		if (nameileafonly > 0) {
762			/*
763			 * Do not reuse namei-cached directory
764			 * vnodes that have cached
765			 * subdirectories.
766			 */
767			if (cache_leaf_test(vp) < 0) {
768				error = EISDIR;
769				goto done;
770			}
771		} else if (nameileafonly < 0 ||
772			    vmiodirenable == 0) {
773			/*
774			 * Do not reuse namei-cached directory
775			 * vnodes if nameileafonly is -1 or
776			 * if VMIO backing for directories is
777			 * turned off (otherwise we reuse them
778			 * too quickly).
779			 */
780			error = EBUSY;
781			goto done;
782		}
783	}
784	return (0);
785done:
786	VOP_UNLOCK(vp, 0, td);
787	return (error);
788}
789
790/*
791 * Return the next vnode from the free list.
792 */
793int
794getnewvnode(tag, mp, vops, vpp)
795	enum vtagtype tag;
796	struct mount *mp;
797	vop_t **vops;
798	struct vnode **vpp;
799{
800	int s;
801	struct thread *td = curthread;	/* XXX */
802	struct vnode *vp = NULL;
803	struct mount *vnmp;
804
805	s = splbio();
806	mtx_lock(&vnode_free_list_mtx);
807
808	/*
809	 * Try to reuse vnodes if we hit the max.  This case only
810	 * occurs in certain large-memory (2G+) situations.  We cannot
811	 * attempt to directly reclaim vnodes due to nasty recursion
812	 * problems.
813	 */
814	if (vnlruproc_sig == 0 && numvnodes - freevnodes > desiredvnodes) {
815		vnlruproc_sig = 1;      /* avoid unnecessary wakeups */
816		wakeup(vnlruproc);
817	}
818
819	/*
820	 * Attempt to reuse a vnode already on the free list, allocating
821	 * a new vnode if we can't find one or if we have not reached a
822	 * good minimum for good LRU performance.
823	 */
824
825	if (freevnodes >= wantfreevnodes && numvnodes >= minvnodes) {
826		int error;
827		int count;
828
829		for (count = 0; count < freevnodes; count++) {
830			vp = TAILQ_FIRST(&vnode_free_list);
831
832			KASSERT(vp->v_usecount == 0,
833			    ("getnewvnode: free vnode isn't"));
834
835			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
836			/*
837			 * We have to drop the free list mtx to avoid lock
838			 * order reversals with interlock.
839			 */
840			mtx_unlock(&vnode_free_list_mtx);
841			error = vcanrecycle(vp);
842			/*
843			 * Skip over it if its filesystem is being suspended.
844			 */
845			if (error == 0 &&
846			    vn_start_write(vp, &vnmp, V_NOWAIT) != 0)
847				error = EBUSY;
848
849			mtx_lock(&vnode_free_list_mtx);
850			if (error != 0)
851				TAILQ_INSERT_TAIL(&vnode_free_list, vp,
852				    v_freelist);
853			else
854				break;
855		}
856	}
857	/*
858	 * Unlocked access to this vp is ok because we are assured that there
859	 * are no other references to it.
860	 */
861	if (vp) {
862		freevnodes--;
863		mtx_unlock(&vnode_free_list_mtx);
864
865		vp->v_iflag |= VI_DOOMED;
866		vp->v_iflag &= ~VI_FREE;
867		cache_purge(vp);
868		if (vp->v_type != VBAD) {
869			VOP_UNLOCK(vp, 0, td);
870			vgone(vp);
871		} else {
872			VOP_UNLOCK(vp, 0, td);
873		}
874		vn_finished_write(vnmp);
875
876#ifdef INVARIANTS
877		{
878			if (vp->v_data)
879				panic("cleaned vnode isn't");
880			VI_LOCK(vp);
881			if (vp->v_numoutput)
882				panic("Clean vnode has pending I/O's");
883			if (vp->v_writecount != 0)
884				panic("Non-zero write count");
885			VI_UNLOCK(vp);
886		}
887#endif
888		if (vp->v_pollinfo) {
889			mtx_destroy(&vp->v_pollinfo->vpi_lock);
890			uma_zfree(vnodepoll_zone, vp->v_pollinfo);
891		}
892		vp->v_pollinfo = NULL;
893#ifdef MAC
894		mac_destroy_vnode(vp);
895#endif
896		vp->v_iflag = 0;
897		vp->v_vflag = 0;
898		vp->v_lastw = 0;
899		vp->v_lasta = 0;
900		vp->v_cstart = 0;
901		vp->v_clen = 0;
902		vp->v_socket = 0;
903		KASSERT(vp->v_cleanblkroot == NULL, ("cleanblkroot not NULL"));
904		KASSERT(vp->v_dirtyblkroot == NULL, ("dirtyblkroot not NULL"));
905	} else {
906		numvnodes++;
907		mtx_unlock(&vnode_free_list_mtx);
908
909		vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
910		mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
911		vp->v_dd = vp;
912		cache_purge(vp);
913		LIST_INIT(&vp->v_cache_src);
914		TAILQ_INIT(&vp->v_cache_dst);
915	}
916
917	TAILQ_INIT(&vp->v_cleanblkhd);
918	TAILQ_INIT(&vp->v_dirtyblkhd);
919	vp->v_type = VNON;
920	vp->v_tag = tag;
921	vp->v_op = vops;
922	lockinit(&vp->v_lock, PVFS, "vnlock", VLKTIMEOUT, LK_NOPAUSE);
923#ifdef MAC
924	mac_init_vnode(vp);
925#endif
926	insmntque(vp, mp);
927	*vpp = vp;
928	vp->v_usecount = 1;
929	vp->v_data = 0;
930	vp->v_cachedid = -1;
931
932	splx(s);
933
934	return (0);
935}
936
937/*
938 * Move a vnode from one mount queue to another.
939 */
940static void
941insmntque(vp, mp)
942	register struct vnode *vp;
943	register struct mount *mp;
944{
945
946	mtx_lock(&mntvnode_mtx);
947	/*
948	 * Delete from old mount point vnode list, if on one.
949	 */
950	if (vp->v_mount != NULL)
951		TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes);
952	/*
953	 * Insert into list of vnodes for the new mount point, if available.
954	 */
955	if ((vp->v_mount = mp) == NULL) {
956		mtx_unlock(&mntvnode_mtx);
957		return;
958	}
959	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
960	mtx_unlock(&mntvnode_mtx);
961}
962
963/*
964 * Update outstanding I/O count and do wakeup if requested.
965 */
966void
967vwakeup(bp)
968	register struct buf *bp;
969{
970	register struct vnode *vp;
971
972	bp->b_flags &= ~B_WRITEINPROG;
973	if ((vp = bp->b_vp)) {
974		VI_LOCK(vp);
975		vp->v_numoutput--;
976		if (vp->v_numoutput < 0)
977			panic("vwakeup: neg numoutput");
978		if ((vp->v_numoutput == 0) && (vp->v_iflag & VI_BWAIT)) {
979			vp->v_iflag &= ~VI_BWAIT;
980			wakeup(&vp->v_numoutput);
981		}
982		VI_UNLOCK(vp);
983	}
984}
985
986/*
987 * Flush out and invalidate all buffers associated with a vnode.
988 * Called with the underlying object locked.
989 */
990int
991vinvalbuf(vp, flags, cred, td, slpflag, slptimeo)
992	struct vnode *vp;
993	int flags;
994	struct ucred *cred;
995	struct thread *td;
996	int slpflag, slptimeo;
997{
998	struct buf *blist;
999	int s, error;
1000	vm_object_t object;
1001
1002	GIANT_REQUIRED;
1003
1004	if (flags & V_SAVE) {
1005		s = splbio();
1006		VI_LOCK(vp);
1007		while (vp->v_numoutput) {
1008			vp->v_iflag |= VI_BWAIT;
1009			error = msleep(&vp->v_numoutput, VI_MTX(vp),
1010			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
1011			if (error) {
1012				VI_UNLOCK(vp);
1013				splx(s);
1014				return (error);
1015			}
1016		}
1017		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
1018			splx(s);
1019			VI_UNLOCK(vp);
1020			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, td)) != 0)
1021				return (error);
1022			/*
1023			 * XXX We could save a lock/unlock if this was only
1024			 * enabled under INVARIANTS
1025			 */
1026			VI_LOCK(vp);
1027			s = splbio();
1028			if (vp->v_numoutput > 0 ||
1029			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
1030				panic("vinvalbuf: dirty bufs");
1031		}
1032		VI_UNLOCK(vp);
1033		splx(s);
1034	}
1035	s = splbio();
1036	for (error = 0;;) {
1037		if ((blist = TAILQ_FIRST(&vp->v_cleanblkhd)) != 0 &&
1038		    flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
1039			if (error)
1040				break;
1041			continue;
1042		}
1043		if ((blist = TAILQ_FIRST(&vp->v_dirtyblkhd)) != 0 &&
1044		    flushbuflist(blist, flags, vp, slpflag, slptimeo, &error)) {
1045			if (error)
1046				break;
1047			continue;
1048		}
1049		break;
1050	}
1051	if (error) {
1052		splx(s);
1053		return (error);
1054	}
1055
1056	/*
1057	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
1058	 * have write I/O in-progress but if there is a VM object then the
1059	 * VM object can also have read-I/O in-progress.
1060	 */
1061	VI_LOCK(vp);
1062	do {
1063		while (vp->v_numoutput > 0) {
1064			vp->v_iflag |= VI_BWAIT;
1065			msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vnvlbv", 0);
1066		}
1067		VI_UNLOCK(vp);
1068		if (VOP_GETVOBJECT(vp, &object) == 0) {
1069			while (object->paging_in_progress)
1070				vm_object_pip_sleep(object, "vnvlbx");
1071		}
1072		VI_LOCK(vp);
1073	} while (vp->v_numoutput > 0);
1074	VI_UNLOCK(vp);
1075
1076	splx(s);
1077
1078	/*
1079	 * Destroy the copy in the VM cache, too.
1080	 */
1081	if (VOP_GETVOBJECT(vp, &object) == 0) {
1082		vm_object_page_remove(object, 0, 0,
1083			(flags & V_SAVE) ? TRUE : FALSE);
1084	}
1085
1086	if ((flags & (V_ALT | V_NORMAL)) == 0 &&
1087	    (!TAILQ_EMPTY(&vp->v_dirtyblkhd) ||
1088	     !TAILQ_EMPTY(&vp->v_cleanblkhd)))
1089		panic("vinvalbuf: flush failed");
1090	return (0);
1091}
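
/*
 * A typical call, as made by filesystems when a vnode is being
 * reclaimed (illustrative; slpflag and slptimeo of 0 mean sleep
 * without timeout):
 *
 *	error = vinvalbuf(vp, V_SAVE, cred, td, 0, 0);
 *
 * V_SAVE pushes dirty buffers out through VOP_FSYNC() first; a flags
 * value of 0 simply invalidates both clean and dirty buffers.
 */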
1092
1093/*
1094 * Flush out buffers on the specified list.
1095 */
1096static int
1097flushbuflist(blist, flags, vp, slpflag, slptimeo, errorp)
1098	struct buf *blist;
1099	int flags;
1100	struct vnode *vp;
1101	int slpflag, slptimeo;
1102	int *errorp;
1103{
1104	struct buf *bp, *nbp;
1105	int found, error;
1106
1107	for (found = 0, bp = blist; bp; bp = nbp) {
1108		nbp = TAILQ_NEXT(bp, b_vnbufs);
1109		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1110		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))
1111			continue;
1112		found += 1;
1113		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
1114			error = BUF_TIMELOCK(bp,
1115			    LK_EXCLUSIVE | LK_SLEEPFAIL,
1116			    "flushbuf", slpflag, slptimeo);
1117			if (error != ENOLCK)
1118				*errorp = error;
1119			return (found);
1120		}
1121		/*
1122		 * XXX Since there are no node locks for NFS, I
1123		 * believe there is a slight chance that a delayed
1124		 * write will occur while sleeping just above, so
1125		 * check for it.  Note that vfs_bio_awrite expects
1126		 * buffers to reside on a queue, while BUF_WRITE and
1127		 * brelse do not.
1128		 */
1129		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1130			(flags & V_SAVE)) {
1131
1132			if (bp->b_vp == vp) {
1133				if (bp->b_flags & B_CLUSTEROK) {
1134					BUF_UNLOCK(bp);
1135					vfs_bio_awrite(bp);
1136				} else {
1137					bremfree(bp);
1138					bp->b_flags |= B_ASYNC;
1139					BUF_WRITE(bp);
1140				}
1141			} else {
1142				bremfree(bp);
1143				(void) BUF_WRITE(bp);
1144			}
1145			return (found);
1146		}
1147		bremfree(bp);
1148		bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
1149		bp->b_flags &= ~B_ASYNC;
1150		brelse(bp);
1151	}
1152	return (found);
1153}
1154
1155/*
1156 * Truncate a file's buffer and pages to a specified length.  This
1157 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1158 * sync activity.
1159 */
1160int
1161vtruncbuf(vp, cred, td, length, blksize)
1162	register struct vnode *vp;
1163	struct ucred *cred;
1164	struct thread *td;
1165	off_t length;
1166	int blksize;
1167{
1168	register struct buf *bp;
1169	struct buf *nbp;
1170	int s, anyfreed;
1171	int trunclbn;
1172
1173	/*
1174	 * Round up to the *next* lbn.
1175	 */
1176	trunclbn = (length + blksize - 1) / blksize;
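	/*
	 * For illustration: with blksize 16384, a length of 1 gives
	 * trunclbn = (1 + 16383) / 16384 = 1, so block 0 (which still
	 * holds that byte) is kept and blocks >= 1 are flushed below,
	 * while a length of 0 gives trunclbn = 0 and every block goes.
	 */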
1177
1178	s = splbio();
1179restart:
1180	anyfreed = 1;
1181	for (;anyfreed;) {
1182		anyfreed = 0;
1183		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
1184			nbp = TAILQ_NEXT(bp, b_vnbufs);
1185			if (bp->b_lblkno >= trunclbn) {
1186				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
1187					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
1188					goto restart;
1189				} else {
1190					bremfree(bp);
1191					bp->b_flags |= (B_INVAL | B_RELBUF);
1192					bp->b_flags &= ~B_ASYNC;
1193					brelse(bp);
1194					anyfreed = 1;
1195				}
1196				if (nbp &&
1197				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1198				    (nbp->b_vp != vp) ||
1199				    (nbp->b_flags & B_DELWRI))) {
1200					goto restart;
1201				}
1202			}
1203		}
1204
1205		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1206			nbp = TAILQ_NEXT(bp, b_vnbufs);
1207			if (bp->b_lblkno >= trunclbn) {
1208				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
1209					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
1210					goto restart;
1211				} else {
1212					bremfree(bp);
1213					bp->b_flags |= (B_INVAL | B_RELBUF);
1214					bp->b_flags &= ~B_ASYNC;
1215					brelse(bp);
1216					anyfreed = 1;
1217				}
1218				if (nbp &&
1219				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1220				    (nbp->b_vp != vp) ||
1221				    (nbp->b_flags & B_DELWRI) == 0)) {
1222					goto restart;
1223				}
1224			}
1225		}
1226	}
1227
1228	if (length > 0) {
1229restartsync:
1230		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
1231			nbp = TAILQ_NEXT(bp, b_vnbufs);
1232			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
1233				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
1234					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
1235					goto restart;
1236				} else {
1237					bremfree(bp);
1238					if (bp->b_vp == vp) {
1239						bp->b_flags |= B_ASYNC;
1240					} else {
1241						bp->b_flags &= ~B_ASYNC;
1242					}
1243					BUF_WRITE(bp);
1244				}
1245				goto restartsync;
1246			}
1247
1248		}
1249	}
1250
1251	VI_LOCK(vp);
1252	while (vp->v_numoutput > 0) {
1253		vp->v_iflag |= VI_BWAIT;
1254		msleep(&vp->v_numoutput, VI_MTX(vp), PVM, "vbtrunc", 0);
1255	}
1256	VI_UNLOCK(vp);
1257	splx(s);
1258
1259	vnode_pager_setsize(vp, length);
1260
1261	return (0);
1262}
1263
1264/*
1265 * buf_splay() - splay tree core for the clean/dirty list of buffers in
1266 * 		 a vnode.
1267 *
1268 *	NOTE: We have to deal with the special case of a background bitmap
1269 *	buffer, a situation where two buffers will have the same logical
1270 *	block offset.  We want (1) only the foreground buffer to be accessed
1271 *	in a lookup and (2) to differentiate between the foreground and
1272 *	background buffer in the splay tree algorithm because the splay
1273 *	tree cannot normally handle multiple entities with the same 'index'.
1274 *	We accomplish this by adding differentiating flags to the splay tree's
1275 *	numerical domain.
1276 */
1277static
1278struct buf *
1279buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
1280{
1281	struct buf dummy;
1282	struct buf *lefttreemax, *righttreemin, *y;
1283
1284	if (root == NULL)
1285		return (NULL);
1286	lefttreemax = righttreemin = &dummy;
1287	for (;;) {
1288		if (lblkno < root->b_lblkno ||
1289		    (lblkno == root->b_lblkno &&
1290		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1291			if ((y = root->b_left) == NULL)
1292				break;
1293			if (lblkno < y->b_lblkno) {
1294				/* Rotate right. */
1295				root->b_left = y->b_right;
1296				y->b_right = root;
1297				root = y;
1298				if ((y = root->b_left) == NULL)
1299					break;
1300			}
1301			/* Link into the new root's right tree. */
1302			righttreemin->b_left = root;
1303			righttreemin = root;
1304		} else if (lblkno > root->b_lblkno ||
1305		    (lblkno == root->b_lblkno &&
1306		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
1307			if ((y = root->b_right) == NULL)
1308				break;
1309			if (lblkno > y->b_lblkno) {
1310				/* Rotate left. */
1311				root->b_right = y->b_left;
1312				y->b_left = root;
1313				root = y;
1314				if ((y = root->b_right) == NULL)
1315					break;
1316			}
1317			/* Link into the new root's left tree. */
1318			lefttreemax->b_right = root;
1319			lefttreemax = root;
1320		} else {
1321			break;
1322		}
1323		root = y;
1324	}
1325	/* Assemble the new root. */
1326	lefttreemax->b_right = root->b_left;
1327	righttreemin->b_left = root->b_right;
1328	root->b_left = dummy.b_right;
1329	root->b_right = dummy.b_left;
1330	return (root);
1331}
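
/*
 * A minimal sketch of how the splayed tree is consulted; this mirrors
 * what gbincore() and buf_vlist_remove() do further down:
 *
 *	root = buf_splay(lblkno, 0, vp->v_cleanblkroot);
 *	vp->v_cleanblkroot = root;
 *	if (root != NULL && root->b_lblkno == lblkno &&
 *	    (root->b_xflags & BX_BKGRDMARKER) == 0)
 *		return (root);		(the foreground buffer)
 *
 * The BX_BKGRDMARKER bit acts as a tie-breaker in the key, so a
 * background bitmap buffer with the same lblkno sorts after its
 * foreground counterpart and is never returned by a plain lookup.
 */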
1332
1333static
1334void
1335buf_vlist_remove(struct buf *bp)
1336{
1337	struct vnode *vp = bp->b_vp;
1338	struct buf *root;
1339
1340	if (bp->b_xflags & BX_VNDIRTY) {
1341		if (bp != vp->v_dirtyblkroot) {
1342			root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_dirtyblkroot);
1343			KASSERT(root == bp, ("splay lookup failed during dirty remove"));
1344		}
1345		if (bp->b_left == NULL) {
1346			root = bp->b_right;
1347		} else {
1348			root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
1349			root->b_right = bp->b_right;
1350		}
1351		vp->v_dirtyblkroot = root;
1352		TAILQ_REMOVE(&vp->v_dirtyblkhd, bp, b_vnbufs);
1353	} else {
1354		/* KASSERT(bp->b_xflags & BX_VNCLEAN, ("bp wasn't clean")); */
1355		if (bp != vp->v_cleanblkroot) {
1356			root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_cleanblkroot);
1357			KASSERT(root == bp, ("splay lookup failed during clean remove"));
1358		}
1359		if (bp->b_left == NULL) {
1360			root = bp->b_right;
1361		} else {
1362			root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
1363			root->b_right = bp->b_right;
1364		}
1365		vp->v_cleanblkroot = root;
1366		TAILQ_REMOVE(&vp->v_cleanblkhd, bp, b_vnbufs);
1367	}
1368	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1369}
1370
1371/*
1372 * Add the buffer to the sorted clean or dirty block list using a
1373 * splay tree algorithm.
1374 *
1375 * NOTE: xflags is passed as a constant, optimizing this inline function!
1376 */
1377static
1378void
1379buf_vlist_add(struct buf *bp, struct vnode *vp, b_xflags_t xflags)
1380{
1381	struct buf *root;
1382
1383	bp->b_xflags |= xflags;
1384	if (xflags & BX_VNDIRTY) {
1385		root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_dirtyblkroot);
1386		if (root == NULL) {
1387			bp->b_left = NULL;
1388			bp->b_right = NULL;
1389			TAILQ_INSERT_TAIL(&vp->v_dirtyblkhd, bp, b_vnbufs);
1390		} else if (bp->b_lblkno < root->b_lblkno ||
1391		    (bp->b_lblkno == root->b_lblkno &&
1392		    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1393			bp->b_left = root->b_left;
1394			bp->b_right = root;
1395			root->b_left = NULL;
1396			TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
1397		} else {
1398			bp->b_right = root->b_right;
1399			bp->b_left = root;
1400			root->b_right = NULL;
1401			TAILQ_INSERT_AFTER(&vp->v_dirtyblkhd,
1402			    root, bp, b_vnbufs);
1403		}
1404		vp->v_dirtyblkroot = bp;
1405	} else {
1406		/* KASSERT(xflags & BX_VNCLEAN, ("xflags not clean")); */
1407		root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_cleanblkroot);
1408		if (root == NULL) {
1409			bp->b_left = NULL;
1410			bp->b_right = NULL;
1411			TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
1412		} else if (bp->b_lblkno < root->b_lblkno ||
1413		    (bp->b_lblkno == root->b_lblkno &&
1414		    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1415			bp->b_left = root->b_left;
1416			bp->b_right = root;
1417			root->b_left = NULL;
1418			TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
1419		} else {
1420			bp->b_right = root->b_right;
1421			bp->b_left = root;
1422			root->b_right = NULL;
1423			TAILQ_INSERT_AFTER(&vp->v_cleanblkhd,
1424			    root, bp, b_vnbufs);
1425		}
1426		vp->v_cleanblkroot = bp;
1427	}
1428}
1429
1430#ifndef USE_BUFHASH
1431
1432/*
1433 * Lookup a buffer using the splay tree.  Note that we specifically avoid
1434 * shadow buffers used in background bitmap writes.
1435 *
1436 * This code isn't quite as efficient as it could be because we are maintaining
1437 * two sorted lists and do not know which list the block resides in.
1438 */
1439struct buf *
1440gbincore(struct vnode *vp, daddr_t lblkno)
1441{
1442	struct buf *bp;
1443
1444	GIANT_REQUIRED;
1445
1446	bp = vp->v_cleanblkroot = buf_splay(lblkno, 0, vp->v_cleanblkroot);
1447	if (bp && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1448		return(bp);
1449	bp = vp->v_dirtyblkroot = buf_splay(lblkno, 0, vp->v_dirtyblkroot);
1450	if (bp && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1451		return(bp);
1452	return(NULL);
1453}
1454
1455#endif
1456
1457/*
1458 * Associate a buffer with a vnode.
1459 */
1460void
1461bgetvp(vp, bp)
1462	register struct vnode *vp;
1463	register struct buf *bp;
1464{
1465	int s;
1466
1467	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
1468
1469	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1470	    ("bgetvp: bp already attached! %p", bp));
1471
1472	vhold(vp);
1473	bp->b_vp = vp;
1474	bp->b_dev = vn_todev(vp);
1475	/*
1476	 * Insert onto list for new vnode.
1477	 */
1478	s = splbio();
1479	buf_vlist_add(bp, vp, BX_VNCLEAN);
1480	splx(s);
1481}
1482
1483/*
1484 * Disassociate a buffer from a vnode.
1485 */
1486void
1487brelvp(bp)
1488	register struct buf *bp;
1489{
1490	struct vnode *vp;
1491	int s;
1492
1493	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1494
1495	/*
1496	 * Delete from old vnode list, if on one.
1497	 */
1498	vp = bp->b_vp;
1499	s = splbio();
1500	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1501		buf_vlist_remove(bp);
1502	VI_LOCK(vp);
1503	if ((vp->v_iflag & VI_ONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
1504		vp->v_iflag &= ~VI_ONWORKLST;
1505		LIST_REMOVE(vp, v_synclist);
1506	}
1507	VI_UNLOCK(vp);
1508	splx(s);
1509	bp->b_vp = (struct vnode *) 0;
1510	vdrop(vp);
1511	if (bp->b_object)
1512		bp->b_object = NULL;
1513}
1514
1515/*
1516 * Add an item to the syncer work queue.
1517 */
1518static void
1519vn_syncer_add_to_worklist(struct vnode *vp, int delay)
1520{
1521	int s, slot;
1522
1523	s = splbio();
1524	ASSERT_VI_LOCKED(vp);
1525
1526	if (vp->v_iflag & VI_ONWORKLST)
1527		LIST_REMOVE(vp, v_synclist);
1528	else
1529		vp->v_iflag |= VI_ONWORKLST;
1530
1531	if (delay > syncer_maxdelay - 2)
1532		delay = syncer_maxdelay - 2;
1533	slot = (syncer_delayno + delay) & syncer_mask;
1534
1535	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
1536
1537	splx(s);
1538}
1539
1540struct  proc *updateproc;
1541static void sched_sync(void);
1542static struct kproc_desc up_kp = {
1543	"syncer",
1544	sched_sync,
1545	&updateproc
1546};
1547SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
1548
1549/*
1550 * System filesystem synchronizer daemon.
1551 */
1552void
1553sched_sync(void)
1554{
1555	struct synclist *slp;
1556	struct vnode *vp;
1557	struct mount *mp;
1558	long starttime;
1559	int s;
1560	struct thread *td = FIRST_THREAD_IN_PROC(updateproc);  /* XXXKSE */
1561
1562	mtx_lock(&Giant);
1563
1564	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, td->td_proc,
1565	    SHUTDOWN_PRI_LAST);
1566
1567	for (;;) {
1568		kthread_suspend_check(td->td_proc);
1569
1570		starttime = time_second;
1571
1572		/*
1573		 * Push files whose dirty time has expired.  Be careful
1574		 * of interrupt race on slp queue.
1575		 */
1576		s = splbio();
1577		slp = &syncer_workitem_pending[syncer_delayno];
1578		syncer_delayno += 1;
1579		if (syncer_delayno == syncer_maxdelay)
1580			syncer_delayno = 0;
1581		splx(s);
1582
1583		while ((vp = LIST_FIRST(slp)) != NULL) {
1584			if (VOP_ISLOCKED(vp, NULL) == 0 &&
1585			    vn_start_write(vp, &mp, V_NOWAIT) == 0) {
1586				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1587				(void) VOP_FSYNC(vp, td->td_ucred, MNT_LAZY, td);
1588				VOP_UNLOCK(vp, 0, td);
1589				vn_finished_write(mp);
1590			}
1591			s = splbio();
1592			if (LIST_FIRST(slp) == vp) {
1593				/*
1594				 * Note: v_tag VT_VFS vps can remain on the
1595				 * worklist too with no dirty blocks, but
1596				 * since sync_fsync() moves them to a different
1597				 * slot we are safe.
1598				 */
1599				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
1600				    !vn_isdisk(vp, NULL))
1601					panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
1602				/*
1603				 * Put us back on the worklist.  The worklist
1604				 * routine will remove us from our current
1605				 * position and then add us back in at a later
1606				 * position.
1607				 */
1608				VI_LOCK(vp);
1609				vn_syncer_add_to_worklist(vp, syncdelay);
1610				VI_UNLOCK(vp);
1611			}
1612			splx(s);
1613		}
1614
1615		/*
1616		 * Do soft update processing.
1617		 */
1618		if (softdep_process_worklist_hook != NULL)
1619			(*softdep_process_worklist_hook)(NULL);
1620
1621		/*
1622		 * The variable rushjob allows the kernel to speed up the
1623		 * processing of the filesystem syncer process. A rushjob
1624		 * value of N tells the filesystem syncer to process the next
1625		 * N seconds worth of work on its queue ASAP. Currently rushjob
1626		 * is used by the soft update code to speed up the filesystem
1627		 * syncer process when the incore state is getting so far
1628		 * ahead of the disk that the kernel memory pool is being
1629		 * threatened with exhaustion.
1630		 */
1631		if (rushjob > 0) {
1632			rushjob -= 1;
1633			continue;
1634		}
1635		/*
1636		 * If it has taken us less than a second to process the
1637		 * current work, then wait. Otherwise start right over
1638		 * again. We can still lose time if any single round
1639		 * takes more than two seconds, but it does not really
1640		 * matter as we are just trying to generally pace the
1641		 * filesystem activity.
1642		 */
1643		if (time_second == starttime)
1644			tsleep(&lbolt, PPAUSE, "syncer", 0);
1645	}
1646}
1647
1648/*
1649 * Request the syncer daemon to speed up its work.
1650 * We never push it to speed up more than half of its
1651 * normal turn time; otherwise it could take over the CPU.
1652 * XXXKSE  only one update?
1653 */
1654int
1655speedup_syncer()
1656{
1657	struct thread *td;
1658
1659	td = FIRST_THREAD_IN_PROC(updateproc);
1660	mtx_lock_spin(&sched_lock);
1661	if (td->td_wchan == &lbolt) { /* XXXKSE */
1662		unsleep(td);
1663		TD_CLR_SLEEPING(td);
1664		setrunnable(td);
	}
1665	mtx_unlock_spin(&sched_lock);
1666	if (rushjob < syncdelay / 2) {
1667		rushjob += 1;
1668		stat_rush_requests += 1;
1669		return (1);
1670	}
1671	return(0);
1672}
1673
1674/*
1675 * Associate a p-buffer with a vnode.
1676 *
1677 * Also sets B_PAGING flag to indicate that vnode is not fully associated
1678 * with the buffer.  i.e. the bp has not been linked into the vnode or
1679 * ref-counted.
1680 */
1681void
1682pbgetvp(vp, bp)
1683	register struct vnode *vp;
1684	register struct buf *bp;
1685{
1686
1687	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1688
1689	bp->b_vp = vp;
1690	bp->b_flags |= B_PAGING;
1691	bp->b_dev = vn_todev(vp);
1692}
1693
1694/*
1695 * Disassociate a p-buffer from a vnode.
1696 */
1697void
1698pbrelvp(bp)
1699	register struct buf *bp;
1700{
1701
1702	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1703
1704	/* XXX REMOVE ME */
1705	if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
1706		panic(
1707		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1708		    bp,
1709		    (int)bp->b_flags
1710		);
1711	}
1712	bp->b_vp = (struct vnode *) 0;
1713	bp->b_flags &= ~B_PAGING;
1714}
1715
1716/*
1717 * Reassign a buffer from one vnode to another.
1718 * Used to assign file specific control information
1719 * (indirect blocks) to the vnode to which they belong.
1720 */
1721void
1722reassignbuf(bp, newvp)
1723	register struct buf *bp;
1724	register struct vnode *newvp;
1725{
1726	int delay;
1727	int s;
1728
1729	if (newvp == NULL) {
1730		printf("reassignbuf: NULL");
1731		return;
1732	}
1733	++reassignbufcalls;
1734
1735	/*
1736	 * B_PAGING flagged buffers cannot be reassigned because their vp
1737	 * is not fully linked in.
1738	 */
1739	if (bp->b_flags & B_PAGING)
1740		panic("cannot reassign paging buffer");
1741
1742	s = splbio();
1743	/*
1744	 * Delete from old vnode list, if on one.
1745	 */
1746	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
1747		buf_vlist_remove(bp);
1748		if (bp->b_vp != newvp) {
1749			vdrop(bp->b_vp);
1750			bp->b_vp = NULL;	/* for clarification */
1751		}
1752	}
1753	/*
1754	 * If dirty, put on list of dirty buffers; otherwise insert onto list
1755	 * of clean buffers.
1756	 */
1757	if (bp->b_flags & B_DELWRI) {
1758		VI_LOCK(newvp);
1759		if ((newvp->v_iflag & VI_ONWORKLST) == 0) {
1760			switch (newvp->v_type) {
1761			case VDIR:
1762				delay = dirdelay;
1763				break;
1764			case VCHR:
1765				if (newvp->v_rdev->si_mountpoint != NULL) {
1766					delay = metadelay;
1767					break;
1768				}
1769				/* FALLTHROUGH */
1770			default:
1771				delay = filedelay;
1772			}
1773			vn_syncer_add_to_worklist(newvp, delay);
1774		}
1775		VI_UNLOCK(newvp);
1776		buf_vlist_add(bp, newvp, BX_VNDIRTY);
1777	} else {
1778		buf_vlist_add(bp, newvp, BX_VNCLEAN);
1779
1780		VI_LOCK(newvp);
1781		if ((newvp->v_iflag & VI_ONWORKLST) &&
1782		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1783			newvp->v_iflag &= ~VI_ONWORKLST;
1784			LIST_REMOVE(newvp, v_synclist);
1785		}
1786		VI_UNLOCK(newvp);
1787	}
1788	if (bp->b_vp != newvp) {
1789		bp->b_vp = newvp;
1790		vhold(bp->b_vp);
1791	}
1792	splx(s);
1793}
1794
1795/*
1796 * Create a vnode for a device.
1797 * Used for mounting the root filesystem.
1798 */
1799int
1800bdevvp(dev, vpp)
1801	dev_t dev;
1802	struct vnode **vpp;
1803{
1804	register struct vnode *vp;
1805	struct vnode *nvp;
1806	int error;
1807
1808	if (dev == NODEV) {
1809		*vpp = NULLVP;
1810		return (ENXIO);
1811	}
1812	if (vfinddev(dev, VCHR, vpp))
1813		return (0);
1814	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
1815	if (error) {
1816		*vpp = NULLVP;
1817		return (error);
1818	}
1819	vp = nvp;
1820	vp->v_type = VCHR;
1821	addalias(vp, dev);
1822	*vpp = vp;
1823	return (0);
1824}
1825
1826/*
1827 * Add vnode to the alias list hung off the dev_t.
1828 *
1829 * The reason for this gunk is that multiple vnodes can reference
1830 * the same physical device, so checking vp->v_usecount to see
1831 * how many users there are is inadequate; the v_usecount for
1832 * the vnodes need to be accumulated.  vcount() does that.
1833 */
1834struct vnode *
1835addaliasu(nvp, nvp_rdev)
1836	struct vnode *nvp;
1837	udev_t nvp_rdev;
1838{
1839	struct vnode *ovp;
1840	vop_t **ops;
1841	dev_t dev;
1842
1843	if (nvp->v_type == VBLK)
1844		return (nvp);
1845	if (nvp->v_type != VCHR)
1846		panic("addaliasu on non-special vnode");
1847	dev = udev2dev(nvp_rdev, 0);
1848	/*
1849	 * Check to see if we have a bdevvp vnode with no associated
1850	 * filesystem. If so, we want to associate the filesystem of
1851	 * the new vnode with the bdevvp vnode and
1852	 * discard the newly created vnode rather than leaving the
1853	 * bdevvp vnode lying around with no associated filesystem.
1854	 */
1855	if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
1856		addalias(nvp, dev);
1857		return (nvp);
1858	}
1859	/*
1860	 * Discard unneeded vnode, but save its node specific data.
1861	 * Note that if there is a lock, it is carried over in the
1862	 * node specific data to the replacement vnode.
1863	 */
1864	vref(ovp);
1865	ovp->v_data = nvp->v_data;
1866	ovp->v_tag = nvp->v_tag;
1867	nvp->v_data = NULL;
1868	lockinit(&ovp->v_lock, PVFS, nvp->v_lock.lk_wmesg,
1869	    nvp->v_lock.lk_timo, nvp->v_lock.lk_flags & LK_EXTFLG_MASK);
1870	if (nvp->v_vnlock)
1871		ovp->v_vnlock = &ovp->v_lock;
1872	ops = ovp->v_op;
1873	ovp->v_op = nvp->v_op;
1874	if (VOP_ISLOCKED(nvp, curthread)) {
1875		VOP_UNLOCK(nvp, 0, curthread);
1876		vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curthread);
1877	}
1878	nvp->v_op = ops;
1879	insmntque(ovp, nvp->v_mount);
1880	vrele(nvp);
1881	vgone(nvp);
1882	return (ovp);
1883}
1884
1885/* This is a local helper function that does the same as addaliasu, but for a
1886 * dev_t instead of a udev_t. */
1887static void
1888addalias(nvp, dev)
1889	struct vnode *nvp;
1890	dev_t dev;
1891{
1892
1893	KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
1894	nvp->v_rdev = dev;
1895	mtx_lock(&spechash_mtx);
1896	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
1897	mtx_unlock(&spechash_mtx);
1898}
1899
1900/*
1901 * Grab a particular vnode from the free list, increment its
1902 * reference count and lock it. The vnode lock bit is set if the
1903 * vnode is being eliminated in vgone. The process is awakened
1904 * when the transition is completed, and an error returned to
1905 * indicate that the vnode is no longer usable (possibly having
1906 * been changed to a new filesystem type).
1907 */
1908int
1909vget(vp, flags, td)
1910	register struct vnode *vp;
1911	int flags;
1912	struct thread *td;
1913{
1914	int error;
1915
1916	/*
1917	 * If the vnode is in the process of being cleaned out for
1918	 * another use, we wait for the cleaning to finish and then
1919	 * return failure. Cleaning is determined by checking that
1920	 * the VI_XLOCK flag is set.
1921	 */
1922	if ((flags & LK_INTERLOCK) == 0)
1923		VI_LOCK(vp);
1924	if (vp->v_iflag & VI_XLOCK && vp->v_vxproc != curthread) {
1925		vp->v_iflag |= VI_XWANT;
1926		msleep(vp, VI_MTX(vp), PINOD | PDROP, "vget", 0);
1927		return (ENOENT);
1928	}
1929
1930	vp->v_usecount++;
1931
1932	if (VSHOULDBUSY(vp))
1933		vbusy(vp);
1934	if (flags & LK_TYPE_MASK) {
1935		if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) {
1936			/*
1937			 * must expand vrele here because we do not want
1938			 * to call VOP_INACTIVE if the reference count
1939			 * drops back to zero since it was never really
1940			 * active. We must remove it from the free list
1941			 * before sleeping so that multiple processes do
1942			 * not try to recycle it.
1943			 */
1944			VI_LOCK(vp);
1945			vp->v_usecount--;
1946			if (VSHOULDFREE(vp))
1947				vfree(vp);
1948			else
1949				vlruvp(vp);
1950			VI_UNLOCK(vp);
1951		}
1952		return (error);
1953	}
1954	VI_UNLOCK(vp);
1955	return (0);
1956}
1957
1958/*
1959 * Increase the reference count of a vnode.
1960 */
1961void
1962vref(struct vnode *vp)
1963{
1964	mtx_lock(&vp->v_interlock);
1965	vp->v_usecount++;
1966	mtx_unlock(&vp->v_interlock);
1967}
1968
1969/*
1970 * Vnode put/release.
1971 * If count drops to zero, call inactive routine and return to freelist.
1972 */
1973void
1974vrele(vp)
1975	struct vnode *vp;
1976{
1977	struct thread *td = curthread;	/* XXX */
1978
1979	KASSERT(vp != NULL, ("vrele: null vp"));
1980
1981	VI_LOCK(vp);
1982
1983	/* Skip this v_writecount check if we're going to panic below. */
1984	KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
1985	    ("vrele: missed vn_close"));
1986
1987	if (vp->v_usecount > 1) {
1988
1989		vp->v_usecount--;
1990		VI_UNLOCK(vp);
1991
1992		return;
1993	}
1994
1995	if (vp->v_usecount == 1) {
1996		vp->v_usecount--;
1997		/*
1998		 * We must call VOP_INACTIVE with the node locked.
1999		 * If we are doing a vput, the node is already locked,
2000		 * but, in the case of vrele, we must explicitly lock
2001		 * the vnode before calling VOP_INACTIVE.
2002		 */
2003		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0)
2004			VOP_INACTIVE(vp, td);
2005		VI_LOCK(vp);
2006		if (VSHOULDFREE(vp))
2007			vfree(vp);
2008		else
2009			vlruvp(vp);
2010		VI_UNLOCK(vp);
2011
2012	} else {
2013#ifdef DIAGNOSTIC
2014		vprint("vrele: negative ref count", vp);
2015		VI_UNLOCK(vp);
2016#endif
2017		panic("vrele: negative ref cnt");
2018	}
2019}
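
/*
 * Example usage (illustrative sketch): a hypothetical cache that keeps
 * a long-lived vnode pointer.  Every stored pointer is accounted for
 * with vref() and given back with vrele(); the structure name is made
 * up for the example.
 */
struct example_vcache {
	struct vnode *evc_vp;
};

static void
example_vcache_store(struct example_vcache *evc, struct vnode *vp)
{

	vref(vp);		/* account for the pointer we are saving */
	evc->evc_vp = vp;
}

static void
example_vcache_drop(struct example_vcache *evc)
{

	vrele(evc->evc_vp);	/* may trigger VOP_INACTIVE if last ref */
	evc->evc_vp = NULL;
}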
2020
2021/*
2022 * Release an already locked vnode.  This gives the same effect as
2023 * unlock+vrele(), but takes less time and avoids releasing and
2024 * re-acquiring the lock (as vrele() acquires the lock internally).
2025 */
2026void
2027vput(vp)
2028	struct vnode *vp;
2029{
2030	struct thread *td = curthread;	/* XXX */
2031
2032	GIANT_REQUIRED;
2033
2034	KASSERT(vp != NULL, ("vput: null vp"));
2035	mtx_lock(&vp->v_interlock);
2036	/* Skip this v_writecount check if we're going to panic below. */
2037	KASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1,
2038	    ("vput: missed vn_close"));
2039
2040	if (vp->v_usecount > 1) {
2041		vp->v_usecount--;
2042		VOP_UNLOCK(vp, LK_INTERLOCK, td);
2043		return;
2044	}
2045
2046	if (vp->v_usecount == 1) {
2047		vp->v_usecount--;
2048		/*
2049		 * We must call VOP_INACTIVE with the node locked.
2050		 * If we are doing a vput, the node is already locked,
2051		 * so we just need to release the vnode mutex.
2052		 */
2053		VI_UNLOCK(vp);
2054		VOP_INACTIVE(vp, td);
2055		VI_LOCK(vp);
2056		if (VSHOULDFREE(vp))
2057			vfree(vp);
2058		else
2059			vlruvp(vp);
2060		VI_UNLOCK(vp);
2061
2062	} else {
2063#ifdef DIAGNOSTIC
2064		vprint("vput: negative ref count", vp);
2065#endif
2066		panic("vput: negative ref cnt");
2067	}
2068}
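
/*
 * Example usage (illustrative sketch): releasing a vnode that was
 * handed back locked, e.g. by a LOCKLEAF lookup.  vput() is the
 * preferred form of the equivalent two-call sequence shown in the
 * comment below.
 */
static void
example_release_locked(struct vnode *vp, struct thread *td)
{

	/* Equivalent to: VOP_UNLOCK(vp, 0, td); vrele(vp); */
	vput(vp);
}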
2069
2070/*
2071 * Somebody doesn't want the vnode recycled.
2072 */
2073void
2074vhold(vp)
2075	register struct vnode *vp;
2076{
2077	int s;
2078
2079	s = splbio();
2080	vp->v_holdcnt++;
2081	VI_LOCK(vp);
2082	if (VSHOULDBUSY(vp))
2083		vbusy(vp);
2084	VI_UNLOCK(vp);
2085	splx(s);
2086}
2087
2088/*
2089 * Note that there is one less holder who cares about this vnode.  vdrop()
2090 * is the opposite of vhold().
2091 */
2092void
2093vdrop(vp)
2094	register struct vnode *vp;
2095{
2096	int s;
2097
2098	s = splbio();
2099	if (vp->v_holdcnt <= 0)
2100		panic("vdrop: holdcnt");
2101	vp->v_holdcnt--;
2102	VI_LOCK(vp);
2103	if (VSHOULDFREE(vp))
2104		vfree(vp);
2105	else
2106		vlruvp(vp);
2107	VI_UNLOCK(vp);
2108	splx(s);
2109}
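
/*
 * Example usage (illustrative sketch): pinning a vnode while some
 * auxiliary structure points at it.  A hold keeps the vnode from being
 * reused even if its use count drops to zero, much as the buffer cache
 * holds the vnode behind each buffer's b_vp.  The structure here is
 * hypothetical.
 */
struct example_pin {
	struct vnode *ep_vp;
};

static void
example_pin_vnode(struct example_pin *ep, struct vnode *vp)
{

	vhold(vp);		/* keep vp off the reuse path */
	ep->ep_vp = vp;
}

static void
example_unpin_vnode(struct example_pin *ep)
{

	vdrop(ep->ep_vp);	/* allow the vnode to be recycled again */
	ep->ep_vp = NULL;
}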
2110
2111/*
2112 * Remove any vnodes in the vnode table belonging to mount point mp.
2113 *
2114 * If FORCECLOSE is not specified, there should not be any active ones,
2115 * return error if any are found (nb: this is a user error, not a
2116 * system error). If FORCECLOSE is specified, detach any active vnodes
2117 * that are found.
2118 *
2119 * If WRITECLOSE is set, only flush out regular file vnodes open for
2120 * writing.
2121 *
2122 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2123 *
2124 * `rootrefs' specifies the base reference count for the root vnode
2125 * of this filesystem. The root vnode is considered busy if its
2126 * v_usecount exceeds this value. On a successful return, vflush()
2127 * will call vrele() on the root vnode exactly rootrefs times.
2128 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2129 * be zero.
2130 */
2131#ifdef DIAGNOSTIC
2132static int busyprt = 0;		/* print out busy vnodes */
2133SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
2134#endif
2135
2136int
2137vflush(mp, rootrefs, flags)
2138	struct mount *mp;
2139	int rootrefs;
2140	int flags;
2141{
2142	struct thread *td = curthread;	/* XXX */
2143	struct vnode *vp, *nvp, *rootvp = NULL;
2144	struct vattr vattr;
2145	int busy = 0, error;
2146
2147	if (rootrefs > 0) {
2148		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2149		    ("vflush: bad args"));
2150		/*
2151		 * Get the filesystem root vnode. We can vput() it
2152		 * immediately, since with rootrefs > 0, it won't go away.
2153		 */
2154		if ((error = VFS_ROOT(mp, &rootvp)) != 0)
2155			return (error);
2156		vput(rootvp);
2157
2158	}
2159	mtx_lock(&mntvnode_mtx);
2160loop:
2161	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp; vp = nvp) {
2162		/*
2163		 * Make sure this vnode wasn't reclaimed in getnewvnode().
2164		 * Start over if it has been (it won't be on the list anymore).
2165		 */
2166		if (vp->v_mount != mp)
2167			goto loop;
2168		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
2169
2170		mtx_unlock(&mntvnode_mtx);
2171		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
2172		/*
2173		 * Skip over any vnodes marked VV_SYSTEM.
2174		 */
2175		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2176			VOP_UNLOCK(vp, 0, td);
2177			mtx_lock(&mntvnode_mtx);
2178			continue;
2179		}
2180		/*
2181		 * If WRITECLOSE is set, flush out unlinked but still open
2182		 * files (even if open only for reading) and regular file
2183		 * vnodes open for writing.
2184		 */
2185		error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
2186		VI_LOCK(vp);
2187
2188		if ((flags & WRITECLOSE) &&
2189		    (vp->v_type == VNON ||
2190		    (error == 0 && vattr.va_nlink > 0)) &&
2191		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
2192			VOP_UNLOCK(vp, LK_INTERLOCK, td);
2193			mtx_lock(&mntvnode_mtx);
2194			continue;
2195		}
2196
2197		VOP_UNLOCK(vp, 0, td);
2198
2199		/*
2200		 * With v_usecount == 0, all we need to do is clear out the
2201		 * vnode data structures and we are done.
2202		 */
2203		if (vp->v_usecount == 0) {
2204			vgonel(vp, td);
2205			mtx_lock(&mntvnode_mtx);
2206			continue;
2207		}
2208
2209		/*
2210		 * If FORCECLOSE is set, forcibly close the vnode. For block
2211		 * or character devices, revert to an anonymous device. For
2212		 * all other files, just kill them.
2213		 */
2214		if (flags & FORCECLOSE) {
2215			if (vp->v_type != VCHR) {
2216				vgonel(vp, td);
2217			} else {
2218				vclean(vp, 0, td);
2219				VI_UNLOCK(vp);
2220				vp->v_op = spec_vnodeop_p;
2221				insmntque(vp, (struct mount *) 0);
2222			}
2223			mtx_lock(&mntvnode_mtx);
2224			continue;
2225		}
2226#ifdef DIAGNOSTIC
2227		if (busyprt)
2228			vprint("vflush: busy vnode", vp);
2229#endif
2230		VI_UNLOCK(vp);
2231		mtx_lock(&mntvnode_mtx);
2232		busy++;
2233	}
2234	mtx_unlock(&mntvnode_mtx);
2235	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2236		/*
2237		 * If just the root vnode is busy, and if its refcount
2238		 * is equal to `rootrefs', then go ahead and kill it.
2239		 */
2240		mtx_lock(&rootvp->v_interlock);
2241		KASSERT(busy > 0, ("vflush: not busy"));
2242		KASSERT(rootvp->v_usecount >= rootrefs, ("vflush: rootrefs"));
2243		if (busy == 1 && rootvp->v_usecount == rootrefs) {
2244			vgonel(rootvp, td);
2245			busy = 0;
2246		} else
2247			mtx_unlock(&rootvp->v_interlock);
2248	}
2249	if (busy)
2250		return (EBUSY);
2251	for (; rootrefs > 0; rootrefs--)
2252		vrele(rootvp);
2253	return (0);
2254}
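
/*
 * Example usage (illustrative sketch): an unmount-time flush routine
 * that holds one reference on the root vnode and therefore passes
 * rootrefs == 1.  On success that reference is consumed by vflush()
 * itself; on failure (EBUSY) it is still ours to release.
 */
static int
example_flushfiles(struct mount *mp, int flags, struct thread *td)
{
	struct vnode *rootvp;
	int error;

	if ((error = VFS_ROOT(mp, &rootvp)) != 0)
		return (error);
	VOP_UNLOCK(rootvp, 0, td);		/* keep only the reference */
	error = vflush(mp, 1, flags);
	if (error != 0)
		vrele(rootvp);			/* give back our reference */
	return (error);
}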
2255
2256/*
2257 * This moves a now (likely recyclable) vnode to the end of the
2258 * mountlist.  XXX However, it is temporarily disabled until we
2259 * can clean up ffs_sync() and friends, which have loop restart
2260 * conditions that this code causes to operate in O(N^2) time.
2261 */
2262static void
2263vlruvp(struct vnode *vp)
2264{
2265#if 0
2266	struct mount *mp;
2267
2268	if ((mp = vp->v_mount) != NULL) {
2269		mtx_lock(&mntvnode_mtx);
2270		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
2271		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
2272		mtx_unlock(&mntvnode_mtx);
2273	}
2274#endif
2275}
2276
2277/*
2278 * Disassociate the underlying filesystem from a vnode.
2279 */
2280static void
2281vclean(vp, flags, td)
2282	struct vnode *vp;
2283	int flags;
2284	struct thread *td;
2285{
2286	int active;
2287
2288	ASSERT_VI_LOCKED(vp);
2289	/*
2290	 * Check to see if the vnode is in use. If so we have to reference it
2291	 * before we clean it out so that its count cannot fall to zero and
2292	 * generate a race against ourselves to recycle it.
2293	 */
2294	if ((active = vp->v_usecount))
2295		vp->v_usecount++;
2296
2297	/*
2298	 * Prevent the vnode from being recycled or brought into use while we
2299	 * clean it out.
2300	 */
2301	if (vp->v_iflag & VI_XLOCK)
2302		panic("vclean: deadlock");
2303	vp->v_iflag |= VI_XLOCK;
2304	vp->v_vxproc = curthread;
2305	/*
2306	 * Even if the count is zero, the VOP_INACTIVE routine may still
2307	 * have the object locked while it cleans it out. The VOP_LOCK
2308	 * ensures that the VOP_INACTIVE routine is done with its work.
2309	 * For active vnodes, it ensures that no other activity can
2310	 * occur while the underlying object is being cleaned out.
2311	 */
2312	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
2313
2314	/*
2315	 * Clean out any buffers associated with the vnode.
2316	 * If the flush fails, just toss the buffers.
2317	 */
2318	if (flags & DOCLOSE) {
2319		if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL)
2320			(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
2321		if (vinvalbuf(vp, V_SAVE, NOCRED, td, 0, 0) != 0)
2322			vinvalbuf(vp, 0, NOCRED, td, 0, 0);
2323	}
2324
2325	VOP_DESTROYVOBJECT(vp);
2326
2327	/*
2328	 * Any other processes trying to obtain this lock must first
2329	 * wait for VI_XLOCK to clear, then call the new lock operation.
2330	 */
2331	VOP_UNLOCK(vp, 0, td);
2332
2333	/*
2334	 * If purging an active vnode, it must be closed and
2335	 * deactivated before being reclaimed. Note that the
2336	 * VOP_INACTIVE will unlock the vnode.
2337	 */
2338	if (active) {
2339		if (flags & DOCLOSE)
2340			VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2341		if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
2342			panic("vclean: cannot relock.");
2343		VOP_INACTIVE(vp, td);
2344	}
2345
2346	/*
2347	 * Reclaim the vnode.
2348	 */
2349	if (VOP_RECLAIM(vp, td))
2350		panic("vclean: cannot reclaim");
2351
2352	if (active) {
2353		/*
2354		 * Inline copy of vrele() since VOP_INACTIVE
2355		 * has already been called.
2356		 */
2357		VI_LOCK(vp);
2358		if (--vp->v_usecount <= 0) {
2359#ifdef DIAGNOSTIC
2360			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
2361				vprint("vclean: bad ref count", vp);
2362				panic("vclean: ref cnt");
2363			}
2364#endif
2365			vfree(vp);
2366		}
2367		VI_UNLOCK(vp);
2368	}
2369
2370	cache_purge(vp);
2371	vp->v_vnlock = NULL;
2372	lockdestroy(&vp->v_lock);
2373
2374	VI_LOCK(vp);
2375	if (VSHOULDFREE(vp))
2376		vfree(vp);
2377
2378	/*
2379	 * Done with purge, notify sleepers of the grim news.
2380	 */
2381	vp->v_op = dead_vnodeop_p;
2382	if (vp->v_pollinfo != NULL)
2383		vn_pollgone(vp);
2384	vp->v_tag = VT_NON;
2385	vp->v_iflag &= ~VI_XLOCK;
2386	vp->v_vxproc = NULL;
2387	if (vp->v_iflag & VI_XWANT) {
2388		vp->v_iflag &= ~VI_XWANT;
2389		wakeup(vp);
2390	}
2391}
2392
2393/*
2394 * Eliminate all activity associated with the requested vnode
2395 * and with all vnodes aliased to the requested vnode.
2396 */
2397int
2398vop_revoke(ap)
2399	struct vop_revoke_args /* {
2400		struct vnode *a_vp;
2401		int a_flags;
2402	} */ *ap;
2403{
2404	struct vnode *vp, *vq;
2405	dev_t dev;
2406
2407	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
2408
2409	vp = ap->a_vp;
2410	VI_LOCK(vp);
2411	/*
2412	 * If a vgone (or vclean) is already in progress,
2413	 * wait until it is done and return.
2414	 */
2415	if (vp->v_iflag & VI_XLOCK) {
2416		vp->v_iflag |= VI_XWANT;
2417		msleep(vp, VI_MTX(vp), PINOD | PDROP,
2418		    "vop_revokeall", 0);
2419		return (0);
2420	}
2421	VI_UNLOCK(vp);
2422	dev = vp->v_rdev;
2423	for (;;) {
2424		mtx_lock(&spechash_mtx);
2425		vq = SLIST_FIRST(&dev->si_hlist);
2426		mtx_unlock(&spechash_mtx);
2427		if (!vq)
2428			break;
2429		vgone(vq);
2430	}
2431	return (0);
2432}
2433
2434/*
2435 * Recycle an unused vnode to the front of the free list.
2436 * Release the passed interlock if the vnode will be recycled.
2437 */
2438int
2439vrecycle(vp, inter_lkp, td)
2440	struct vnode *vp;
2441	struct mtx *inter_lkp;
2442	struct thread *td;
2443{
2444
2445	mtx_lock(&vp->v_interlock);
2446	if (vp->v_usecount == 0) {
2447		if (inter_lkp) {
2448			mtx_unlock(inter_lkp);
2449		}
2450		vgonel(vp, td);
2451		return (1);
2452	}
2453	mtx_unlock(&vp->v_interlock);
2454	return (0);
2455}
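
/*
 * Example usage (illustrative sketch): a filesystem's VOP_INACTIVE
 * routine may recycle the vnode right away once it knows the backing
 * object is gone (say, the link count reached zero).  The predicate
 * argument stands in for that filesystem-specific test.
 */
static int
example_inactive(struct vnode *vp, struct thread *td, int object_is_dead)
{

	VOP_UNLOCK(vp, 0, td);
	if (object_is_dead)
		(void) vrecycle(vp, NULL, td);	/* vgone() it if unused */
	return (0);
}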
2456
2457/*
2458 * Eliminate all activity associated with a vnode
2459 * in preparation for reuse.
2460 */
2461void
2462vgone(vp)
2463	register struct vnode *vp;
2464{
2465	struct thread *td = curthread;	/* XXX */
2466
2467	VI_LOCK(vp);
2468	vgonel(vp, td);
2469}
2470
2471/*
2472 * vgone, with the vp interlock held.
2473 */
2474void
2475vgonel(vp, td)
2476	struct vnode *vp;
2477	struct thread *td;
2478{
2479	int s;
2480
2481	/*
2482	 * If a vgone (or vclean) is already in progress,
2483	 * wait until it is done and return.
2484	 */
2485	ASSERT_VI_LOCKED(vp);
2486	if (vp->v_iflag & VI_XLOCK) {
2487		vp->v_iflag |= VI_XWANT;
2488		msleep(vp, VI_MTX(vp), PINOD | PDROP, "vgone", 0);
2489		return;
2490	}
2491
2492	/*
2493	 * Clean out the filesystem specific data.
2494	 */
2495	vclean(vp, DOCLOSE, td);
2496	VI_UNLOCK(vp);
2497
2498	/*
2499	 * Delete from old mount point vnode list, if on one.
2500	 */
2501	if (vp->v_mount != NULL)
2502		insmntque(vp, (struct mount *)0);
2503	/*
2504	 * If special device, remove it from special device alias list
2505	 * if it is on one.
2506	 */
2507	if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) {
2508		mtx_lock(&spechash_mtx);
2509		SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
2510		freedev(vp->v_rdev);
2511		mtx_unlock(&spechash_mtx);
2512		vp->v_rdev = NULL;
2513	}
2514
2515	/*
2516	 * If it is on the freelist and not already at the head,
2517	 * move it to the head of the list. The test of the
2518	 * VI_DOOMED flag and the reference count of zero is because
2519	 * it will be removed from the free list by getnewvnode,
2520	 * but will not have its reference count incremented until
2521	 * after calling vgone. If the reference count were
2522	 * incremented first, vgone would (incorrectly) try to
2523	 * close the previous instance of the underlying object.
2524	 */
2525	VI_LOCK(vp);
2526	if (vp->v_usecount == 0 && !(vp->v_iflag & VI_DOOMED)) {
2527		s = splbio();
2528		mtx_lock(&vnode_free_list_mtx);
2529		if (vp->v_iflag & VI_FREE) {
2530			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2531		} else {
2532			vp->v_iflag |= VI_FREE;
2533			freevnodes++;
2534		}
2535		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2536		mtx_unlock(&vnode_free_list_mtx);
2537		splx(s);
2538	}
2539
2540	vp->v_type = VBAD;
2541	VI_UNLOCK(vp);
2542}
2543
2544/*
2545 * Lookup a vnode by device number.
2546 */
2547int
2548vfinddev(dev, type, vpp)
2549	dev_t dev;
2550	enum vtype type;
2551	struct vnode **vpp;
2552{
2553	struct vnode *vp;
2554
2555	mtx_lock(&spechash_mtx);
2556	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
2557		if (type == vp->v_type) {
2558			*vpp = vp;
2559			mtx_unlock(&spechash_mtx);
2560			return (1);
2561		}
2562	}
2563	mtx_unlock(&spechash_mtx);
2564	return (0);
2565}
2566
2567/*
2568 * Calculate the total number of references to a special device.
2569 */
2570int
2571vcount(vp)
2572	struct vnode *vp;
2573{
2574	struct vnode *vq;
2575	int count;
2576
2577	count = 0;
2578	mtx_lock(&spechash_mtx);
2579	SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext)
2580		count += vq->v_usecount;
2581	mtx_unlock(&spechash_mtx);
2582	return (count);
2583}
2584
2585/*
2586 * Same as above, but using the dev_t as argument
2587 */
2588int
2589count_dev(dev)
2590	dev_t dev;
2591{
2592	struct vnode *vp;
2593
2594	vp = SLIST_FIRST(&dev->si_hlist);
2595	if (vp == NULL)
2596		return (0);
2597	return(vcount(vp));
2598}
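
/*
 * Example usage (illustrative sketch): a driver-style check that
 * refuses to proceed while any vnode aliased to the device is still
 * open.  vfinddev() finds one alias; vcount() sums the use counts of
 * all of them.
 */
static int
example_dev_busy(dev_t dev)
{
	struct vnode *vp;

	if (vfinddev(dev, VCHR, &vp) && vcount(vp) > 0)
		return (EBUSY);
	return (0);
}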
2599
2600/*
2601 * Print out a description of a vnode.
2602 */
2603static char *typename[] =
2604{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
2605
2606void
2607vprint(label, vp)
2608	char *label;
2609	struct vnode *vp;
2610{
2611	char buf[96];
2612
2613	if (label != NULL)
2614		printf("%s: %p: ", label, (void *)vp);
2615	else
2616		printf("%p: ", (void *)vp);
2617	printf("type %s, usecount %d, writecount %d, refcount %d,",
2618	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
2619	    vp->v_holdcnt);
2620	buf[0] = '\0';
2621	if (vp->v_vflag & VV_ROOT)
2622		strcat(buf, "|VV_ROOT");
2623	if (vp->v_vflag & VV_TEXT)
2624		strcat(buf, "|VV_TEXT");
2625	if (vp->v_vflag & VV_SYSTEM)
2626		strcat(buf, "|VV_SYSTEM");
2627	if (vp->v_iflag & VI_XLOCK)
2628		strcat(buf, "|VI_XLOCK");
2629	if (vp->v_iflag & VI_XWANT)
2630		strcat(buf, "|VI_XWANT");
2631	if (vp->v_iflag & VI_BWAIT)
2632		strcat(buf, "|VI_BWAIT");
2633	if (vp->v_iflag & VI_DOOMED)
2634		strcat(buf, "|VI_DOOMED");
2635	if (vp->v_iflag & VI_FREE)
2636		strcat(buf, "|VI_FREE");
2637	if (vp->v_vflag & VV_OBJBUF)
2638		strcat(buf, "|VV_OBJBUF");
2639	if (buf[0] != '\0')
2640		printf(" flags (%s)", &buf[1]);
2641	if (vp->v_data == NULL) {
2642		printf("\n");
2643	} else {
2644		printf("\n\t");
2645		VOP_PRINT(vp);
2646	}
2647}
2648
2649#ifdef DDB
2650#include <ddb/ddb.h>
2651/*
2652 * List all of the locked vnodes in the system.
2653 * Called when debugging the kernel.
2654 */
2655DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
2656{
2657	struct thread *td = curthread;	/* XXX */
2658	struct mount *mp, *nmp;
2659	struct vnode *vp;
2660
2661	printf("Locked vnodes\n");
2662	mtx_lock(&mountlist_mtx);
2663	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2664		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
2665			nmp = TAILQ_NEXT(mp, mnt_list);
2666			continue;
2667		}
2668		mtx_lock(&mntvnode_mtx);
2669		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2670			if (VOP_ISLOCKED(vp, NULL))
2671				vprint((char *)0, vp);
2672		}
2673		mtx_unlock(&mntvnode_mtx);
2674		mtx_lock(&mountlist_mtx);
2675		nmp = TAILQ_NEXT(mp, mnt_list);
2676		vfs_unbusy(mp, td);
2677	}
2678	mtx_unlock(&mountlist_mtx);
2679}
2680#endif
2681
2682/*
2683 * Fill in a struct xvfsconf based on a struct vfsconf.
2684 */
2685static void
2686vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
2687{
2688
2689	strcpy(xvfsp->vfc_name, vfsp->vfc_name);
2690	xvfsp->vfc_typenum = vfsp->vfc_typenum;
2691	xvfsp->vfc_refcount = vfsp->vfc_refcount;
2692	xvfsp->vfc_flags = vfsp->vfc_flags;
2693	/*
2694	 * These are unused in userland; we keep them
2695	 * so as not to break binary compatibility.
2696	 */
2697	xvfsp->vfc_vfsops = NULL;
2698	xvfsp->vfc_next = NULL;
2699}
2700
2701static int
2702sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
2703{
2704	struct vfsconf *vfsp;
2705	struct xvfsconf *xvfsp;
2706	int cnt, error, i;
2707
2708	cnt = 0;
2709	for (vfsp = vfsconf; vfsp != NULL; vfsp = vfsp->vfc_next)
2710		cnt++;
2711	xvfsp = malloc(sizeof(struct xvfsconf) * cnt, M_TEMP, M_WAITOK);
2712	/*
2713	 * Handle the race that we will have here once struct vfsconf
2714	 * is locked down, by using both cnt and checking vfc_next
2715	 * against NULL to determine the end of the loop.  The race will
2716	 * happen because we will have to unlock before calling malloc().
2717	 * We are protected by Giant for now.
2718	 */
2719	i = 0;
2720	for (vfsp = vfsconf; vfsp != NULL && i < cnt; vfsp = vfsp->vfc_next) {
2721		vfsconf2x(vfsp, xvfsp + i);
2722		i++;
2723	}
2724	error = SYSCTL_OUT(req, xvfsp, sizeof(struct xvfsconf) * i);
2725	free(xvfsp, M_TEMP);
2726	return (error);
2727}
2728
2729SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist,
2730    "S,xvfsconf", "List of all configured filesystems");
2731
2732/*
2733 * Top level filesystem related information gathering.
2734 */
2735static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
2736
2737static int
2738vfs_sysctl(SYSCTL_HANDLER_ARGS)
2739{
2740	int *name = (int *)arg1 - 1;	/* XXX */
2741	u_int namelen = arg2 + 1;	/* XXX */
2742	struct vfsconf *vfsp;
2743	struct xvfsconf xvfsp;
2744
2745	printf("WARNING: userland calling deprecated sysctl, "
2746	    "please rebuild world\n");
2747
2748#if 1 || defined(COMPAT_PRELITE2)
2749	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2750	if (namelen == 1)
2751		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2752#endif
2753
2754	switch (name[1]) {
2755	case VFS_MAXTYPENUM:
2756		if (namelen != 2)
2757			return (ENOTDIR);
2758		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2759	case VFS_CONF:
2760		if (namelen != 3)
2761			return (ENOTDIR);	/* overloaded */
2762		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2763			if (vfsp->vfc_typenum == name[2])
2764				break;
2765		if (vfsp == NULL)
2766			return (EOPNOTSUPP);
2767		vfsconf2x(vfsp, &xvfsp);
2768		return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
2769	}
2770	return (EOPNOTSUPP);
2771}
2772
2773SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, vfs_sysctl,
2774	"Generic filesystem");
2775
2776#if 1 || defined(COMPAT_PRELITE2)
2777
2778static int
2779sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
2780{
2781	int error;
2782	struct vfsconf *vfsp;
2783	struct ovfsconf ovfs;
2784
2785	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
2786		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
2787		strcpy(ovfs.vfc_name, vfsp->vfc_name);
2788		ovfs.vfc_index = vfsp->vfc_typenum;
2789		ovfs.vfc_refcount = vfsp->vfc_refcount;
2790		ovfs.vfc_flags = vfsp->vfc_flags;
2791		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2792		if (error)
2793			return error;
2794	}
2795	return 0;
2796}
2797
2798#endif /* 1 || COMPAT_PRELITE2 */
2799
2800#define KINFO_VNODESLOP		10
2801/*
2802 * Dump vnode list (via sysctl).
2803 */
2804/* ARGSUSED */
2805static int
2806sysctl_vnode(SYSCTL_HANDLER_ARGS)
2807{
2808	struct xvnode *xvn;
2809	struct thread *td = req->td;
2810	struct mount *mp;
2811	struct vnode *vp;
2812	int error, len, n;
2813
2814	/*
2815	 * Stale numvnodes access is not fatal here.
2816	 */
2817	req->lock = 0;
2818	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
2819	if (!req->oldptr)
2820		/* Make an estimate */
2821		return (SYSCTL_OUT(req, 0, len));
2822
2823	sysctl_wire_old_buffer(req, 0);
2824	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
2825	n = 0;
2826	mtx_lock(&mountlist_mtx);
2827	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2828		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
2829			continue;
2830		mtx_lock(&mntvnode_mtx);
2831		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2832			if (n == len)
2833				break;
2834			vref(vp);
2835			xvn[n].xv_size = sizeof *xvn;
2836			xvn[n].xv_vnode = vp;
2837#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
2838			XV_COPY(usecount);
2839			XV_COPY(writecount);
2840			XV_COPY(holdcnt);
2841			XV_COPY(id);
2842			XV_COPY(mount);
2843			XV_COPY(numoutput);
2844			XV_COPY(type);
2845#undef XV_COPY
2846			xvn[n].xv_flag = vp->v_vflag;
2847
2848			switch (vp->v_type) {
2849			case VREG:
2850			case VDIR:
2851			case VLNK:
2852				xvn[n].xv_dev = vp->v_cachedfs;
2853				xvn[n].xv_ino = vp->v_cachedid;
2854				break;
2855			case VBLK:
2856			case VCHR:
2857				if (vp->v_rdev == NULL) {
2858					vrele(vp);
2859					continue;
2860				}
2861				xvn[n].xv_dev = dev2udev(vp->v_rdev);
2862				break;
2863			case VSOCK:
2864				xvn[n].xv_socket = vp->v_socket;
2865				break;
2866			case VFIFO:
2867				xvn[n].xv_fifo = vp->v_fifoinfo;
2868				break;
2869			case VNON:
2870			case VBAD:
2871			default:
2872				/* shouldn't happen? */
2873				vrele(vp);
2874				continue;
2875			}
2876			vrele(vp);
2877			++n;
2878		}
2879		mtx_unlock(&mntvnode_mtx);
2880		mtx_lock(&mountlist_mtx);
2881		vfs_unbusy(mp, td);
2882		if (n == len)
2883			break;
2884	}
2885	mtx_unlock(&mountlist_mtx);
2886
2887	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
2888	free(xvn, M_TEMP);
2889	return (error);
2890}
2891
2892SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
2893	0, 0, sysctl_vnode, "S,xvnode", "");
2894
2895/*
2896 * Check to see if a filesystem is mounted on a block device.
2897 */
2898int
2899vfs_mountedon(vp)
2900	struct vnode *vp;
2901{
2902
2903	if (vp->v_rdev->si_mountpoint != NULL)
2904		return (EBUSY);
2905	return (0);
2906}
2907
2908/*
2909 * Unmount all filesystems. The list is traversed in reverse order
2910 * of mounting to avoid dependencies.
2911 */
2912void
2913vfs_unmountall()
2914{
2915	struct mount *mp;
2916	struct thread *td;
2917	int error;
2918
2919	if (curthread != NULL)
2920		td = curthread;
2921	else
2922		td = FIRST_THREAD_IN_PROC(initproc); /* XXX XXX proc0? */
2923	/*
2924	 * Since this only runs when rebooting, it is not interlocked.
2925	 */
2926	while(!TAILQ_EMPTY(&mountlist)) {
2927		mp = TAILQ_LAST(&mountlist, mntlist);
2928		error = dounmount(mp, MNT_FORCE, td);
2929		if (error) {
2930			TAILQ_REMOVE(&mountlist, mp, mnt_list);
2931			printf("unmount of %s failed (",
2932			    mp->mnt_stat.f_mntonname);
2933			if (error == EBUSY)
2934				printf("BUSY)\n");
2935			else
2936				printf("%d)\n", error);
2937		} else {
2938			/* The unmount has removed mp from the mountlist */
2939		}
2940	}
2941}
2942
2943/*
2944 * Perform msync on all vnodes under a mount point.
2945 * The mount point must be locked.
2946 */
2947void
2948vfs_msync(struct mount *mp, int flags)
2949{
2950	struct vnode *vp, *nvp;
2951	struct vm_object *obj;
2952	int tries;
2953
2954	GIANT_REQUIRED;
2955
2956	tries = 5;
2957	mtx_lock(&mntvnode_mtx);
2958loop:
2959	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nvp) {
2960		if (vp->v_mount != mp) {
2961			if (--tries > 0)
2962				goto loop;
2963			break;
2964		}
2965		nvp = TAILQ_NEXT(vp, v_nmntvnodes);
2966
2967		mp_fixme("What locks do we need here?");
2968		if (vp->v_iflag & VI_XLOCK)	/* XXX: what if MNT_WAIT? */
2969			continue;
2970
2971		if (vp->v_vflag & VV_NOSYNC)	/* unlinked, skip it */
2972			continue;
2973
2974		if ((vp->v_iflag & VI_OBJDIRTY) &&
2975		    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
2976			mtx_unlock(&mntvnode_mtx);
2977			if (!vget(vp,
2978			    LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curthread)) {
2979				if (VOP_GETVOBJECT(vp, &obj) == 0) {
2980					vm_object_page_clean(obj, 0, 0,
2981					    flags == MNT_WAIT ?
2982					    OBJPC_SYNC : OBJPC_NOSYNC);
2983				}
2984				vput(vp);
2985			}
2986			mtx_lock(&mntvnode_mtx);
2987			if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
2988				if (--tries > 0)
2989					goto loop;
2990				break;
2991			}
2992		}
2993	}
2994	mtx_unlock(&mntvnode_mtx);
2995}
2996
2997/*
2998 * Create the VM object needed for VMIO and mmap support.  This
2999 * is done for all VREG files in the system.  Some filesystems might
3000 * take advantage of the additional metadata buffering capability of the
3001 * VMIO code by making the device node VMIO mode as well.
3002 *
3003 * vp must be locked when vfs_object_create is called.
3004 */
3005int
3006vfs_object_create(vp, td, cred)
3007	struct vnode *vp;
3008	struct thread *td;
3009	struct ucred *cred;
3010{
3011	GIANT_REQUIRED;
3012	return (VOP_CREATEVOBJECT(vp, cred, td));
3013}
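
/*
 * Example usage (illustrative sketch): a caller instantiating a
 * regular file would typically attach the VM object right after the
 * vnode is set up and locked, e.g.:
 */
static int
example_setup_vreg(struct vnode *vp, struct thread *td, struct ucred *cred)
{

	/* vp is assumed to be locked, as vfs_object_create() requires. */
	if (vp->v_type == VREG)
		return (vfs_object_create(vp, td, cred));
	return (0);
}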
3014
3015/*
3016 * Mark a vnode as free, putting it up for recycling.
3017 */
3018void
3019vfree(vp)
3020	struct vnode *vp;
3021{
3022	int s;
3023
3024	ASSERT_VI_LOCKED(vp);
3025	s = splbio();
3026	mtx_lock(&vnode_free_list_mtx);
3027	KASSERT((vp->v_iflag & VI_FREE) == 0, ("vnode already free"));
3028	if (vp->v_iflag & VI_AGE) {
3029		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
3030	} else {
3031		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
3032	}
3033	freevnodes++;
3034	mtx_unlock(&vnode_free_list_mtx);
3035	vp->v_iflag &= ~VI_AGE;
3036	vp->v_iflag |= VI_FREE;
3037	splx(s);
3038}
3039
3040/*
3041 * Opposite of vfree() - mark a vnode as in use.
3042 */
3043void
3044vbusy(vp)
3045	struct vnode *vp;
3046{
3047	int s;
3048
3049	s = splbio();
3050	ASSERT_VI_LOCKED(vp);
3051	mtx_lock(&vnode_free_list_mtx);
3052	KASSERT((vp->v_iflag & VI_FREE) != 0, ("vnode not free"));
3053	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
3054	freevnodes--;
3055	mtx_unlock(&vnode_free_list_mtx);
3056	vp->v_iflag &= ~(VI_FREE|VI_AGE);
3057	splx(s);
3058}
3059
3060/*
3061 * Record a process's interest in events which might happen to
3062 * a vnode.  Because poll uses the historic select-style interface
3063 * internally, this routine serves as both the ``check for any
3064 * pending events'' and the ``record my interest in future events''
3065 * functions.  (These are done together, while the lock is held,
3066 * to avoid race conditions.)
3067 */
3068int
3069vn_pollrecord(vp, td, events)
3070	struct vnode *vp;
3071	struct thread *td;
3072	short events;
3073{
3074
3075	if (vp->v_pollinfo == NULL)
3076		v_addpollinfo(vp);
3077	mtx_lock(&vp->v_pollinfo->vpi_lock);
3078	if (vp->v_pollinfo->vpi_revents & events) {
3079		/*
3080		 * This leaves events we are not interested
3081		 * in available for the other process which
3082		 * presumably had requested them
3083		 * (otherwise they would never have been
3084		 * recorded).
3085		 */
3086		events &= vp->v_pollinfo->vpi_revents;
3087		vp->v_pollinfo->vpi_revents &= ~events;
3088
3089		mtx_unlock(&vp->v_pollinfo->vpi_lock);
3090		return events;
3091	}
3092	vp->v_pollinfo->vpi_events |= events;
3093	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3094	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3095	return 0;
3096}
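
/*
 * Example usage (illustrative sketch): a deliberately simplified
 * VOP_POLL implementation that just records the caller's interest and
 * reports whatever requested events were already pending.  Real
 * implementations usually special-case the standard poll events first.
 */
static int
example_poll(struct vop_poll_args *ap)
{

	return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
}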
3097
3098/*
3099 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
3100 * it is possible for us to miss an event due to race conditions, but
3101 * that condition is expected to be rare, so for the moment it is the
3102 * preferred interface.
3103 */
3104void
3105vn_pollevent(vp, events)
3106	struct vnode *vp;
3107	short events;
3108{
3109
3110	if (vp->v_pollinfo == NULL)
3111		v_addpollinfo(vp);
3112	mtx_lock(&vp->v_pollinfo->vpi_lock);
3113	if (vp->v_pollinfo->vpi_events & events) {
3114		/*
3115		 * We clear vpi_events so that we don't
3116		 * call selwakeup() twice if two events are
3117		 * posted before the polling process(es) is
3118		 * awakened.  This also ensures that we take at
3119		 * most one selwakeup() if the polling process
3120		 * is no longer interested.  However, it does
3121		 * mean that only one event can be noticed at
3122		 * a time.  (Perhaps we should only clear those
3123		 * event bits which we note?) XXX
3124		 */
3125		vp->v_pollinfo->vpi_events = 0;	/* &= ~events ??? */
3126		vp->v_pollinfo->vpi_revents |= events;
3127		selwakeup(&vp->v_pollinfo->vpi_selinfo);
3128	}
3129	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3130}
3131
3132/*
3133 * Wake up anyone polling on vp because it is being revoked.
3134 * This depends on dead_poll() returning POLLHUP for correct
3135 * behavior.
3136 */
3137void
3138vn_pollgone(vp)
3139	struct vnode *vp;
3140{
3141
3142	mtx_lock(&vp->v_pollinfo->vpi_lock);
3143	VN_KNOTE(vp, NOTE_REVOKE);
3144	if (vp->v_pollinfo->vpi_events) {
3145		vp->v_pollinfo->vpi_events = 0;
3146		selwakeup(&vp->v_pollinfo->vpi_selinfo);
3147	}
3148	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3149}
3150
3151
3152
3153/*
3154 * Routine to create and manage a filesystem syncer vnode.
3155 */
3156#define sync_close ((int (*)(struct  vop_close_args *))nullop)
3157static int	sync_fsync(struct  vop_fsync_args *);
3158static int	sync_inactive(struct  vop_inactive_args *);
3159static int	sync_reclaim(struct  vop_reclaim_args *);
3160static int	sync_print(struct vop_print_args *);
3161
3162static vop_t **sync_vnodeop_p;
3163static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
3164	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
3165	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
3166	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
3167	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
3168	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
3169	{ &vop_lock_desc,	(vop_t *) vop_stdlock },	/* lock */
3170	{ &vop_unlock_desc,	(vop_t *) vop_stdunlock },	/* unlock */
3171	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
3172	{ &vop_islocked_desc,	(vop_t *) vop_stdislocked },	/* islocked */
3173	{ NULL, NULL }
3174};
3175static struct vnodeopv_desc sync_vnodeop_opv_desc =
3176	{ &sync_vnodeop_p, sync_vnodeop_entries };
3177
3178VNODEOP_SET(sync_vnodeop_opv_desc);
3179
3180/*
3181 * Create a new filesystem syncer vnode for the specified mount point.
3182 */
3183int
3184vfs_allocate_syncvnode(mp)
3185	struct mount *mp;
3186{
3187	struct vnode *vp;
3188	static long start, incr, next;
3189	int error;
3190
3191	/* Allocate a new vnode */
3192	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
3193		mp->mnt_syncer = NULL;
3194		return (error);
3195	}
3196	vp->v_type = VNON;
3197	/*
3198	 * Place the vnode onto the syncer worklist. We attempt to
3199	 * scatter them about on the list so that they will go off
3200	 * at evenly distributed times even if all the filesystems
3201	 * are mounted at once.
3202	 */
3203	next += incr;
3204	if (next == 0 || next > syncer_maxdelay) {
3205		start /= 2;
3206		incr /= 2;
3207		if (start == 0) {
3208			start = syncer_maxdelay / 2;
3209			incr = syncer_maxdelay;
3210		}
3211		next = start;
3212	}
3213	VI_LOCK(vp);
3214	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
3215	VI_UNLOCK(vp);
3216	mp->mnt_syncer = vp;
3217	return (0);
3218}
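
/*
 * Example usage (illustrative sketch): mount-time code conventionally
 * attaches a syncer vnode to every writable filesystem, along these
 * lines (error handling trimmed; the surrounding mount logic is
 * assumed, not shown).
 */
static void
example_attach_syncer(struct mount *mp)
{

	if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL)
		(void) vfs_allocate_syncvnode(mp);
}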
3219
3220/*
3221 * Do a lazy sync of the filesystem.
3222 */
3223static int
3224sync_fsync(ap)
3225	struct vop_fsync_args /* {
3226		struct vnode *a_vp;
3227		struct ucred *a_cred;
3228		int a_waitfor;
3229		struct thread *a_td;
3230	} */ *ap;
3231{
3232	struct vnode *syncvp = ap->a_vp;
3233	struct mount *mp = syncvp->v_mount;
3234	struct thread *td = ap->a_td;
3235	int asyncflag;
3236
3237	/*
3238	 * We only need to do something if this is a lazy evaluation.
3239	 */
3240	if (ap->a_waitfor != MNT_LAZY)
3241		return (0);
3242
3243	/*
3244	 * Move ourselves to the back of the sync list.
3245	 */
3246	VI_LOCK(syncvp);
3247	vn_syncer_add_to_worklist(syncvp, syncdelay);
3248	VI_UNLOCK(syncvp);
3249
3250	/*
3251	 * Walk the list of vnodes pushing all that are dirty and
3252	 * not already on the sync list.
3253	 */
3254	mtx_lock(&mountlist_mtx);
3255	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
3256		mtx_unlock(&mountlist_mtx);
3257		return (0);
3258	}
3259	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3260		vfs_unbusy(mp, td);
3261		return (0);
3262	}
3263	asyncflag = mp->mnt_flag & MNT_ASYNC;
3264	mp->mnt_flag &= ~MNT_ASYNC;
3265	vfs_msync(mp, MNT_NOWAIT);
3266	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, td);
3267	if (asyncflag)
3268		mp->mnt_flag |= MNT_ASYNC;
3269	vn_finished_write(mp);
3270	vfs_unbusy(mp, td);
3271	return (0);
3272}
3273
3274/*
3275 * The syncer vnode is no longer referenced.
3276 */
3277static int
3278sync_inactive(ap)
3279	struct vop_inactive_args /* {
3280		struct vnode *a_vp;
3281		struct thread *a_td;
3282	} */ *ap;
3283{
3284
3285	VOP_UNLOCK(ap->a_vp, 0, ap->a_td);
3286	vgone(ap->a_vp);
3287	return (0);
3288}
3289
3290/*
3291 * The syncer vnode is no longer needed and is being decommissioned.
3292 *
3293 * Modifications to the worklist must be protected at splbio().
3294 */
3295static int
3296sync_reclaim(ap)
3297	struct vop_reclaim_args /* {
3298		struct vnode *a_vp;
3299	} */ *ap;
3300{
3301	struct vnode *vp = ap->a_vp;
3302	int s;
3303
3304	s = splbio();
3305	vp->v_mount->mnt_syncer = NULL;
3306	VI_LOCK(vp);
3307	if (vp->v_iflag & VI_ONWORKLST) {
3308		LIST_REMOVE(vp, v_synclist);
3309		vp->v_iflag &= ~VI_ONWORKLST;
3310	}
3311	VI_UNLOCK(vp);
3312	splx(s);
3313
3314	return (0);
3315}
3316
3317/*
3318 * Print out a syncer vnode.
3319 */
3320static int
3321sync_print(ap)
3322	struct vop_print_args /* {
3323		struct vnode *a_vp;
3324	} */ *ap;
3325{
3326	struct vnode *vp = ap->a_vp;
3327
3328	printf("syncer vnode");
3329	if (vp->v_vnlock != NULL)
3330		lockmgr_printinfo(vp->v_vnlock);
3331	printf("\n");
3332	return (0);
3333}
3334
3335/*
3336 * Extract the dev_t from a VCHR vnode.
3337 */
3338dev_t
3339vn_todev(vp)
3340	struct vnode *vp;
3341{
3342	if (vp->v_type != VCHR)
3343		return (NODEV);
3344	return (vp->v_rdev);
3345}
3346
3347/*
3348 * Check if vnode represents a disk device
3349 */
3350int
3351vn_isdisk(vp, errp)
3352	struct vnode *vp;
3353	int *errp;
3354{
3355	struct cdevsw *cdevsw;
3356
3357	if (vp->v_type != VCHR) {
3358		if (errp != NULL)
3359			*errp = ENOTBLK;
3360		return (0);
3361	}
3362	if (vp->v_rdev == NULL) {
3363		if (errp != NULL)
3364			*errp = ENXIO;
3365		return (0);
3366	}
3367	cdevsw = devsw(vp->v_rdev);
3368	if (cdevsw == NULL) {
3369		if (errp != NULL)
3370			*errp = ENXIO;
3371		return (0);
3372	}
3373	if (!(cdevsw->d_flags & D_DISK)) {
3374		if (errp != NULL)
3375			*errp = ENOTBLK;
3376		return (0);
3377	}
3378	if (errp != NULL)
3379		*errp = 0;
3380	return (1);
3381}
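
/*
 * Example usage (illustrative sketch): a mount routine validating the
 * vnode it was handed as backing store.  vn_isdisk() both classifies
 * the vnode and supplies a specific errno; vn_todev() then yields the
 * underlying dev_t.
 */
static int
example_check_devvp(struct vnode *devvp, dev_t *devp)
{
	int error;

	if (!vn_isdisk(devvp, &error))
		return (error);		/* ENOTBLK or ENXIO */
	*devp = vn_todev(devvp);
	return (0);
}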
3382
3383/*
3384 * Free data allocated by namei(); see namei(9) for details.
3385 */
3386void
3387NDFREE(ndp, flags)
3388     struct nameidata *ndp;
3389     const uint flags;
3390{
3391	if (!(flags & NDF_NO_FREE_PNBUF) &&
3392	    (ndp->ni_cnd.cn_flags & HASBUF)) {
3393		uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
3394		ndp->ni_cnd.cn_flags &= ~HASBUF;
3395	}
3396	if (!(flags & NDF_NO_DVP_UNLOCK) &&
3397	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
3398	    ndp->ni_dvp != ndp->ni_vp)
3399		VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_thread);
3400	if (!(flags & NDF_NO_DVP_RELE) &&
3401	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
3402		vrele(ndp->ni_dvp);
3403		ndp->ni_dvp = NULL;
3404	}
3405	if (!(flags & NDF_NO_VP_UNLOCK) &&
3406	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
3407		VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_thread);
3408	if (!(flags & NDF_NO_VP_RELE) &&
3409	    ndp->ni_vp) {
3410		vrele(ndp->ni_vp);
3411		ndp->ni_vp = NULL;
3412	}
3413	if (!(flags & NDF_NO_STARTDIR_RELE) &&
3414	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
3415		vrele(ndp->ni_startdir);
3416		ndp->ni_startdir = NULL;
3417	}
3418}
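
/*
 * Example usage (illustrative sketch): the usual namei()/NDFREE()
 * pairing.  A LOCKLEAF lookup returns a locked, referenced vnode in
 * ni_vp; the pathname buffer is released with NDFREE() and the vnode
 * with vput().  The helper and its path argument are hypothetical.
 */
static int
example_lookup(char *path, struct thread *td)
{
	struct nameidata nd;
	int error;

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, td);
	if ((error = namei(&nd)) != 0)
		return (error);
	NDFREE(&nd, NDF_ONLY_PNBUF);	/* free only the pathname buffer */
	/* ... examine the locked vnode nd.ni_vp ... */
	vput(nd.ni_vp);
	return (0);
}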
3419
3420/*
3421 * Common filesystem object access control check routine.  Accepts a
3422 * vnode's type, "mode", uid and gid, requested access mode, credentials,
3423 * and optional call-by-reference privused argument allowing vaccess()
3424 * to indicate to the caller whether privilege was used to satisfy the
3425 * request (obsoleted).  Returns 0 on success, or an errno on failure.
3426 */
3427int
3428vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
3429	enum vtype type;
3430	mode_t file_mode;
3431	uid_t file_uid;
3432	gid_t file_gid;
3433	mode_t acc_mode;
3434	struct ucred *cred;
3435	int *privused;
3436{
3437	mode_t dac_granted;
3438#ifdef CAPABILITIES
3439	mode_t cap_granted;
3440#endif
3441
3442	/*
3443	 * Look for a normal, non-privileged way to access the file/directory
3444	 * as requested.  If it exists, go with that.
3445	 */
3446
3447	if (privused != NULL)
3448		*privused = 0;
3449
3450	dac_granted = 0;
3451
3452	/* Check the owner. */
3453	if (cred->cr_uid == file_uid) {
3454		dac_granted |= VADMIN;
3455		if (file_mode & S_IXUSR)
3456			dac_granted |= VEXEC;
3457		if (file_mode & S_IRUSR)
3458			dac_granted |= VREAD;
3459		if (file_mode & S_IWUSR)
3460			dac_granted |= (VWRITE | VAPPEND);
3461
3462		if ((acc_mode & dac_granted) == acc_mode)
3463			return (0);
3464
3465		goto privcheck;
3466	}
3467
3468	/* Otherwise, check the groups (first match) */
3469	if (groupmember(file_gid, cred)) {
3470		if (file_mode & S_IXGRP)
3471			dac_granted |= VEXEC;
3472		if (file_mode & S_IRGRP)
3473			dac_granted |= VREAD;
3474		if (file_mode & S_IWGRP)
3475			dac_granted |= (VWRITE | VAPPEND);
3476
3477		if ((acc_mode & dac_granted) == acc_mode)
3478			return (0);
3479
3480		goto privcheck;
3481	}
3482
3483	/* Otherwise, check everyone else. */
3484	if (file_mode & S_IXOTH)
3485		dac_granted |= VEXEC;
3486	if (file_mode & S_IROTH)
3487		dac_granted |= VREAD;
3488	if (file_mode & S_IWOTH)
3489		dac_granted |= (VWRITE | VAPPEND);
3490	if ((acc_mode & dac_granted) == acc_mode)
3491		return (0);
3492
3493privcheck:
3494	if (!suser_cred(cred, PRISON_ROOT)) {
3495		/* XXX audit: privilege used */
3496		if (privused != NULL)
3497			*privused = 1;
3498		return (0);
3499	}
3500
3501#ifdef CAPABILITIES
3502	/*
3503	 * Build a capability mask to determine if the set of capabilities
3504	 * satisfies the requirements when combined with the granted mask
3505	 * from above.
3506	 * For each capability, if the capability is required, bitwise
3507	 * or the request type onto the cap_granted mask.
3508	 */
3509	cap_granted = 0;
3510
3511	if (type == VDIR) {
3512		/*
3513		 * For directories, use CAP_DAC_READ_SEARCH to satisfy
3514		 * VEXEC requests, instead of CAP_DAC_EXECUTE.
3515		 */
3516		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3517		    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3518			cap_granted |= VEXEC;
3519	} else {
3520		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3521		    !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT))
3522			cap_granted |= VEXEC;
3523	}
3524
3525	if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
3526	    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3527		cap_granted |= VREAD;
3528
3529	if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3530	    !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT))
3531		cap_granted |= (VWRITE | VAPPEND);
3532
3533	if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3534	    !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT))
3535		cap_granted |= VADMIN;
3536
3537	if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
3538		/* XXX audit: privilege used */
3539		if (privused != NULL)
3540			*privused = 1;
3541		return (0);
3542	}
3543#endif
3544
3545	return ((acc_mode & VADMIN) ? EPERM : EACCES);
3546}
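
/*
 * Example usage (illustrative sketch): a filesystem's VOP_ACCESS
 * implementation typically ends by handing the per-file owner, group
 * and mode to vaccess().  The zero initializers below are placeholders
 * for whatever the filesystem's inode actually stores.
 */
static int
example_access(struct vop_access_args *ap)
{
	struct vnode *vp = ap->a_vp;
	mode_t file_mode = 0;		/* from the fs-specific inode */
	uid_t file_uid = 0;		/* likewise */
	gid_t file_gid = 0;		/* likewise */

	return (vaccess(vp->v_type, file_mode, file_uid, file_gid,
	    ap->a_mode, ap->a_cred, NULL));
}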
3547
3548/*
3549 * Credential check based on process requesting service, and per-attribute
3550 * permissions.
3551 */
3552int
3553extattr_check_cred(struct vnode *vp, int attrnamespace,
3554    struct ucred *cred, struct thread *td, int access)
3555{
3556
3557	/*
3558	 * Kernel-invoked always succeeds.
3559	 */
3560	if (cred == NOCRED)
3561		return (0);
3562
3563	/*
3564	 * Do not allow privileged processes in jail to directly
3565	 * manipulate system attributes.
3566	 *
3567	 * XXX What capability should apply here?
3568	 * Probably CAP_SYS_SETFFLAG.
3569	 */
3570	switch (attrnamespace) {
3571	case EXTATTR_NAMESPACE_SYSTEM:
3572		/* Potentially should be: return (EPERM); */
3573		return (suser_cred(cred, 0));
3574	case EXTATTR_NAMESPACE_USER:
3575		return (VOP_ACCESS(vp, access, cred, td));
3576	default:
3577		return (EPERM);
3578	}
3579}
3580