vfs_export.c revision 75858
1/*
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 *    must display the following acknowledgement:
20 *	This product includes software developed by the University of
21 *	California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 *    may be used to endorse or promote products derived from this software
24 *    without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
39 * $FreeBSD: head/sys/kern/vfs_export.c 75858 2001-04-23 09:05:15Z grog $
40 */
41
42/*
43 * External virtual filesystem routines
44 */
45#include "opt_ddb.h"
46#include "opt_ffs.h"
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/bio.h>
51#include <sys/buf.h>
52#include <sys/conf.h>
53#include <sys/dirent.h>
54#include <sys/domain.h>
55#include <sys/eventhandler.h>
56#include <sys/event.h>
57#include <sys/fcntl.h>
58#include <sys/kernel.h>
59#include <sys/kthread.h>
60#include <sys/ktr.h>
61#include <sys/malloc.h>
62#include <net/radix.h>
63#include <sys/socket.h>
64#include <sys/mount.h>
65#include <sys/mutex.h>
66#include <sys/namei.h>
67#include <sys/proc.h>
68#include <sys/reboot.h>
69#include <sys/socket.h>
70#include <sys/stat.h>
71#include <sys/sysctl.h>
72#include <sys/vmmeter.h>
73#include <sys/vnode.h>
74
75#include <machine/limits.h>
76
77#include <vm/vm.h>
78#include <vm/vm_object.h>
79#include <vm/vm_extern.h>
80#include <vm/pmap.h>
81#include <vm/vm_map.h>
82#include <vm/vm_page.h>
83#include <vm/vm_pager.h>
84#include <vm/vnode_pager.h>
85#include <vm/vm_zone.h>
86
87static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
88
89static void	addalias __P((struct vnode *vp, dev_t nvp_rdev));
90static void	insmntque __P((struct vnode *vp, struct mount *mp));
91static void	vclean __P((struct vnode *vp, int flags, struct proc *p));
92
93/*
94 * Number of vnodes in existence.  Increased whenever getnewvnode()
95 * allocates a new vnode, never decreased.
96 */
97static unsigned long	numvnodes;
98SYSCTL_LONG(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
99
100/*
101 * Conversion tables for conversion from vnode types to inode formats
102 * and back.
103 */
104enum vtype iftovt_tab[16] = {
105	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
106	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
107};
108int vttoif_tab[9] = {
109	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
110	S_IFSOCK, S_IFIFO, S_IFMT,
111};
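
/*
 * Illustrative sketch, not part of the original file: how the tables
 * above are consulted.  The IFTOVT() and VTTOIF() macros in <sys/vnode.h>
 * are the usual accessors; this helper merely restates the forward lookup.
 */
static __inline enum vtype
example_mode_to_vtype(mode_t mode)
{

	/* The file type occupies the top four bits of the mode word. */
	return (iftovt_tab[(mode & S_IFMT) >> 12]);
}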
112
113/*
114 * List of vnodes that are ready for recycling.
115 */
116static TAILQ_HEAD(freelst, vnode) vnode_free_list;
117
118/*
119 * Minimum number of free vnodes.  If there are fewer free vnodes than this,
120 * getnewvnode() will return a newly allocated vnode.
121 */
122static u_long wantfreevnodes = 25;
123SYSCTL_LONG(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
124/* Number of vnodes in the free list. */
125static u_long freevnodes = 0;
126SYSCTL_LONG(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
127/* Number of vnode allocations. */
128static u_long vnodeallocs = 0;
129SYSCTL_LONG(_debug, OID_AUTO, vnodeallocs, CTLFLAG_RD, &vnodeallocs, 0, "");
130/* How often, counted in vnode allocations, to attempt recycling vnodes from the namecache. */
131static u_long vnoderecycleperiod = 1000;
132SYSCTL_LONG(_debug, OID_AUTO, vnoderecycleperiod, CTLFLAG_RW, &vnoderecycleperiod, 0, "");
133/* Minimum number of total vnodes required to invoke vnode recycle from namecache. */
134static u_long vnoderecyclemintotalvn = 2000;
135SYSCTL_LONG(_debug, OID_AUTO, vnoderecyclemintotalvn, CTLFLAG_RW, &vnoderecyclemintotalvn, 0, "");
136/* Recycle vnodes from the namecache only while there are fewer free vnodes than this. */
137static u_long vnoderecycleminfreevn = 2000;
138SYSCTL_LONG(_debug, OID_AUTO, vnoderecycleminfreevn, CTLFLAG_RW, &vnoderecycleminfreevn, 0, "");
139/* Number of vnodes to attempt to recycle at a time. */
140static u_long vnoderecyclenumber = 3000;
141SYSCTL_LONG(_debug, OID_AUTO, vnoderecyclenumber, CTLFLAG_RW, &vnoderecyclenumber, 0, "");
142
143/*
144 * Various variables used for debugging the new implementation of
145 * reassignbuf().
146 * XXX these are probably of (very) limited utility now.
147 */
148static int reassignbufcalls;
149SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
150static int reassignbufloops;
151SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
152static int reassignbufsortgood;
153SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
154static int reassignbufsortbad;
155SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
156/* Set to 0 for old insertion-sort based reassignbuf, 1 for modern method. */
157static int reassignbufmethod = 1;
158SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
159
160#ifdef ENABLE_VFS_IOOPT
161/* See NOTES for a description of this setting. */
162int vfs_ioopt = 0;
163SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
164#endif
165
166/* List of mounted filesystems. */
167struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
168
169/* For any iteration/modification of mountlist */
170struct mtx mountlist_mtx;
171
172/* For any iteration/modification of mnt_vnodelist */
173struct mtx mntvnode_mtx;
174
175/*
176 * Cache for the mount type id assigned to NFS.  This is used for
177 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
178 */
179int	nfs_mount_type = -1;
180
181/* To keep more than one thread at a time from running vfs_getnewfsid */
182static struct mtx mntid_mtx;
183
184/* For any iteration/modification of vnode_free_list */
185static struct mtx vnode_free_list_mtx;
186
187/*
188 * For any iteration/modification of dev->si_hlist (linked through
189 * v_specnext)
190 */
191static struct mtx spechash_mtx;
192
193/* Publicly exported FS */
194struct nfs_public nfs_pub;
195
196/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
197static vm_zone_t vnode_zone;
198
199/* Set to 1 to print out reclaim of active vnodes */
200int	prtactive = 0;
201
202/*
203 * The workitem queue.
204 *
205 * It is useful to delay writes of file data and filesystem metadata
206 * for tens of seconds so that quickly created and deleted files need
207 * not waste disk bandwidth being created and removed. To realize this,
208 * we append vnodes to a "workitem" queue. When running with a soft
209 * updates implementation, most pending metadata dependencies should
210 * not wait for more than a few seconds. Thus, metadata updates for
211 * filesystems mounted on block devices are delayed only about half as
212 * long as file data. Similarly, directory updates are more critical, so
213 * they are delayed only about a third as long. Thus, there are
214 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
215 * one each second (driven off the filesystem syncer process). The
216 * syncer_delayno variable indicates the next queue that is to be processed.
217 * Items that need to be processed soon are placed in this queue:
218 *
219 *	syncer_workitem_pending[syncer_delayno]
220 *
221 * A delay of fifteen seconds is done by placing the request fifteen
222 * entries later in the queue:
223 *
224 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
225 *
226 */
227static int syncer_delayno = 0;
228static long syncer_mask;
229LIST_HEAD(synclist, vnode);
230static struct synclist *syncer_workitem_pending;
231
232#define SYNCER_MAXDELAY		32
233static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
234time_t syncdelay = 30;		/* max time to delay syncing data */
235time_t filedelay = 30;		/* time to delay syncing files */
236SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
237time_t dirdelay = 29;		/* time to delay syncing directories */
238SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
239time_t metadelay = 28;		/* time to delay syncing metadata */
240SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
241static int rushjob;		/* number of slots to run ASAP */
242static int stat_rush_requests;	/* number of times I/O sped up */
243SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
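
/*
 * Illustrative sketch, not part of the original file: how a delay in
 * seconds maps onto a slot of the workitem queue described above.  This
 * mirrors the computation done in vn_syncer_add_to_worklist() below.
 */
static __inline int
example_syncer_slot(int delay)
{

	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	return ((syncer_delayno + delay) & syncer_mask);
}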
244
245/*
246 * Number of vnodes we want to exist at any one time.  This is mostly used
247 * to size hash tables in vnode-related code.  It is normally not used in
248 * getnewvnode(), as wantfreevnodes is normally nonzero.
249 *
250 * XXX desiredvnodes is historical cruft and should not exist.
251 */
252int desiredvnodes;
253SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
254    &desiredvnodes, 0, "Maximum number of vnodes");
255
256static void	vfs_free_addrlist __P((struct netexport *nep));
257static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
258static int	vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
259				       struct export_args *argp));
260
261/*
262 * Initialize the vnode management data structures.
263 */
264static void
265vntblinit(void *dummy __unused)
266{
267
268	desiredvnodes = maxproc + cnt.v_page_count / 4;
269	mtx_init(&mountlist_mtx, "mountlist", MTX_DEF);
270	mtx_init(&mntvnode_mtx, "mntvnode", MTX_DEF);
271	mtx_init(&mntid_mtx, "mntid", MTX_DEF);
272	mtx_init(&spechash_mtx, "spechash", MTX_DEF);
273	TAILQ_INIT(&vnode_free_list);
274	mtx_init(&vnode_free_list_mtx, "vnode_free_list", MTX_DEF);
275	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
276	/*
277	 * Initialize the filesystem syncer.
278	 */
279	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
280		&syncer_mask);
281	syncer_maxdelay = syncer_mask + 1;
282}
283SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)
284
285
286/*
287 * Mark a mount point as busy. Used to synchronize access and to delay
288 * unmounting. Interlock is not released on failure.
289 */
290int
291vfs_busy(mp, flags, interlkp, p)
292	struct mount *mp;
293	int flags;
294	struct mtx *interlkp;
295	struct proc *p;
296{
297	int lkflags;
298
299	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
300		if (flags & LK_NOWAIT)
301			return (ENOENT);
302		mp->mnt_kern_flag |= MNTK_MWAIT;
303		/*
304		 * Since all busy locks are shared except the exclusive
305		 * lock granted when unmounting, the only place that a
306		 * wakeup needs to be done is at the release of the
307		 * exclusive lock at the end of dounmount.
308		 */
309		msleep((caddr_t)mp, interlkp, PVFS, "vfs_busy", 0);
310		return (ENOENT);
311	}
312	lkflags = LK_SHARED | LK_NOPAUSE;
313	if (interlkp)
314		lkflags |= LK_INTERLOCK;
315	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
316		panic("vfs_busy: unexpected lock failure");
317	return (0);
318}
319
320/*
321 * Free a busy filesystem.
322 */
323void
324vfs_unbusy(mp, p)
325	struct mount *mp;
326	struct proc *p;
327{
328
329	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
330}
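
/*
 * Usage sketch (an illustration, not part of the original file): the
 * conventional way to walk the mount list is to vfs_busy() each mount
 * point, skipping any that are being unmounted, and vfs_unbusy() it when
 * done, as the DDB and sysctl code later in this file does.
 */
static __inline void
example_walk_mounts(struct proc *p)
{
	struct mount *mp, *nmp;

	mtx_lock(&mountlist_mtx);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		/* ... examine the busied mount point here ... */
		mtx_lock(&mountlist_mtx);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp, p);
	}
	mtx_unlock(&mountlist_mtx);
}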
331
332/*
333 * Lookup a filesystem type, and if found allocate and initialize
334 * a mount structure for it.
335 *
336 * Devname is usually updated by mount(8) after booting.
337 */
338int
339vfs_rootmountalloc(fstypename, devname, mpp)
340	char *fstypename;
341	char *devname;
342	struct mount **mpp;
343{
344	struct proc *p = curproc;	/* XXX */
345	struct vfsconf *vfsp;
346	struct mount *mp;
347
348	if (fstypename == NULL)
349		return (ENODEV);
350	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
351		if (!strcmp(vfsp->vfc_name, fstypename))
352			break;
353	if (vfsp == NULL)
354		return (ENODEV);
355	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO);
356	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
357	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
358	LIST_INIT(&mp->mnt_vnodelist);
359	mp->mnt_vfc = vfsp;
360	mp->mnt_op = vfsp->vfc_vfsops;
361	mp->mnt_flag = MNT_RDONLY;
362	mp->mnt_vnodecovered = NULLVP;
363	vfsp->vfc_refcount++;
364	mp->mnt_iosize_max = DFLTPHYS;
365	mp->mnt_stat.f_type = vfsp->vfc_typenum;
366	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
367	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
368	mp->mnt_stat.f_mntonname[0] = '/';
369	mp->mnt_stat.f_mntonname[1] = 0;
370	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
371	*mpp = mp;
372	return (0);
373}
374
375/*
376 * Find an appropriate filesystem to use for the root. If a filesystem
377 * has not been preselected, walk through the list of known filesystems
378 * trying those that have mountroot routines, and try them until one
379 * works or we have tried them all.
380 */
381#ifdef notdef	/* XXX JH */
382int
383lite2_vfs_mountroot()
384{
385	struct vfsconf *vfsp;
386	extern int (*lite2_mountroot) __P((void));
387	int error;
388
389	if (lite2_mountroot != NULL)
390		return ((*lite2_mountroot)());
391	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
392		if (vfsp->vfc_mountroot == NULL)
393			continue;
394		if ((error = (*vfsp->vfc_mountroot)()) == 0)
395			return (0);
396		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
397	}
398	return (ENODEV);
399}
400#endif
401
402/*
403 * Lookup a mount point by filesystem identifier.
404 */
405struct mount *
406vfs_getvfs(fsid)
407	fsid_t *fsid;
408{
409	register struct mount *mp;
410
411	mtx_lock(&mountlist_mtx);
412	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
413		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
414		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
415			mtx_unlock(&mountlist_mtx);
416			return (mp);
417	    }
418	}
419	mtx_unlock(&mountlist_mtx);
420	return ((struct mount *) 0);
421}
422
423/*
424 * Get a new unique fsid.  Try to make its val[0] unique, since this value
425 * will be used to create fake device numbers for stat().  Also try (but
426 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
427 * support 16-bit device numbers.  We end up with unique val[0]'s for the
428 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
429 *
430 * Keep in mind that several mounts may be running in parallel.  Starting
431 * the search one past where the previous search terminated is both a
432 * micro-optimization and a defense against returning the same fsid to
433 * different mounts.
434 */
435void
436vfs_getnewfsid(mp)
437	struct mount *mp;
438{
439	static u_int16_t mntid_base;
440	fsid_t tfsid;
441	int mtype;
442
443	mtx_lock(&mntid_mtx);
444	mtype = mp->mnt_vfc->vfc_typenum;
445	tfsid.val[1] = mtype;
446	mtype = (mtype & 0xFF) << 24;
447	for (;;) {
448		tfsid.val[0] = makeudev(255,
449		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
450		mntid_base++;
451		if (vfs_getvfs(&tfsid) == NULL)
452			break;
453	}
454	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
455	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
456	mtx_unlock(&mntid_mtx);
457}
458
459/*
460 * Knob to control the precision of file timestamps:
461 *
462 *   0 = seconds only; nanoseconds zeroed.
463 *   1 = seconds and nanoseconds, accurate within 1/HZ.
464 *   2 = seconds and nanoseconds, truncated to microseconds.
465 * >=3 = seconds and nanoseconds, maximum precision.
466 */
467enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
468
469static int timestamp_precision = TSP_SEC;
470SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
471    &timestamp_precision, 0, "");
472
473/*
474 * Get a current timestamp.
475 */
476void
477vfs_timestamp(tsp)
478	struct timespec *tsp;
479{
480	struct timeval tv;
481
482	switch (timestamp_precision) {
483	case TSP_SEC:
484		tsp->tv_sec = time_second;
485		tsp->tv_nsec = 0;
486		break;
487	case TSP_HZ:
488		getnanotime(tsp);
489		break;
490	case TSP_USEC:
491		microtime(&tv);
492		TIMEVAL_TO_TIMESPEC(&tv, tsp);
493		break;
494	case TSP_NSEC:
495	default:
496		nanotime(tsp);
497		break;
498	}
499}
500
501/*
502 * Set vnode attributes to VNOVAL
503 */
504void
505vattr_null(vap)
506	register struct vattr *vap;
507{
508
509	vap->va_type = VNON;
510	vap->va_size = VNOVAL;
511	vap->va_bytes = VNOVAL;
512	vap->va_mode = VNOVAL;
513	vap->va_nlink = VNOVAL;
514	vap->va_uid = VNOVAL;
515	vap->va_gid = VNOVAL;
516	vap->va_fsid = VNOVAL;
517	vap->va_fileid = VNOVAL;
518	vap->va_blocksize = VNOVAL;
519	vap->va_rdev = VNOVAL;
520	vap->va_atime.tv_sec = VNOVAL;
521	vap->va_atime.tv_nsec = VNOVAL;
522	vap->va_mtime.tv_sec = VNOVAL;
523	vap->va_mtime.tv_nsec = VNOVAL;
524	vap->va_ctime.tv_sec = VNOVAL;
525	vap->va_ctime.tv_nsec = VNOVAL;
526	vap->va_flags = VNOVAL;
527	vap->va_gen = VNOVAL;
528	vap->va_vaflags = 0;
529}
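
/*
 * Usage sketch (an illustration, not part of the original file): callers
 * building a VOP_SETATTR() request start from a fully "unspecified"
 * attribute structure and then fill in only the fields to be changed,
 * for example when truncating a file to a given length.
 */
static __inline void
example_setattr_size(struct vattr *vap, u_quad_t size)
{

	vattr_null(vap);
	vap->va_size = size;
}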
530
531/*
532 * Routines having to do with the management of the vnode table.
533 */
534
535/*
536 * Return the next vnode from the free list.
537 */
538int
539getnewvnode(tag, mp, vops, vpp)
540	enum vtagtype tag;
541	struct mount *mp;
542	vop_t **vops;
543	struct vnode **vpp;
544{
545	int s, count;
546	struct proc *p = curproc;	/* XXX */
547	struct vnode *vp = NULL;
548	struct mount *vnmp;
549	vm_object_t object;
550
551	/*
552	 * We take the least recently used vnode from the freelist
553	 * if we can get it and it has no cached pages, and no
554	 * namecache entries are relative to it.
555	 * Otherwise we allocate a new vnode.
556	 */
557
558	s = splbio();
559	mtx_lock(&vnode_free_list_mtx);
560
561	if (wantfreevnodes && freevnodes < wantfreevnodes) {
562		vp = NULL;
563	} else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
564		/*
565		 * XXX: this is only here to be backwards compatible
566		 */
567		vp = NULL;
568	} else for (count = 0; count < freevnodes; count++) {
569		vp = TAILQ_FIRST(&vnode_free_list);
570		if (vp == NULL || vp->v_usecount)
571			panic("getnewvnode: free vnode isn't");
572		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
573
574		/*
575		 * Don't recycle if active in the namecache or
576		 * if it still has cached pages or we cannot get
577		 * its interlock.
578		 */
579		if (LIST_FIRST(&vp->v_cache_src) != NULL ||
580		    (VOP_GETVOBJECT(vp, &object) == 0 &&
581		     (object->resident_page_count || object->ref_count)) ||
582		    !mtx_trylock(&vp->v_interlock)) {
583			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
584			vp = NULL;
585			continue;
586		}
587		/*
588		 * Skip over it if its filesystem is being suspended.
589		 */
590		if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0)
591			break;
592		mtx_unlock(&vp->v_interlock);
593		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
594		vp = NULL;
595	}
596	if (vp) {
597		vp->v_flag |= VDOOMED;
598		vp->v_flag &= ~VFREE;
599		freevnodes--;
600		mtx_unlock(&vnode_free_list_mtx);
601		cache_purge(vp);
602		vp->v_lease = NULL;
603		if (vp->v_type != VBAD) {
604			vgonel(vp, p);
605		} else {
606			mtx_unlock(&vp->v_interlock);
607		}
608		vn_finished_write(vnmp);
609
610#ifdef INVARIANTS
611		{
612			int s;
613
614			if (vp->v_data)
615				panic("cleaned vnode isn't");
616			s = splbio();
617			if (vp->v_numoutput)
618				panic("Clean vnode has pending I/O's");
619			splx(s);
620			if (vp->v_writecount != 0)
621				panic("Non-zero write count");
622		}
623#endif
624		vp->v_flag = 0;
625		vp->v_lastw = 0;
626		vp->v_lasta = 0;
627		vp->v_cstart = 0;
628		vp->v_clen = 0;
629		vp->v_socket = 0;
630	} else {
631		mtx_unlock(&vnode_free_list_mtx);
632		vp = (struct vnode *) zalloc(vnode_zone);
633		bzero((char *) vp, sizeof *vp);
634		mtx_init(&vp->v_interlock, "vnode interlock", MTX_DEF);
635		vp->v_dd = vp;
636		mtx_init(&vp->v_pollinfo.vpi_lock, "vnode pollinfo", MTX_DEF);
637		cache_purge(vp);
638		LIST_INIT(&vp->v_cache_src);
639		TAILQ_INIT(&vp->v_cache_dst);
640		numvnodes++;
641	}
642
643	TAILQ_INIT(&vp->v_cleanblkhd);
644	TAILQ_INIT(&vp->v_dirtyblkhd);
645	vp->v_type = VNON;
646	vp->v_tag = tag;
647	vp->v_op = vops;
648	lockinit(&vp->v_lock, PVFS, "vnlock", 0, LK_NOPAUSE);
649	insmntque(vp, mp);
650	*vpp = vp;
651	vp->v_usecount = 1;
652	vp->v_data = 0;
653
654	splx(s);
655
656	vfs_object_create(vp, p, p->p_ucred);
657
658	vnodeallocs++;
659	if (vnodeallocs % vnoderecycleperiod == 0 &&
660	    freevnodes < vnoderecycleminfreevn &&
661	    vnoderecyclemintotalvn < numvnodes) {
662		/* Recycle vnodes. */
663		cache_purgeleafdirs(vnoderecyclenumber);
664	}
665
666	return (0);
667}
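
/*
 * Usage sketch (an illustration, not part of the original file): a
 * filesystem's inode-to-vnode routine typically obtains a fresh vnode
 * from getnewvnode(), attaches its private data, and sets the type
 * before handing the vnode back.  The tag and vnode operations vector
 * would of course be the filesystem's own.
 */
static __inline int
example_alloc_fs_vnode(struct mount *mp, vop_t **vops, void *fsdata,
    struct vnode **vpp)
{
	struct vnode *vp;
	int error;

	error = getnewvnode(VT_NON, mp, vops, &vp);
	if (error)
		return (error);
	vp->v_data = fsdata;
	vp->v_type = VREG;		/* or whatever the on-disk inode says */
	*vpp = vp;
	return (0);
}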
668
669/*
670 * Move a vnode from one mount queue to another.
671 */
672static void
673insmntque(vp, mp)
674	register struct vnode *vp;
675	register struct mount *mp;
676{
677
678	mtx_lock(&mntvnode_mtx);
679	/*
680	 * Delete from old mount point vnode list, if on one.
681	 */
682	if (vp->v_mount != NULL)
683		LIST_REMOVE(vp, v_mntvnodes);
684	/*
685	 * Insert into list of vnodes for the new mount point, if available.
686	 */
687	if ((vp->v_mount = mp) == NULL) {
688		mtx_unlock(&mntvnode_mtx);
689		return;
690	}
691	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
692	mtx_unlock(&mntvnode_mtx);
693}
694
695/*
696 * Update outstanding I/O count and do wakeup if requested.
697 */
698void
699vwakeup(bp)
700	register struct buf *bp;
701{
702	register struct vnode *vp;
703
704	bp->b_flags &= ~B_WRITEINPROG;
705	if ((vp = bp->b_vp)) {
706		vp->v_numoutput--;
707		if (vp->v_numoutput < 0)
708			panic("vwakeup: neg numoutput");
709		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
710			vp->v_flag &= ~VBWAIT;
711			wakeup((caddr_t) &vp->v_numoutput);
712		}
713	}
714}
715
716/*
717 * Flush out and invalidate all buffers associated with a vnode.
718 * Called with the underlying object locked.
719 */
720int
721vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
722	register struct vnode *vp;
723	int flags;
724	struct ucred *cred;
725	struct proc *p;
726	int slpflag, slptimeo;
727{
728	register struct buf *bp;
729	struct buf *nbp, *blist;
730	int s, error;
731	vm_object_t object;
732
733	if (flags & V_SAVE) {
734		s = splbio();
735		while (vp->v_numoutput) {
736			vp->v_flag |= VBWAIT;
737			error = tsleep((caddr_t)&vp->v_numoutput,
738			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
739			if (error) {
740				splx(s);
741				return (error);
742			}
743		}
744		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
745			splx(s);
746			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
747				return (error);
748			s = splbio();
749			if (vp->v_numoutput > 0 ||
750			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
751				panic("vinvalbuf: dirty bufs");
752		}
753		splx(s);
754  	}
755	s = splbio();
756	for (;;) {
757		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
758		if (!blist)
759			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
760		if (!blist)
761			break;
762
763		for (bp = blist; bp; bp = nbp) {
764			nbp = TAILQ_NEXT(bp, b_vnbufs);
765			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
766				error = BUF_TIMELOCK(bp,
767				    LK_EXCLUSIVE | LK_SLEEPFAIL,
768				    "vinvalbuf", slpflag, slptimeo);
769				if (error == ENOLCK)
770					break;
771				splx(s);
772				return (error);
773			}
774			/*
775			 * XXX Since there are no node locks for NFS, I
776			 * believe there is a slight chance that a delayed
777			 * write will occur while sleeping just above, so
778			 * check for it.  Note that vfs_bio_awrite expects
779			 * buffers to reside on a queue, while BUF_WRITE and
780			 * brelse do not.
781			 */
782			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
783				(flags & V_SAVE)) {
784
785				if (bp->b_vp == vp) {
786					if (bp->b_flags & B_CLUSTEROK) {
787						BUF_UNLOCK(bp);
788						vfs_bio_awrite(bp);
789					} else {
790						bremfree(bp);
791						bp->b_flags |= B_ASYNC;
792						BUF_WRITE(bp);
793					}
794				} else {
795					bremfree(bp);
796					(void) BUF_WRITE(bp);
797				}
798				break;
799			}
800			bremfree(bp);
801			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
802			bp->b_flags &= ~B_ASYNC;
803			brelse(bp);
804		}
805	}
806
807	while (vp->v_numoutput > 0) {
808		vp->v_flag |= VBWAIT;
809		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
810	}
811
812	splx(s);
813
814	/*
815	 * Destroy the copy in the VM cache, too.
816	 */
817	mtx_lock(&vp->v_interlock);
818	if (VOP_GETVOBJECT(vp, &object) == 0) {
819		vm_object_page_remove(object, 0, 0,
820			(flags & V_SAVE) ? TRUE : FALSE);
821	}
822	mtx_unlock(&vp->v_interlock);
823
824	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
825		panic("vinvalbuf: flush failed");
826	return (0);
827}
828
829/*
830 * Truncate a file's buffer and pages to a specified length.  This
831 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
832 * sync activity.
833 */
834int
835vtruncbuf(vp, cred, p, length, blksize)
836	register struct vnode *vp;
837	struct ucred *cred;
838	struct proc *p;
839	off_t length;
840	int blksize;
841{
842	register struct buf *bp;
843	struct buf *nbp;
844	int s, anyfreed;
845	int trunclbn;
846
847	/*
848	 * Round up to the *next* lbn.
849	 */
850	trunclbn = (length + blksize - 1) / blksize;
851
852	s = splbio();
853restart:
854	anyfreed = 1;
855	for (;anyfreed;) {
856		anyfreed = 0;
857		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
858			nbp = TAILQ_NEXT(bp, b_vnbufs);
859			if (bp->b_lblkno >= trunclbn) {
860				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
861					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
862					goto restart;
863				} else {
864					bremfree(bp);
865					bp->b_flags |= (B_INVAL | B_RELBUF);
866					bp->b_flags &= ~B_ASYNC;
867					brelse(bp);
868					anyfreed = 1;
869				}
870				if (nbp &&
871				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
872				    (nbp->b_vp != vp) ||
873				    (nbp->b_flags & B_DELWRI))) {
874					goto restart;
875				}
876			}
877		}
878
879		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
880			nbp = TAILQ_NEXT(bp, b_vnbufs);
881			if (bp->b_lblkno >= trunclbn) {
882				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
883					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
884					goto restart;
885				} else {
886					bremfree(bp);
887					bp->b_flags |= (B_INVAL | B_RELBUF);
888					bp->b_flags &= ~B_ASYNC;
889					brelse(bp);
890					anyfreed = 1;
891				}
892				if (nbp &&
893				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
894				    (nbp->b_vp != vp) ||
895				    (nbp->b_flags & B_DELWRI) == 0)) {
896					goto restart;
897				}
898			}
899		}
900	}
901
902	if (length > 0) {
903restartsync:
904		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
905			nbp = TAILQ_NEXT(bp, b_vnbufs);
906			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
907				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
908					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
909					goto restart;
910				} else {
911					bremfree(bp);
912					if (bp->b_vp == vp) {
913						bp->b_flags |= B_ASYNC;
914					} else {
915						bp->b_flags &= ~B_ASYNC;
916					}
917					BUF_WRITE(bp);
918				}
919				goto restartsync;
920			}
921
922		}
923	}
924
925	while (vp->v_numoutput > 0) {
926		vp->v_flag |= VBWAIT;
927		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
928	}
929
930	splx(s);
931
932	vnode_pager_setsize(vp, length);
933
934	return (0);
935}
936
937/*
938 * Associate a buffer with a vnode.
939 */
940void
941bgetvp(vp, bp)
942	register struct vnode *vp;
943	register struct buf *bp;
944{
945	int s;
946
947	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
948
949	vhold(vp);
950	bp->b_vp = vp;
951	bp->b_dev = vn_todev(vp);
952	/*
953	 * Insert onto list for new vnode.
954	 */
955	s = splbio();
956	bp->b_xflags |= BX_VNCLEAN;
957	bp->b_xflags &= ~BX_VNDIRTY;
958	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
959	splx(s);
960}
961
962/*
963 * Disassociate a buffer from a vnode.
964 */
965void
966brelvp(bp)
967	register struct buf *bp;
968{
969	struct vnode *vp;
970	struct buflists *listheadp;
971	int s;
972
973	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
974
975	/*
976	 * Delete from old vnode list, if on one.
977	 */
978	vp = bp->b_vp;
979	s = splbio();
980	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
981		if (bp->b_xflags & BX_VNDIRTY)
982			listheadp = &vp->v_dirtyblkhd;
983		else
984			listheadp = &vp->v_cleanblkhd;
985		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
986		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
987	}
988	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
989		vp->v_flag &= ~VONWORKLST;
990		LIST_REMOVE(vp, v_synclist);
991	}
992	splx(s);
993	bp->b_vp = (struct vnode *) 0;
994	vdrop(vp);
995}
996
997/*
998 * Add an item to the syncer work queue.
999 */
1000static void
1001vn_syncer_add_to_worklist(struct vnode *vp, int delay)
1002{
1003	int s, slot;
1004
1005	s = splbio();
1006
1007	if (vp->v_flag & VONWORKLST) {
1008		LIST_REMOVE(vp, v_synclist);
1009	}
1010
1011	if (delay > syncer_maxdelay - 2)
1012		delay = syncer_maxdelay - 2;
1013	slot = (syncer_delayno + delay) & syncer_mask;
1014
1015	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
1016	vp->v_flag |= VONWORKLST;
1017	splx(s);
1018}
1019
1020struct  proc *updateproc;
1021static void sched_sync __P((void));
1022static struct kproc_desc up_kp = {
1023	"syncer",
1024	sched_sync,
1025	&updateproc
1026};
1027SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
1028
1029/*
1030 * System filesystem synchronizer daemon.
1031 */
1032void
1033sched_sync(void)
1034{
1035	struct synclist *slp;
1036	struct vnode *vp;
1037	struct mount *mp;
1038	long starttime;
1039	int s;
1040	struct proc *p = updateproc;
1041
1042	mtx_lock(&Giant);
1043
1044	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
1045	    SHUTDOWN_PRI_LAST);
1046
1047	for (;;) {
1048		kthread_suspend_check(p);
1049
1050		starttime = time_second;
1051
1052		/*
1053		 * Push files whose dirty time has expired.  Be careful
1054		 * of interrupt race on slp queue.
1055		 */
1056		s = splbio();
1057		slp = &syncer_workitem_pending[syncer_delayno];
1058		syncer_delayno += 1;
1059		if (syncer_delayno == syncer_maxdelay)
1060			syncer_delayno = 0;
1061		splx(s);
1062
1063		while ((vp = LIST_FIRST(slp)) != NULL) {
1064			if (VOP_ISLOCKED(vp, NULL) == 0 &&
1065			    vn_start_write(vp, &mp, V_NOWAIT) == 0) {
1066				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
1067				(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
1068				VOP_UNLOCK(vp, 0, p);
1069				vn_finished_write(mp);
1070			}
1071			s = splbio();
1072			if (LIST_FIRST(slp) == vp) {
1073				/*
1074				 * Note: v_tag VT_VFS vps can remain on the
1075				 * worklist too with no dirty blocks, but
1076				 * since sync_fsync() moves it to a different
1077				 * slot we are safe.
1078				 */
1079				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
1080				    !vn_isdisk(vp, NULL))
1081					panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
1082				/*
1083				 * Put us back on the worklist.  The worklist
1084				 * routine will remove us from our current
1085				 * position and then add us back in at a later
1086				 * position.
1087				 */
1088				vn_syncer_add_to_worklist(vp, syncdelay);
1089			}
1090			splx(s);
1091		}
1092
1093		/*
1094		 * Do soft update processing.
1095		 */
1096#ifdef SOFTUPDATES
1097		softdep_process_worklist(NULL);
1098#endif
1099
1100		/*
1101		 * The variable rushjob allows the kernel to speed up the
1102		 * processing of the filesystem syncer process. A rushjob
1103		 * value of N tells the filesystem syncer to process the next
1104		 * N seconds worth of work on its queue ASAP. Currently rushjob
1105		 * is used by the soft update code to speed up the filesystem
1106		 * syncer process when the incore state is getting so far
1107		 * ahead of the disk that the kernel memory pool is being
1108		 * threatened with exhaustion.
1109		 */
1110		if (rushjob > 0) {
1111			rushjob -= 1;
1112			continue;
1113		}
1114		/*
1115		 * If it has taken us less than a second to process the
1116		 * current work, then wait. Otherwise start right over
1117		 * again. We can still lose time if any single round
1118		 * takes more than two seconds, but it does not really
1119		 * matter as we are just trying to generally pace the
1120		 * filesystem activity.
1121		 */
1122		if (time_second == starttime)
1123			tsleep(&lbolt, PPAUSE, "syncer", 0);
1124	}
1125}
1126
1127/*
1128 * Request the syncer daemon to speed up its work.
1129 * We never push it to speed up more than half of its
1130 * normal turn time, otherwise it could take over the cpu.
1131 */
1132int
1133speedup_syncer()
1134{
1135
1136	mtx_lock_spin(&sched_lock);
1137	if (updateproc->p_wchan == &lbolt)
1138		setrunnable(updateproc);
1139	mtx_unlock_spin(&sched_lock);
1140	if (rushjob < syncdelay / 2) {
1141		rushjob += 1;
1142		stat_rush_requests += 1;
1143		return (1);
1144	}
1145	return(0);
1146}
1147
1148/*
1149 * Associate a p-buffer with a vnode.
1150 *
1151 * Also sets B_PAGING flag to indicate that vnode is not fully associated
1152 * with the buffer.  i.e. the bp has not been linked into the vnode or
1153 * ref-counted.
1154 */
1155void
1156pbgetvp(vp, bp)
1157	register struct vnode *vp;
1158	register struct buf *bp;
1159{
1160
1161	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1162
1163	bp->b_vp = vp;
1164	bp->b_flags |= B_PAGING;
1165	bp->b_dev = vn_todev(vp);
1166}
1167
1168/*
1169 * Disassociate a p-buffer from a vnode.
1170 */
1171void
1172pbrelvp(bp)
1173	register struct buf *bp;
1174{
1175
1176	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1177
1178	/* XXX REMOVE ME */
1179	if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
1180		panic(
1181		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1182		    bp,
1183		    (int)bp->b_flags
1184		);
1185	}
1186	bp->b_vp = (struct vnode *) 0;
1187	bp->b_flags &= ~B_PAGING;
1188}
1189
1190/*
1191 * Change the vnode a pager buffer is associated with.
1192 */
1193void
1194pbreassignbuf(bp, newvp)
1195	struct buf *bp;
1196	struct vnode *newvp;
1197{
1198
1199	KASSERT(bp->b_flags & B_PAGING,
1200	    ("pbreassignbuf() on non phys bp %p", bp));
1201	bp->b_vp = newvp;
1202}
1203
1204/*
1205 * Reassign a buffer from one vnode to another.
1206 * Used to assign file specific control information
1207 * (indirect blocks) to the vnode to which they belong.
1208 */
1209void
1210reassignbuf(bp, newvp)
1211	register struct buf *bp;
1212	register struct vnode *newvp;
1213{
1214	struct buflists *listheadp;
1215	int delay;
1216	int s;
1217
1218	if (newvp == NULL) {
1219		printf("reassignbuf: NULL");
1220		return;
1221	}
1222	++reassignbufcalls;
1223
1224	/*
1225	 * B_PAGING flagged buffers cannot be reassigned because their vp
1226	 * is not fully linked in.
1227	 */
1228	if (bp->b_flags & B_PAGING)
1229		panic("cannot reassign paging buffer");
1230
1231	s = splbio();
1232	/*
1233	 * Delete from old vnode list, if on one.
1234	 */
1235	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
1236		if (bp->b_xflags & BX_VNDIRTY)
1237			listheadp = &bp->b_vp->v_dirtyblkhd;
1238		else
1239			listheadp = &bp->b_vp->v_cleanblkhd;
1240		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
1241		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1242		if (bp->b_vp != newvp) {
1243			vdrop(bp->b_vp);
1244			bp->b_vp = NULL;	/* for clarification */
1245		}
1246	}
1247	/*
1248	 * If dirty, put on list of dirty buffers; otherwise insert onto list
1249	 * of clean buffers.
1250	 */
1251	if (bp->b_flags & B_DELWRI) {
1252		struct buf *tbp;
1253
1254		listheadp = &newvp->v_dirtyblkhd;
1255		if ((newvp->v_flag & VONWORKLST) == 0) {
1256			switch (newvp->v_type) {
1257			case VDIR:
1258				delay = dirdelay;
1259				break;
1260			case VCHR:
1261				if (newvp->v_rdev->si_mountpoint != NULL) {
1262					delay = metadelay;
1263					break;
1264				}
1265				/* fall through */
1266			default:
1267				delay = filedelay;
1268			}
1269			vn_syncer_add_to_worklist(newvp, delay);
1270		}
1271		bp->b_xflags |= BX_VNDIRTY;
1272		tbp = TAILQ_FIRST(listheadp);
1273		if (tbp == NULL ||
1274		    bp->b_lblkno == 0 ||
1275		    (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
1276		    (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
1277			TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
1278			++reassignbufsortgood;
1279		} else if (bp->b_lblkno < 0) {
1280			TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
1281			++reassignbufsortgood;
1282		} else if (reassignbufmethod == 1) {
1283			/*
1284			 * New sorting algorithm, only handle sequential case,
1285			 * otherwise append to end (but before metadata)
1286			 */
1287			if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
1288			    (tbp->b_xflags & BX_VNDIRTY)) {
1289				/*
1290				 * Found the best place to insert the buffer
1291				 */
1292				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1293				++reassignbufsortgood;
1294			} else {
1295				/*
1296				 * Missed, append to end, but before meta-data.
1297				 * We know that the head buffer in the list is
1298				 * not meta-data due to prior conditionals.
1299				 *
1300				 * Indirect effects:  NFS second stage write
1301				 * tends to wind up here, giving maximum
1302				 * distance between the unstable write and the
1303				 * commit rpc.
1304				 */
1305				tbp = TAILQ_LAST(listheadp, buflists);
1306				while (tbp && tbp->b_lblkno < 0)
1307					tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
1308				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1309				++reassignbufsortbad;
1310			}
1311		} else {
1312			/*
1313			 * Old sorting algorithm, scan queue and insert
1314			 */
1315			struct buf *ttbp;
1316			while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
1317			    (ttbp->b_lblkno < bp->b_lblkno)) {
1318				++reassignbufloops;
1319				tbp = ttbp;
1320			}
1321			TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1322		}
1323	} else {
1324		bp->b_xflags |= BX_VNCLEAN;
1325		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
1326		if ((newvp->v_flag & VONWORKLST) &&
1327		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1328			newvp->v_flag &= ~VONWORKLST;
1329			LIST_REMOVE(newvp, v_synclist);
1330		}
1331	}
1332	if (bp->b_vp != newvp) {
1333		bp->b_vp = newvp;
1334		vhold(bp->b_vp);
1335	}
1336	splx(s);
1337}
1338
1339/*
1340 * Create a vnode for a device.
1341 * Used for mounting the root file system.
1342 */
1343int
1344bdevvp(dev, vpp)
1345	dev_t dev;
1346	struct vnode **vpp;
1347{
1348	register struct vnode *vp;
1349	struct vnode *nvp;
1350	int error;
1351
1352	if (dev == NODEV) {
1353		*vpp = NULLVP;
1354		return (ENXIO);
1355	}
1356	if (vfinddev(dev, VCHR, vpp))
1357		return (0);
1358	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
1359	if (error) {
1360		*vpp = NULLVP;
1361		return (error);
1362	}
1363	vp = nvp;
1364	vp->v_type = VCHR;
1365	addalias(vp, dev);
1366	*vpp = vp;
1367	return (0);
1368}
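
/*
 * Usage sketch (an illustration, not part of the original file): the root
 * mount code uses bdevvp() roughly like this to obtain a vnode for the
 * boot device before mounting the root filesystem on it.
 */
static __inline int
example_get_rootvp(dev_t bootdev, struct vnode **vpp)
{

	return (bdevvp(bootdev, vpp));
}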
1369
1370/*
1371 * Add vnode to the alias list hung off the dev_t.
1372 *
1373 * The reason for this gunk is that multiple vnodes can reference
1374 * the same physical device, so checking vp->v_usecount to see
1375 * how many users there are is inadequate; the v_usecount fields of
1376 * all the aliased vnodes need to be accumulated.  vcount() does that.
1377 */
1378struct vnode *
1379addaliasu(nvp, nvp_rdev)
1380	struct vnode *nvp;
1381	udev_t nvp_rdev;
1382{
1383	struct vnode *ovp;
1384	vop_t **ops;
1385	dev_t dev;
1386
1387	if (nvp->v_type == VBLK)
1388		return (nvp);
1389	if (nvp->v_type != VCHR)
1390		panic("addaliasu on non-special vnode");
1391	dev = udev2dev(nvp_rdev, 0);
1392	/*
1393	 * Check to see if we have a bdevvp vnode with no associated
1394	 * filesystem. If so, we want to associate the filesystem of
1395	 * the newly instantiated vnode with the bdevvp vnode and
1396	 * discard the newly created vnode rather than leaving the
1397	 * bdevvp vnode lying around with no associated filesystem.
1398	 */
1399	if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
1400		addalias(nvp, dev);
1401		return (nvp);
1402	}
1403	/*
1404	 * Discard unneeded vnode, but save its node specific data.
1405	 * Note that if there is a lock, it is carried over in the
1406	 * node specific data to the replacement vnode.
1407	 */
1408	vref(ovp);
1409	ovp->v_data = nvp->v_data;
1410	ovp->v_tag = nvp->v_tag;
1411	nvp->v_data = NULL;
1412	lockinit(&ovp->v_lock, PVFS, nvp->v_lock.lk_wmesg,
1413	    nvp->v_lock.lk_timo, nvp->v_lock.lk_flags & LK_EXTFLG_MASK);
1414	if (nvp->v_vnlock)
1415		ovp->v_vnlock = &ovp->v_lock;
1416	ops = ovp->v_op;
1417	ovp->v_op = nvp->v_op;
1418	if (VOP_ISLOCKED(nvp, curproc)) {
1419		VOP_UNLOCK(nvp, 0, curproc);
1420		vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curproc);
1421	}
1422	nvp->v_op = ops;
1423	insmntque(ovp, nvp->v_mount);
1424	vrele(nvp);
1425	vgone(nvp);
1426	return (ovp);
1427}
1428
1429/* This is a local helper function that does the same as addaliasu(), but
1430 * takes a dev_t instead of a udev_t. */
1431static void
1432addalias(nvp, dev)
1433	struct vnode *nvp;
1434	dev_t dev;
1435{
1436
1437	KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
1438	nvp->v_rdev = dev;
1439	mtx_lock(&spechash_mtx);
1440	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
1441	mtx_unlock(&spechash_mtx);
1442}
1443
1444/*
1445 * Grab a particular vnode from the free list, increment its
1446 * reference count and lock it. The vnode lock bit is set if the
1447 * vnode is being eliminated in vgone. The process is awakened
1448 * when the transition is completed, and an error returned to
1449 * indicate that the vnode is no longer usable (possibly having
1450 * been changed to a new file system type).
1451 */
1452int
1453vget(vp, flags, p)
1454	register struct vnode *vp;
1455	int flags;
1456	struct proc *p;
1457{
1458	int error;
1459
1460	/*
1461	 * If the vnode is in the process of being cleaned out for
1462	 * another use, we wait for the cleaning to finish and then
1463	 * return failure. Cleaning is determined by checking that
1464	 * the VXLOCK flag is set.
1465	 */
1466	if ((flags & LK_INTERLOCK) == 0)
1467		mtx_lock(&vp->v_interlock);
1468	if (vp->v_flag & VXLOCK) {
1469		if (vp->v_vxproc == curproc) {
1470			printf("VXLOCK interlock avoided\n");
1471		} else {
1472			vp->v_flag |= VXWANT;
1473			msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
1474			    "vget", 0);
1475			return (ENOENT);
1476		}
1477	}
1478
1479	vp->v_usecount++;
1480
1481	if (VSHOULDBUSY(vp))
1482		vbusy(vp);
1483	if (flags & LK_TYPE_MASK) {
1484		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
1485			/*
1486			 * must expand vrele here because we do not want
1487			 * to call VOP_INACTIVE if the reference count
1488			 * drops back to zero since it was never really
1489			 * active. We must remove it from the free list
1490			 * before sleeping so that multiple processes do
1491			 * not try to recycle it.
1492			 */
1493			mtx_lock(&vp->v_interlock);
1494			vp->v_usecount--;
1495			if (VSHOULDFREE(vp))
1496				vfree(vp);
1497			mtx_unlock(&vp->v_interlock);
1498		}
1499		return (error);
1500	}
1501	mtx_unlock(&vp->v_interlock);
1502	return (0);
1503}
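
/*
 * Usage sketch (an illustration, not part of the original file): the
 * typical pattern for temporarily using a vnode found on some list is to
 * vget() it with a lock type, do the work, and vput() it when finished.
 * vget() fails with ENOENT if the vnode is being reclaimed.
 */
static __inline int
example_use_vnode(struct vnode *vp, struct proc *p)
{
	int error;

	error = vget(vp, LK_EXCLUSIVE, p);
	if (error)
		return (error);
	/* ... operate on the referenced, exclusively locked vnode ... */
	vput(vp);
	return (0);
}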
1504
1505/*
1506 * Increase the reference count of a vnode.
1507 */
1508void
1509vref(struct vnode *vp)
1510{
1511	mtx_lock(&vp->v_interlock);
1512	vp->v_usecount++;
1513	mtx_unlock(&vp->v_interlock);
1514}
1515
1516/*
1517 * Vnode put/release.
1518 * If count drops to zero, call inactive routine and return to freelist.
1519 */
1520void
1521vrele(vp)
1522	struct vnode *vp;
1523{
1524	struct proc *p = curproc;	/* XXX */
1525
1526	KASSERT(vp != NULL, ("vrele: null vp"));
1527
1528	mtx_lock(&vp->v_interlock);
1529
1530	KASSERT(vp->v_writecount < vp->v_usecount, ("vrele: missed vn_close"));
1531
1532	if (vp->v_usecount > 1) {
1533
1534		vp->v_usecount--;
1535		mtx_unlock(&vp->v_interlock);
1536
1537		return;
1538	}
1539
1540	if (vp->v_usecount == 1) {
1541
1542		vp->v_usecount--;
1543		if (VSHOULDFREE(vp))
1544			vfree(vp);
1545	/*
1546	 * If we are doing a vput, the node is already locked, and we must
1547	 * call VOP_INACTIVE with the node locked.  So, in the case of
1548	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1549	 */
1550		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
1551			VOP_INACTIVE(vp, p);
1552		}
1553
1554	} else {
1555#ifdef DIAGNOSTIC
1556		vprint("vrele: negative ref count", vp);
1557		mtx_unlock(&vp->v_interlock);
1558#endif
1559		panic("vrele: negative ref cnt");
1560	}
1561}
1562
1563/*
1564 * Release an already locked vnode.  This gives the same effect as
1565 * unlock+vrele(), but takes less time and avoids releasing and
1566 * re-acquiring the lock (as vrele() acquires the lock internally).
1567 */
1568void
1569vput(vp)
1570	struct vnode *vp;
1571{
1572	struct proc *p = curproc;	/* XXX */
1573
1574	KASSERT(vp != NULL, ("vput: null vp"));
1575	mtx_lock(&vp->v_interlock);
1576	KASSERT(vp->v_writecount < vp->v_usecount, ("vput: missed vn_close"));
1577
1578	if (vp->v_usecount > 1) {
1579
1580		vp->v_usecount--;
1581		VOP_UNLOCK(vp, LK_INTERLOCK, p);
1582		return;
1583
1584	}
1585
1586	if (vp->v_usecount == 1) {
1587
1588		vp->v_usecount--;
1589		if (VSHOULDFREE(vp))
1590			vfree(vp);
1591	/*
1592	 * If we are doing a vput, the node is already locked, and we must
1593	 * call VOP_INACTIVE with the node locked.  So, in the case of
1594	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1595	 */
1596		mtx_unlock(&vp->v_interlock);
1597		VOP_INACTIVE(vp, p);
1598
1599	} else {
1600#ifdef DIAGNOSTIC
1601		vprint("vput: negative ref count", vp);
1602#endif
1603		panic("vput: negative ref cnt");
1604	}
1605}
1606
1607/*
1608 * Somebody doesn't want the vnode recycled.
1609 */
1610void
1611vhold(vp)
1612	register struct vnode *vp;
1613{
1614	int s;
1615
1616  	s = splbio();
1617	vp->v_holdcnt++;
1618	if (VSHOULDBUSY(vp))
1619		vbusy(vp);
1620	splx(s);
1621}
1622
1623/*
1624 * Note that there is one less holder who cares about this vnode.  vdrop() is the
1625 * opposite of vhold().
1626 */
1627void
1628vdrop(vp)
1629	register struct vnode *vp;
1630{
1631	int s;
1632
1633	s = splbio();
1634	if (vp->v_holdcnt <= 0)
1635		panic("vdrop: holdcnt");
1636	vp->v_holdcnt--;
1637	if (VSHOULDFREE(vp))
1638		vfree(vp);
1639	splx(s);
1640}
1641
1642/*
1643 * Remove any vnodes in the vnode table belonging to mount point mp.
1644 *
1645 * If MNT_NOFORCE is specified, there should not be any active vnodes;
1646 * return an error if any are found (nb: this is a user error, not a
1647 * system error). If MNT_FORCE is specified, detach any active vnodes
1648 * that are found.
1649 */
1650#ifdef DIAGNOSTIC
1651static int busyprt = 0;		/* print out busy vnodes */
1652SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
1653#endif
1654
1655int
1656vflush(mp, skipvp, flags)
1657	struct mount *mp;
1658	struct vnode *skipvp;
1659	int flags;
1660{
1661	struct proc *p = curproc;	/* XXX */
1662	struct vnode *vp, *nvp;
1663	int busy = 0;
1664
1665	mtx_lock(&mntvnode_mtx);
1666loop:
1667	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
1668		/*
1669		 * Make sure this vnode wasn't reclaimed in getnewvnode().
1670		 * Start over if it has (it won't be on the list anymore).
1671		 */
1672		if (vp->v_mount != mp)
1673			goto loop;
1674		nvp = LIST_NEXT(vp, v_mntvnodes);
1675		/*
1676		 * Skip over a selected vnode.
1677		 */
1678		if (vp == skipvp)
1679			continue;
1680
1681		mtx_lock(&vp->v_interlock);
1682		/*
1683		 * Skip over vnodes marked VSYSTEM.
1684		 */
1685		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
1686			mtx_unlock(&vp->v_interlock);
1687			continue;
1688		}
1689		/*
1690		 * If WRITECLOSE is set, only flush out regular file vnodes
1691		 * open for writing.
1692		 */
1693		if ((flags & WRITECLOSE) &&
1694		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
1695			mtx_unlock(&vp->v_interlock);
1696			continue;
1697		}
1698
1699		/*
1700		 * With v_usecount == 0, all we need to do is clear out the
1701		 * vnode data structures and we are done.
1702		 */
1703		if (vp->v_usecount == 0) {
1704			mtx_unlock(&mntvnode_mtx);
1705			vgonel(vp, p);
1706			mtx_lock(&mntvnode_mtx);
1707			continue;
1708		}
1709
1710		/*
1711		 * If FORCECLOSE is set, forcibly close the vnode. For block
1712		 * or character devices, revert to an anonymous device. For
1713		 * all other files, just kill them.
1714		 */
1715		if (flags & FORCECLOSE) {
1716			mtx_unlock(&mntvnode_mtx);
1717			if (vp->v_type != VCHR) {
1718				vgonel(vp, p);
1719			} else {
1720				vclean(vp, 0, p);
1721				vp->v_op = spec_vnodeop_p;
1722				insmntque(vp, (struct mount *) 0);
1723			}
1724			mtx_lock(&mntvnode_mtx);
1725			continue;
1726		}
1727#ifdef DIAGNOSTIC
1728		if (busyprt)
1729			vprint("vflush: busy vnode", vp);
1730#endif
1731		mtx_unlock(&vp->v_interlock);
1732		busy++;
1733	}
1734	mtx_unlock(&mntvnode_mtx);
1735	if (busy)
1736		return (EBUSY);
1737	return (0);
1738}
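
/*
 * Usage sketch (an illustration, not part of the original file): a
 * filesystem's unmount routine typically flushes its vnodes like this,
 * requesting FORCECLOSE only when a forced unmount was asked for.
 */
static __inline int
example_unmount_flush(struct mount *mp, int mntflags)
{
	int flags;

	flags = 0;
	if (mntflags & MNT_FORCE)
		flags |= FORCECLOSE;
	return (vflush(mp, NULL, flags));
}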
1739
1740/*
1741 * Disassociate the underlying file system from a vnode.
1742 */
1743static void
1744vclean(vp, flags, p)
1745	struct vnode *vp;
1746	int flags;
1747	struct proc *p;
1748{
1749	int active;
1750
1751	/*
1752	 * Check to see if the vnode is in use. If so we have to reference it
1753	 * before we clean it out so that its count cannot fall to zero and
1754	 * generate a race against ourselves to recycle it.
1755	 */
1756	if ((active = vp->v_usecount))
1757		vp->v_usecount++;
1758
1759	/*
1760	 * Prevent the vnode from being recycled or brought into use while we
1761	 * clean it out.
1762	 */
1763	if (vp->v_flag & VXLOCK)
1764		panic("vclean: deadlock");
1765	vp->v_flag |= VXLOCK;
1766	vp->v_vxproc = curproc;
1767	/*
1768	 * Even if the count is zero, the VOP_INACTIVE routine may still
1769	 * have the object locked while it cleans it out. The VOP_LOCK
1770	 * ensures that the VOP_INACTIVE routine is done with its work.
1771	 * For active vnodes, it ensures that no other activity can
1772	 * occur while the underlying object is being cleaned out.
1773	 */
1774	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
1775
1776	/*
1777	 * Clean out any buffers associated with the vnode.
1778	 * If the flush fails, just toss the buffers.
1779	 */
1780	if (flags & DOCLOSE) {
1781		if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL)
1782			(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
1783		if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0)
1784			vinvalbuf(vp, 0, NOCRED, p, 0, 0);
1785	}
1786
1787	VOP_DESTROYVOBJECT(vp);
1788
1789	/*
1790	 * If purging an active vnode, it must be closed and
1791	 * deactivated before being reclaimed. Note that the
1792	 * VOP_INACTIVE will unlock the vnode.
1793	 */
1794	if (active) {
1795		if (flags & DOCLOSE)
1796			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
1797		VOP_INACTIVE(vp, p);
1798	} else {
1799		/*
1800		 * Any other processes trying to obtain this lock must first
1801		 * wait for VXLOCK to clear, then call the new lock operation.
1802		 */
1803		VOP_UNLOCK(vp, 0, p);
1804	}
1805	/*
1806	 * Reclaim the vnode.
1807	 */
1808	if (VOP_RECLAIM(vp, p))
1809		panic("vclean: cannot reclaim");
1810
1811	if (active) {
1812		/*
1813		 * Inline copy of vrele() since VOP_INACTIVE
1814		 * has already been called.
1815		 */
1816		mtx_lock(&vp->v_interlock);
1817		if (--vp->v_usecount <= 0) {
1818#ifdef DIAGNOSTIC
1819			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1820				vprint("vclean: bad ref count", vp);
1821				panic("vclean: ref cnt");
1822			}
1823#endif
1824			vfree(vp);
1825		}
1826		mtx_unlock(&vp->v_interlock);
1827	}
1828
1829	cache_purge(vp);
1830	vp->v_vnlock = NULL;
1831	lockdestroy(&vp->v_lock);
1832
1833	if (VSHOULDFREE(vp))
1834		vfree(vp);
1835
1836	/*
1837	 * Done with purge, notify sleepers of the grim news.
1838	 */
1839	vp->v_op = dead_vnodeop_p;
1840	vn_pollgone(vp);
1841	vp->v_tag = VT_NON;
1842	vp->v_flag &= ~VXLOCK;
1843	vp->v_vxproc = NULL;
1844	if (vp->v_flag & VXWANT) {
1845		vp->v_flag &= ~VXWANT;
1846		wakeup((caddr_t) vp);
1847	}
1848}
1849
1850/*
1851 * Eliminate all activity associated with the requested vnode
1852 * and with all vnodes aliased to the requested vnode.
1853 */
1854int
1855vop_revoke(ap)
1856	struct vop_revoke_args /* {
1857		struct vnode *a_vp;
1858		int a_flags;
1859	} */ *ap;
1860{
1861	struct vnode *vp, *vq;
1862	dev_t dev;
1863
1864	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
1865
1866	vp = ap->a_vp;
1867	/*
1868	 * If a vgone (or vclean) is already in progress,
1869	 * wait until it is done and return.
1870	 */
1871	if (vp->v_flag & VXLOCK) {
1872		vp->v_flag |= VXWANT;
1873		msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
1874		    "vop_revokeall", 0);
1875		return (0);
1876	}
1877	dev = vp->v_rdev;
1878	for (;;) {
1879		mtx_lock(&spechash_mtx);
1880		vq = SLIST_FIRST(&dev->si_hlist);
1881		mtx_unlock(&spechash_mtx);
1882		if (!vq)
1883			break;
1884		vgone(vq);
1885	}
1886	return (0);
1887}
1888
1889/*
1890 * Recycle an unused vnode to the front of the free list.
1891 * Release the passed interlock if the vnode will be recycled.
1892 */
1893int
1894vrecycle(vp, inter_lkp, p)
1895	struct vnode *vp;
1896	struct mtx *inter_lkp;
1897	struct proc *p;
1898{
1899
1900	mtx_lock(&vp->v_interlock);
1901	if (vp->v_usecount == 0) {
1902		if (inter_lkp) {
1903			mtx_unlock(inter_lkp);
1904		}
1905		vgonel(vp, p);
1906		return (1);
1907	}
1908	mtx_unlock(&vp->v_interlock);
1909	return (0);
1910}
1911
1912/*
1913 * Eliminate all activity associated with a vnode
1914 * in preparation for reuse.
1915 */
1916void
1917vgone(vp)
1918	register struct vnode *vp;
1919{
1920	struct proc *p = curproc;	/* XXX */
1921
1922	mtx_lock(&vp->v_interlock);
1923	vgonel(vp, p);
1924}
1925
1926/*
1927 * vgone, with the vp interlock held.
1928 */
1929void
1930vgonel(vp, p)
1931	struct vnode *vp;
1932	struct proc *p;
1933{
1934	int s;
1935
1936	/*
1937	 * If a vgone (or vclean) is already in progress,
1938	 * wait until it is done and return.
1939	 */
1940	if (vp->v_flag & VXLOCK) {
1941		vp->v_flag |= VXWANT;
1942		msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
1943		    "vgone", 0);
1944		return;
1945	}
1946
1947	/*
1948	 * Clean out the filesystem specific data.
1949	 */
1950	vclean(vp, DOCLOSE, p);
1951	mtx_lock(&vp->v_interlock);
1952
1953	/*
1954	 * Delete from old mount point vnode list, if on one.
1955	 */
1956	if (vp->v_mount != NULL)
1957		insmntque(vp, (struct mount *)0);
1958	/*
1959	 * If special device, remove it from special device alias list
1960	 * if it is on one.
1961	 */
1962	if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) {
1963		mtx_lock(&spechash_mtx);
1964		SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
1965		freedev(vp->v_rdev);
1966		mtx_unlock(&spechash_mtx);
1967		vp->v_rdev = NULL;
1968	}
1969
1970	/*
1971	 * If it is on the freelist and not already at the head,
1972	 * move it to the head of the list. The test of the
1973	 * VDOOMED flag and the reference count of zero is because
1974	 * it will be removed from the free list by getnewvnode,
1975	 * but will not have its reference count incremented until
1976	 * after calling vgone. If the reference count were
1977	 * incremented first, vgone would (incorrectly) try to
1978	 * close the previous instance of the underlying object.
1979	 */
1980	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
1981		s = splbio();
1982		mtx_lock(&vnode_free_list_mtx);
1983		if (vp->v_flag & VFREE)
1984			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1985		else
1986			freevnodes++;
1987		vp->v_flag |= VFREE;
1988		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1989		mtx_unlock(&vnode_free_list_mtx);
1990		splx(s);
1991	}
1992
1993	vp->v_type = VBAD;
1994	mtx_unlock(&vp->v_interlock);
1995}
1996
1997/*
1998 * Lookup a vnode by device number.
1999 */
2000int
2001vfinddev(dev, type, vpp)
2002	dev_t dev;
2003	enum vtype type;
2004	struct vnode **vpp;
2005{
2006	struct vnode *vp;
2007
2008	mtx_lock(&spechash_mtx);
2009	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
2010		if (type == vp->v_type) {
2011			*vpp = vp;
2012			mtx_unlock(&spechash_mtx);
2013			return (1);
2014		}
2015	}
2016	mtx_unlock(&spechash_mtx);
2017	return (0);
2018}
2019
2020/*
2021 * Calculate the total number of references to a special device.
2022 */
2023int
2024vcount(vp)
2025	struct vnode *vp;
2026{
2027	struct vnode *vq;
2028	int count;
2029
2030	count = 0;
2031	mtx_lock(&spechash_mtx);
2032	SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext)
2033		count += vq->v_usecount;
2034	mtx_unlock(&spechash_mtx);
2035	return (count);
2036}
2037
2038/*
2039 * Same as vcount(), but takes the dev_t as argument.
2040 */
2041int
2042count_dev(dev)
2043	dev_t dev;
2044{
2045	struct vnode *vp;
2046
2047	vp = SLIST_FIRST(&dev->si_hlist);
2048	if (vp == NULL)
2049		return (0);
2050	return(vcount(vp));
2051}
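
/*
 * Illustrative sketch (not part of this file): a character driver can use
 * count_dev()/vcount() to decide whether a close leaves any vnode alias
 * still holding the device open.  example_close() and the "quiesce" step
 * are hypothetical; only the count_dev() call reflects the code above.
 */
#if 0
static int
example_close(dev, fflag, devtype, p)
	dev_t dev;
	int fflag, devtype;
	struct proc *p;
{

	if (count_dev(dev) == 0) {
		/* No alias holds the device open any more. */
		/* ... quiesce the hardware, release per-open state ... */
	}
	return (0);
}
#endif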
2052
2053/*
2054 * Print out a description of a vnode.
2055 */
2056static char *typename[] =
2057{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
2058
2059void
2060vprint(label, vp)
2061	char *label;
2062	struct vnode *vp;
2063{
2064	char buf[96];
2065
2066	if (label != NULL)
2067		printf("%s: %p: ", label, (void *)vp);
2068	else
2069		printf("%p: ", (void *)vp);
2070	printf("type %s, usecount %d, writecount %d, refcount %d,",
2071	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
2072	    vp->v_holdcnt);
2073	buf[0] = '\0';
2074	if (vp->v_flag & VROOT)
2075		strcat(buf, "|VROOT");
2076	if (vp->v_flag & VTEXT)
2077		strcat(buf, "|VTEXT");
2078	if (vp->v_flag & VSYSTEM)
2079		strcat(buf, "|VSYSTEM");
2080	if (vp->v_flag & VXLOCK)
2081		strcat(buf, "|VXLOCK");
2082	if (vp->v_flag & VXWANT)
2083		strcat(buf, "|VXWANT");
2084	if (vp->v_flag & VBWAIT)
2085		strcat(buf, "|VBWAIT");
2086	if (vp->v_flag & VDOOMED)
2087		strcat(buf, "|VDOOMED");
2088	if (vp->v_flag & VFREE)
2089		strcat(buf, "|VFREE");
2090	if (vp->v_flag & VOBJBUF)
2091		strcat(buf, "|VOBJBUF");
2092	if (buf[0] != '\0')
2093		printf(" flags (%s)", &buf[1]);
2094	if (vp->v_data == NULL) {
2095		printf("\n");
2096	} else {
2097		printf("\n\t");
2098		VOP_PRINT(vp);
2099	}
2100}
2101
2102#ifdef DDB
2103#include <ddb/ddb.h>
2104/*
2105 * List all of the locked vnodes in the system.
2106 * Called when debugging the kernel.
2107 */
2108DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
2109{
2110	struct proc *p = curproc;	/* XXX */
2111	struct mount *mp, *nmp;
2112	struct vnode *vp;
2113
2114	printf("Locked vnodes\n");
2115	mtx_lock(&mountlist_mtx);
2116	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2117		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) {
2118			nmp = TAILQ_NEXT(mp, mnt_list);
2119			continue;
2120		}
2121		LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
2122			if (VOP_ISLOCKED(vp, NULL))
2123				vprint((char *)0, vp);
2124		}
2125		mtx_lock(&mountlist_mtx);
2126		nmp = TAILQ_NEXT(mp, mnt_list);
2127		vfs_unbusy(mp, p);
2128	}
2129	mtx_unlock(&mountlist_mtx);
2130}
2131#endif
2132
2133/*
2134 * Top level filesystem related information gathering.
2135 */
2136static int	sysctl_ovfs_conf __P((SYSCTL_HANDLER_ARGS));
2137
2138static int
2139vfs_sysctl(SYSCTL_HANDLER_ARGS)
2140{
2141	int *name = (int *)arg1 - 1;	/* XXX */
2142	u_int namelen = arg2 + 1;	/* XXX */
2143	struct vfsconf *vfsp;
2144
2145#if 1 || defined(COMPAT_PRELITE2)
2146	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2147	if (namelen == 1)
2148		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2149#endif
2150
2151	/* XXX the below code does not compile; vfs_sysctl does not exist. */
2152#ifdef notyet
2153	/* all sysctl names at this level are at least name and field */
2154	if (namelen < 2)
2155		return (ENOTDIR);		/* overloaded */
2156	if (name[0] != VFS_GENERIC) {
2157		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2158			if (vfsp->vfc_typenum == name[0])
2159				break;
2160		if (vfsp == NULL)
2161			return (EOPNOTSUPP);
2162		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
2163		    oldp, oldlenp, newp, newlen, p));
2164	}
2165#endif
2166	switch (name[1]) {
2167	case VFS_MAXTYPENUM:
2168		if (namelen != 2)
2169			return (ENOTDIR);
2170		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2171	case VFS_CONF:
2172		if (namelen != 3)
2173			return (ENOTDIR);	/* overloaded */
2174		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2175			if (vfsp->vfc_typenum == name[2])
2176				break;
2177		if (vfsp == NULL)
2178			return (EOPNOTSUPP);
2179		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
2180	}
2181	return (EOPNOTSUPP);
2182}
2183
2184SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
2185	"Generic filesystem");
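
/*
 * Illustrative sketch (not part of this file): userland can reach the
 * handler above through sysctl(3).  The MIB layout below is inferred from
 * the VFS_MAXTYPENUM/VFS_CONF cases in vfs_sysctl() and should be treated
 * as an assumption, not a documented contract.
 */
#if 0
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/mount.h>
#include <stdio.h>

int
print_maxvfsconf(void)
{
	int mib[3], maxtype;
	size_t len = sizeof(maxtype);

	mib[0] = CTL_VFS;
	mib[1] = VFS_GENERIC;
	mib[2] = VFS_MAXTYPENUM;
	if (sysctl(mib, 3, &maxtype, &len, NULL, 0) == -1)
		return (-1);
	printf("highest vfs type number: %d\n", maxtype);
	return (0);
}
#endif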
2186
2187#if 1 || defined(COMPAT_PRELITE2)
2188
2189static int
2190sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
2191{
2192	int error;
2193	struct vfsconf *vfsp;
2194	struct ovfsconf ovfs;
2195
2196	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
2197		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
2198		strcpy(ovfs.vfc_name, vfsp->vfc_name);
2199		ovfs.vfc_index = vfsp->vfc_typenum;
2200		ovfs.vfc_refcount = vfsp->vfc_refcount;
2201		ovfs.vfc_flags = vfsp->vfc_flags;
2202		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2203		if (error)
2204			return error;
2205	}
2206	return 0;
2207}
2208
2209#endif /* 1 || COMPAT_PRELITE2 */
2210
2211#if COMPILING_LINT
2212#define KINFO_VNODESLOP	10
2213/*
2214 * Dump vnode list (via sysctl).
2215 * Copyout address of vnode followed by vnode.
2216 */
2217/* ARGSUSED */
2218static int
2219sysctl_vnode(SYSCTL_HANDLER_ARGS)
2220{
2221	struct proc *p = curproc;	/* XXX */
2222	struct mount *mp, *nmp;
2223	struct vnode *nvp, *vp;
2224	int error;
2225
2226#define VPTRSZ	sizeof (struct vnode *)
2227#define VNODESZ	sizeof (struct vnode)
2228
2229	req->lock = 0;
2230	if (!req->oldptr) /* Make an estimate */
2231		return (SYSCTL_OUT(req, 0,
2232			(numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
2233
2234	mtx_lock(&mountlist_mtx);
2235	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2236		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) {
2237			nmp = TAILQ_NEXT(mp, mnt_list);
2238			continue;
2239		}
2240again:
2241		mtx_lock(&mntvnode_mtx);
2242		for (vp = LIST_FIRST(&mp->mnt_vnodelist);
2243		     vp != NULL;
2244		     vp = nvp) {
2245			/*
2246			 * Check that the vp is still associated with
2247			 * this filesystem.  RACE: could have been
2248			 * recycled onto the same filesystem.
2249			 */
2250			if (vp->v_mount != mp) {
2251				mtx_unlock(&mntvnode_mtx);
2252				goto again;
2253			}
2254			nvp = LIST_NEXT(vp, v_mntvnodes);
2255			mtx_unlock(&mntvnode_mtx);
2256			if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
2257			    (error = SYSCTL_OUT(req, vp, VNODESZ)))
2258				return (error);
2259			mtx_lock(&mntvnode_mtx);
2260		}
2261		mtx_unlock(&mntvnode_mtx);
2262		mtx_lock(&mountlist_mtx);
2263		nmp = TAILQ_NEXT(mp, mnt_list);
2264		vfs_unbusy(mp, p);
2265	}
2266	mtx_unlock(&mountlist_mtx);
2267
2268	return (0);
2269}
2270
2271/*
2272 * XXX
2273 * Exporting the vnode list on large systems causes them to crash.
2274 * Exporting the vnode list on medium systems causes sysctl to coredump.
2275 */
2276SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
2277	0, 0, sysctl_vnode, "S,vnode", "");
2278#endif
2279
2280/*
2281 * Check to see if a filesystem is mounted on a block device.
2282 */
2283int
2284vfs_mountedon(vp)
2285	struct vnode *vp;
2286{
2287
2288	if (vp->v_rdev->si_mountpoint != NULL)
2289		return (EBUSY);
2290	return (0);
2291}
2292
2293/*
2294 * Unmount all filesystems. The list is traversed in reverse order
2295 * of mounting to avoid dependencies.
2296 */
2297void
2298vfs_unmountall()
2299{
2300	struct mount *mp;
2301	struct proc *p;
2302	int error;
2303
2304	if (curproc != NULL)
2305		p = curproc;
2306	else
2307		p = initproc;	/* XXX XXX should this be proc0? */
2308	/*
2309	 * Since this only runs when rebooting, it is not interlocked.
2310	 */
2311	while(!TAILQ_EMPTY(&mountlist)) {
2312		mp = TAILQ_LAST(&mountlist, mntlist);
2313		error = dounmount(mp, MNT_FORCE, p);
2314		if (error) {
2315			TAILQ_REMOVE(&mountlist, mp, mnt_list);
2316			printf("unmount of %s failed (",
2317			    mp->mnt_stat.f_mntonname);
2318			if (error == EBUSY)
2319				printf("BUSY)\n");
2320			else
2321				printf("%d)\n", error);
2322		} else {
2323			/* The unmount has removed mp from the mountlist */
2324		}
2325	}
2326}
2327
2328/*
2329 * Build hash lists of net addresses and hang them off the mount point.
2330 * Called by vfs_export() (below) to set up the lists of export addresses.
2331 */
2332static int
2333vfs_hang_addrlist(mp, nep, argp)
2334	struct mount *mp;
2335	struct netexport *nep;
2336	struct export_args *argp;
2337{
2338	register struct netcred *np;
2339	register struct radix_node_head *rnh;
2340	register int i;
2341	struct radix_node *rn;
2342	struct sockaddr *saddr, *smask = 0;
2343	struct domain *dom;
2344	int error;
2345
2346	if (argp->ex_addrlen == 0) {
2347		if (mp->mnt_flag & MNT_DEFEXPORTED)
2348			return (EPERM);
2349		np = &nep->ne_defexported;
2350		np->netc_exflags = argp->ex_flags;
2351		bzero(&np->netc_anon, sizeof(np->netc_anon));
2352		np->netc_anon.cr_uid = argp->ex_anon.cr_uid;
2353		np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups;
2354		bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups,
2355		    sizeof(np->netc_anon.cr_groups));
2356		np->netc_anon.cr_ref = 1;
2357		mp->mnt_flag |= MNT_DEFEXPORTED;
2358		return (0);
2359	}
2360	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
2361	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK | M_ZERO);
2362	saddr = (struct sockaddr *) (np + 1);
2363	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
2364		goto out;
2365	if (saddr->sa_len > argp->ex_addrlen)
2366		saddr->sa_len = argp->ex_addrlen;
2367	if (argp->ex_masklen) {
2368		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
2369		error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
2370		if (error)
2371			goto out;
2372		if (smask->sa_len > argp->ex_masklen)
2373			smask->sa_len = argp->ex_masklen;
2374	}
2375	i = saddr->sa_family;
2376	if ((rnh = nep->ne_rtable[i]) == 0) {
2377		/*
2378		 * Seems silly to initialize every AF when most are not used;
2379		 * do so on demand here.
2380		 */
2381		for (dom = domains; dom; dom = dom->dom_next)
2382			if (dom->dom_family == i && dom->dom_rtattach) {
2383				dom->dom_rtattach((void **) &nep->ne_rtable[i],
2384				    dom->dom_rtoffset);
2385				break;
2386			}
2387		if ((rnh = nep->ne_rtable[i]) == 0) {
2388			error = ENOBUFS;
2389			goto out;
2390		}
2391	}
2392	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
2393	    np->netc_rnodes);
2394	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
2395		error = EPERM;
2396		goto out;
2397	}
2398	np->netc_exflags = argp->ex_flags;
2399	bzero(&np->netc_anon, sizeof(np->netc_anon));
2400	np->netc_anon.cr_uid = argp->ex_anon.cr_uid;
2401	np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups;
2402	bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups,
2403	    sizeof(np->netc_anon.cr_groups));
2404	np->netc_anon.cr_ref = 1;
2405	return (0);
2406out:
2407	free(np, M_NETADDR);
2408	return (error);
2409}
2410
2411/* Helper for vfs_free_addrlist. */
2412/* ARGSUSED */
2413static int
2414vfs_free_netcred(rn, w)
2415	struct radix_node *rn;
2416	void *w;
2417{
2418	register struct radix_node_head *rnh = (struct radix_node_head *) w;
2419
2420	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
2421	free((caddr_t) rn, M_NETADDR);
2422	return (0);
2423}
2424
2425/*
2426 * Free the net address hash lists that are hanging off the mount points.
2427 */
2428static void
2429vfs_free_addrlist(nep)
2430	struct netexport *nep;
2431{
2432	register int i;
2433	register struct radix_node_head *rnh;
2434
2435	for (i = 0; i <= AF_MAX; i++)
2436		if ((rnh = nep->ne_rtable[i])) {
2437			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
2438			    (caddr_t) rnh);
2439			free((caddr_t) rnh, M_RTABLE);
2440			nep->ne_rtable[i] = 0;
2441		}
2442}
2443
2444/*
2445 * High level function to manipulate export options on a mount point
2446 * and the passed in netexport.
2447 * The export_args structure pointed to by argp holds the options to set;
2448 * the structure is described in sys/mount.h.
2449 */
2450int
2451vfs_export(mp, nep, argp)
2452	struct mount *mp;
2453	struct netexport *nep;
2454	struct export_args *argp;
2455{
2456	int error;
2457
2458	if (argp->ex_flags & MNT_DELEXPORT) {
2459		if (mp->mnt_flag & MNT_EXPUBLIC) {
2460			vfs_setpublicfs(NULL, NULL, NULL);
2461			mp->mnt_flag &= ~MNT_EXPUBLIC;
2462		}
2463		vfs_free_addrlist(nep);
2464		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
2465	}
2466	if (argp->ex_flags & MNT_EXPORTED) {
2467		if (argp->ex_flags & MNT_EXPUBLIC) {
2468			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
2469				return (error);
2470			mp->mnt_flag |= MNT_EXPUBLIC;
2471		}
2472		if ((error = vfs_hang_addrlist(mp, nep, argp)))
2473			return (error);
2474		mp->mnt_flag |= MNT_EXPORTED;
2475	}
2476	return (0);
2477}
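
/*
 * Illustrative sketch (not part of this file): a filesystem's mount routine
 * normally ends up here when it is handed export arguments on an MNT_UPDATE
 * mount.  example_update_exports() and the location of the netexport
 * structure in the fs-private mount data are hypothetical; only the
 * vfs_export() call reflects the interface above.
 */
#if 0
static int
example_update_exports(mp, emp, expargs)
	struct mount *mp;
	struct example_mount *emp;	/* fs-private mount data */
	struct export_args *expargs;	/* copied in from userland */
{

	/* emp->em_export is assumed to be a struct netexport. */
	return (vfs_export(mp, &emp->em_export, expargs));
}
#endif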
2478
2479/*
2480 * Set the publicly exported filesystem (WebNFS). Currently, only
2481 * one public filesystem is possible in the spec (RFC 2054 and 2055)
2482 */
2483int
2484vfs_setpublicfs(mp, nep, argp)
2485	struct mount *mp;
2486	struct netexport *nep;
2487	struct export_args *argp;
2488{
2489	int error;
2490	struct vnode *rvp;
2491	char *cp;
2492
2493	/*
2494	 * mp == NULL -> invalidate the current info; the FS is
2495	 * no longer exported.  May be called from either vfs_export
2496	 * or unmount, so check whether it has already been done.
2497	 */
2498	if (mp == NULL) {
2499		if (nfs_pub.np_valid) {
2500			nfs_pub.np_valid = 0;
2501			if (nfs_pub.np_index != NULL) {
2502				FREE(nfs_pub.np_index, M_TEMP);
2503				nfs_pub.np_index = NULL;
2504			}
2505		}
2506		return (0);
2507	}
2508
2509	/*
2510	 * Only one allowed at a time.
2511	 */
2512	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
2513		return (EBUSY);
2514
2515	/*
2516	 * Get real filehandle for root of exported FS.
2517	 */
2518	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
2519	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
2520
2521	if ((error = VFS_ROOT(mp, &rvp)))
2522		return (error);
2523
2524	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
2525		return (error);
2526
2527	vput(rvp);
2528
2529	/*
2530	 * If an indexfile was specified, pull it in.
2531	 */
2532	if (argp->ex_indexfile != NULL) {
2533		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
2534		    M_WAITOK);
2535		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
2536		    MAXNAMLEN, (size_t *)0);
2537		if (!error) {
2538			/*
2539			 * Check for illegal filenames.
2540			 */
2541			for (cp = nfs_pub.np_index; *cp; cp++) {
2542				if (*cp == '/') {
2543					error = EINVAL;
2544					break;
2545				}
2546			}
2547		}
2548		if (error) {
2549			FREE(nfs_pub.np_index, M_TEMP);
2550			return (error);
2551		}
2552	}
2553
2554	nfs_pub.np_mount = mp;
2555	nfs_pub.np_valid = 1;
2556	return (0);
2557}
2558
2559/*
2560 * Used by the filesystems to determine if a given network address
2561 * (passed in 'nam') is present in their exports list; returns a pointer
2562 * to struct netcred so that the filesystem can examine it for
2563 * access rights (read/write/etc).
2564 */
2565struct netcred *
2566vfs_export_lookup(mp, nep, nam)
2567	register struct mount *mp;
2568	struct netexport *nep;
2569	struct sockaddr *nam;
2570{
2571	register struct netcred *np;
2572	register struct radix_node_head *rnh;
2573	struct sockaddr *saddr;
2574
2575	np = NULL;
2576	if (mp->mnt_flag & MNT_EXPORTED) {
2577		/*
2578		 * Lookup in the export list first.
2579		 */
2580		if (nam != NULL) {
2581			saddr = nam;
2582			rnh = nep->ne_rtable[saddr->sa_family];
2583			if (rnh != NULL) {
2584				np = (struct netcred *)
2585					(*rnh->rnh_matchaddr)((caddr_t)saddr,
2586							      rnh);
2587				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
2588					np = NULL;
2589			}
2590		}
2591		/*
2592		 * If no address match, use the default if it exists.
2593		 */
2594		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
2595			np = &nep->ne_defexported;
2596	}
2597	return (np);
2598}
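
/*
 * Illustrative sketch (not part of this file): an exporting filesystem's
 * check-export path uses vfs_export_lookup() to map a client address to
 * its export flags and anonymous credential.  example_checkexp() and the
 * em_export field are hypothetical; the netcred fields are taken from the
 * code above.
 */
#if 0
static int
example_checkexp(mp, nam, exflagsp, credanonp)
	struct mount *mp;
	struct sockaddr *nam;
	int *exflagsp;
	struct ucred **credanonp;
{
	struct example_mount *emp = (struct example_mount *)mp->mnt_data;
	struct netcred *np;

	np = vfs_export_lookup(mp, &emp->em_export, nam);
	if (np == NULL)
		return (EACCES);
	*exflagsp = np->netc_exflags;
	*credanonp = &np->netc_anon;
	return (0);
}
#endif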
2599
2600/*
2601 * Perform msync on all vnodes under a mount point.
2602 * The mount point must be locked.
2603 */
2604void
2605vfs_msync(struct mount *mp, int flags) {
2606	struct vnode *vp, *nvp;
2607	struct vm_object *obj;
2608	int anyio, tries;
2609
2610	tries = 5;
2611loop:
2612	anyio = 0;
2613	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) {
2614
2615		nvp = LIST_NEXT(vp, v_mntvnodes);
2616
2617		if (vp->v_mount != mp) {
2618			goto loop;
2619		}
2620
2621		if (vp->v_flag & VXLOCK)	/* XXX: what if MNT_WAIT? */
2622			continue;
2623
2624		if (flags != MNT_WAIT) {
2625			if (VOP_GETVOBJECT(vp, &obj) != 0 ||
2626			    (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
2627				continue;
2628			if (VOP_ISLOCKED(vp, NULL))
2629				continue;
2630		}
2631
2632		mtx_lock(&vp->v_interlock);
2633		if (VOP_GETVOBJECT(vp, &obj) == 0 &&
2634		    (obj->flags & OBJ_MIGHTBEDIRTY)) {
2635			if (!vget(vp,
2636				LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
2637				if (VOP_GETVOBJECT(vp, &obj) == 0) {
2638					vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
2639					anyio = 1;
2640				}
2641				vput(vp);
2642			}
2643		} else {
2644			mtx_unlock(&vp->v_interlock);
2645		}
2646	}
2647	if (anyio && (--tries > 0))
2648		goto loop;
2649}
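
/*
 * Illustrative sketch (not part of this file): besides the lazy syncer
 * below, the unmount path is the other main consumer of vfs_msync();
 * dirty mmap()ed pages are pushed to their vnodes before the filesystem
 * itself is synced.  The fragment is a simplified sketch of that ordering,
 * not a copy of dounmount().
 */
#if 0
	/* mp is busied and marked for unmount at this point. */
	vfs_msync(mp, MNT_WAIT);			/* flush dirty VM pages */
	error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p);	/* then the buffers */
#endif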
2650
2651/*
2652 * Create the VM object needed for VMIO and mmap support.  This
2653 * is done for all VREG files in the system.  Some filesystems can gain
2654 * the additional metadata buffering capability of the VMIO code by
2655 * putting the device node into VMIO mode as well.
2656 *
2657 * vp must be locked when vfs_object_create is called.
2658 */
2659int
2660vfs_object_create(vp, p, cred)
2661	struct vnode *vp;
2662	struct proc *p;
2663	struct ucred *cred;
2664{
2665	return (VOP_CREATEVOBJECT(vp, cred, p));
2666}
2667
2668/*
2669 * Mark a vnode as free, putting it up for recycling.
2670 */
2671void
2672vfree(vp)
2673	struct vnode *vp;
2674{
2675	int s;
2676
2677	s = splbio();
2678	mtx_lock(&vnode_free_list_mtx);
2679	KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free"));
2680	if (vp->v_flag & VAGE) {
2681		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2682	} else {
2683		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
2684	}
2685	freevnodes++;
2686	mtx_unlock(&vnode_free_list_mtx);
2687	vp->v_flag &= ~VAGE;
2688	vp->v_flag |= VFREE;
2689	splx(s);
2690}
2691
2692/*
2693 * Opposite of vfree() - mark a vnode as in use.
2694 */
2695void
2696vbusy(vp)
2697	struct vnode *vp;
2698{
2699	int s;
2700
2701	s = splbio();
2702	mtx_lock(&vnode_free_list_mtx);
2703	KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free"));
2704	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2705	freevnodes--;
2706	mtx_unlock(&vnode_free_list_mtx);
2707	vp->v_flag &= ~(VFREE|VAGE);
2708	splx(s);
2709}
2710
2711/*
2712 * Record a process's interest in events which might happen to
2713 * a vnode.  Because poll uses the historic select-style interface
2714 * internally, this routine serves as both the ``check for any
2715 * pending events'' and the ``record my interest in future events''
2716 * functions.  (These are done together, while the lock is held,
2717 * to avoid race conditions.)
2718 */
2719int
2720vn_pollrecord(vp, p, events)
2721	struct vnode *vp;
2722	struct proc *p;
2723	short events;
2724{
2725	mtx_lock(&vp->v_pollinfo.vpi_lock);
2726	if (vp->v_pollinfo.vpi_revents & events) {
2727		/*
2728		 * This leaves events we are not interested
2729		 * in available for the other process which
2730		 * presumably had requested them
2731		 * (otherwise they would never have been
2732		 * recorded).
2733		 */
2734		events &= vp->v_pollinfo.vpi_revents;
2735		vp->v_pollinfo.vpi_revents &= ~events;
2736
2737		mtx_unlock(&vp->v_pollinfo.vpi_lock);
2738		return events;
2739	}
2740	vp->v_pollinfo.vpi_events |= events;
2741	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
2742	mtx_unlock(&vp->v_pollinfo.vpi_lock);
2743	return 0;
2744}
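
/*
 * Illustrative sketch (not part of this file): a filesystem's VOP_POLL
 * entry point typically reports the always-true conditions itself and
 * defers anything else to vn_pollrecord().  example_poll() is hypothetical
 * and loosely modelled on the standard poll vop.
 */
#if 0
static int
example_poll(ap)
	struct vop_poll_args /* {
		struct vnode *a_vp;
		int a_events;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{

	/* Regular files are always readable and writable. */
	if ((ap->a_events & ~POLLSTANDARD) == 0)
		return (ap->a_events &
		    (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
	/* Anything fancier: record interest and return pending events. */
	return (vn_pollrecord(ap->a_vp, ap->a_p, ap->a_events));
}
#endif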
2745
2746/*
2747 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
2748 * it is possible for us to miss an event due to race conditions, but
2749 * that condition is expected to be rare, so for the moment it is the
2750 * preferred interface.
2751 */
2752void
2753vn_pollevent(vp, events)
2754	struct vnode *vp;
2755	short events;
2756{
2757	mtx_lock(&vp->v_pollinfo.vpi_lock);
2758	if (vp->v_pollinfo.vpi_events & events) {
2759		/*
2760		 * We clear vpi_events so that we don't
2761		 * call selwakeup() twice if two events are
2762		 * posted before the polling process(es) is
2763		 * awakened.  This also ensures that we take at
2764		 * most one selwakeup() if the polling process
2765		 * is no longer interested.  However, it does
2766		 * mean that only one event can be noticed at
2767		 * a time.  (Perhaps we should only clear those
2768		 * event bits which we note?) XXX
2769		 */
2770		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
2771		vp->v_pollinfo.vpi_revents |= events;
2772		selwakeup(&vp->v_pollinfo.vpi_selinfo);
2773	}
2774	mtx_unlock(&vp->v_pollinfo.vpi_lock);
2775}
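
/*
 * Illustrative sketch (not part of this file): code that changes a vnode's
 * state notifies pollers either directly through vn_pollevent() or through
 * the VN_POLLEVENT() macro in sys/vnode.h, which short-circuits when nobody
 * has registered interest.  The write-path fragment below is hypothetical.
 */
#if 0
	/* Data was just appended to vp; wake up anyone polling for input. */
	VN_POLLEVENT(vp, POLLIN | POLLRDNORM);
#endif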
2776
2777#define VN_KNOTE(vp, b) \
2778	KNOTE((struct klist *)&vp->v_pollinfo.vpi_selinfo.si_note, (b))
2779
2780/*
2781 * Wake up anyone polling on vp because it is being revoked.
2782 * This depends on dead_poll() returning POLLHUP for correct
2783 * behavior.
2784 */
2785void
2786vn_pollgone(vp)
2787	struct vnode *vp;
2788{
2789	mtx_lock(&vp->v_pollinfo.vpi_lock);
2790	VN_KNOTE(vp, NOTE_REVOKE);
2791	if (vp->v_pollinfo.vpi_events) {
2792		vp->v_pollinfo.vpi_events = 0;
2793		selwakeup(&vp->v_pollinfo.vpi_selinfo);
2794	}
2795	mtx_unlock(&vp->v_pollinfo.vpi_lock);
2796}
2797
2798
2799
2800/*
2801 * Routine to create and manage a filesystem syncer vnode.
2802 */
2803#define sync_close ((int (*) __P((struct  vop_close_args *)))nullop)
2804static int	sync_fsync __P((struct  vop_fsync_args *));
2805static int	sync_inactive __P((struct  vop_inactive_args *));
2806static int	sync_reclaim  __P((struct  vop_reclaim_args *));
2807#define sync_lock ((int (*) __P((struct  vop_lock_args *)))vop_nolock)
2808#define sync_unlock ((int (*) __P((struct  vop_unlock_args *)))vop_nounlock)
2809static int	sync_print __P((struct vop_print_args *));
2810#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)
2811
2812static vop_t **sync_vnodeop_p;
2813static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
2814	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
2815	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
2816	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
2817	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
2818	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
2819	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
2820	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
2821	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
2822	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
2823	{ NULL, NULL }
2824};
2825static struct vnodeopv_desc sync_vnodeop_opv_desc =
2826	{ &sync_vnodeop_p, sync_vnodeop_entries };
2827
2828VNODEOP_SET(sync_vnodeop_opv_desc);
2829
2830/*
2831 * Create a new filesystem syncer vnode for the specified mount point.
2832 */
2833int
2834vfs_allocate_syncvnode(mp)
2835	struct mount *mp;
2836{
2837	struct vnode *vp;
2838	static long start, incr, next;
2839	int error;
2840
2841	/* Allocate a new vnode */
2842	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
2843		mp->mnt_syncer = NULL;
2844		return (error);
2845	}
2846	vp->v_type = VNON;
2847	/*
2848	 * Place the vnode onto the syncer worklist. We attempt to
2849	 * scatter them about on the list so that they will go off
2850	 * at evenly distributed times even if all the filesystems
2851	 * are mounted at once.
2852	 */
2853	next += incr;
2854	if (next == 0 || next > syncer_maxdelay) {
2855		start /= 2;
2856		incr /= 2;
2857		if (start == 0) {
2858			start = syncer_maxdelay / 2;
2859			incr = syncer_maxdelay;
2860		}
2861		next = start;
2862	}
2863	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
2864	mp->mnt_syncer = vp;
2865	return (0);
2866}
2867
2868/*
2869 * Do a lazy sync of the filesystem.
2870 */
2871static int
2872sync_fsync(ap)
2873	struct vop_fsync_args /* {
2874		struct vnode *a_vp;
2875		struct ucred *a_cred;
2876		int a_waitfor;
2877		struct proc *a_p;
2878	} */ *ap;
2879{
2880	struct vnode *syncvp = ap->a_vp;
2881	struct mount *mp = syncvp->v_mount;
2882	struct proc *p = ap->a_p;
2883	int asyncflag;
2884
2885	/*
2886	 * We only need to do something if this is a lazy evaluation.
2887	 */
2888	if (ap->a_waitfor != MNT_LAZY)
2889		return (0);
2890
2891	/*
2892	 * Move ourselves to the back of the sync list.
2893	 */
2894	vn_syncer_add_to_worklist(syncvp, syncdelay);
2895
2896	/*
2897	 * Walk the list of vnodes pushing all that are dirty and
2898	 * not already on the sync list.
2899	 */
2900	mtx_lock(&mountlist_mtx);
2901	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, p) != 0) {
2902		mtx_unlock(&mountlist_mtx);
2903		return (0);
2904	}
2905	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
2906		vfs_unbusy(mp, p);
2907		return (0);
2908	}
2909	asyncflag = mp->mnt_flag & MNT_ASYNC;
2910	mp->mnt_flag &= ~MNT_ASYNC;
2911	vfs_msync(mp, MNT_NOWAIT);
2912	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
2913	if (asyncflag)
2914		mp->mnt_flag |= MNT_ASYNC;
2915	vn_finished_write(mp);
2916	vfs_unbusy(mp, p);
2917	return (0);
2918}
2919
2920/*
2921 * The syncer vnode is no longer referenced.
2922 */
2923static int
2924sync_inactive(ap)
2925	struct vop_inactive_args /* {
2926		struct vnode *a_vp;
2927		struct proc *a_p;
2928	} */ *ap;
2929{
2930
2931	vgone(ap->a_vp);
2932	return (0);
2933}
2934
2935/*
2936 * The syncer vnode is no longer needed and is being decommissioned.
2937 *
2938 * Modifications to the worklist must be protected at splbio().
2939 */
2940static int
2941sync_reclaim(ap)
2942	struct vop_reclaim_args /* {
2943		struct vnode *a_vp;
2944	} */ *ap;
2945{
2946	struct vnode *vp = ap->a_vp;
2947	int s;
2948
2949	s = splbio();
2950	vp->v_mount->mnt_syncer = NULL;
2951	if (vp->v_flag & VONWORKLST) {
2952		LIST_REMOVE(vp, v_synclist);
2953		vp->v_flag &= ~VONWORKLST;
2954	}
2955	splx(s);
2956
2957	return (0);
2958}
2959
2960/*
2961 * Print out a syncer vnode.
2962 */
2963static int
2964sync_print(ap)
2965	struct vop_print_args /* {
2966		struct vnode *a_vp;
2967	} */ *ap;
2968{
2969	struct vnode *vp = ap->a_vp;
2970
2971	printf("syncer vnode");
2972	if (vp->v_vnlock != NULL)
2973		lockmgr_printinfo(vp->v_vnlock);
2974	printf("\n");
2975	return (0);
2976}
2977
2978/*
2979 * Extract the dev_t from a VCHR vnode.
2980 */
2981dev_t
2982vn_todev(vp)
2983	struct vnode *vp;
2984{
2985	if (vp->v_type != VCHR)
2986		return (NODEV);
2987	return (vp->v_rdev);
2988}
2989
2990/*
2991 * Check if vnode represents a disk device
2992 */
2993int
2994vn_isdisk(vp, errp)
2995	struct vnode *vp;
2996	int *errp;
2997{
2998	struct cdevsw *cdevsw;
2999
3000	if (vp->v_type != VCHR) {
3001		if (errp != NULL)
3002			*errp = ENOTBLK;
3003		return (0);
3004	}
3005	if (vp->v_rdev == NULL) {
3006		if (errp != NULL)
3007			*errp = ENXIO;
3008		return (0);
3009	}
3010	cdevsw = devsw(vp->v_rdev);
3011	if (cdevsw == NULL) {
3012		if (errp != NULL)
3013			*errp = ENXIO;
3014		return (0);
3015	}
3016	if (!(cdevsw->d_flags & D_DISK)) {
3017		if (errp != NULL)
3018			*errp = ENOTBLK;
3019		return (0);
3020	}
3021	if (errp != NULL)
3022		*errp = 0;
3023	return (1);
3024}
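
/*
 * Illustrative sketch (not part of this file): mount code that has looked
 * up a device vnode commonly validates it with vn_isdisk() before going
 * any further.  The fragment and the devvp variable are hypothetical, but
 * the calling convention (returns 0 and sets *errp on failure) matches the
 * function above.
 */
#if 0
	int error;

	if (!vn_isdisk(devvp, &error)) {
		vrele(devvp);
		return (error);		/* ENOTBLK or ENXIO */
	}
#endif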
3025
3026/*
3027 * Free data allocated by namei(); see namei(9) for details.
3028 */
3029void
3030NDFREE(ndp, flags)
3031     struct nameidata *ndp;
3032     const uint flags;
3033{
3034	if (!(flags & NDF_NO_FREE_PNBUF) &&
3035	    (ndp->ni_cnd.cn_flags & HASBUF)) {
3036		zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
3037		ndp->ni_cnd.cn_flags &= ~HASBUF;
3038	}
3039	if (!(flags & NDF_NO_DVP_UNLOCK) &&
3040	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
3041	    ndp->ni_dvp != ndp->ni_vp)
3042		VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_proc);
3043	if (!(flags & NDF_NO_DVP_RELE) &&
3044	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
3045		vrele(ndp->ni_dvp);
3046		ndp->ni_dvp = NULL;
3047	}
3048	if (!(flags & NDF_NO_VP_UNLOCK) &&
3049	    (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
3050		VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_proc);
3051	if (!(flags & NDF_NO_VP_RELE) &&
3052	    ndp->ni_vp) {
3053		vrele(ndp->ni_vp);
3054		ndp->ni_vp = NULL;
3055	}
3056	if (!(flags & NDF_NO_STARTDIR_RELE) &&
3057	    (ndp->ni_cnd.cn_flags & SAVESTART)) {
3058		vrele(ndp->ni_startdir);
3059		ndp->ni_startdir = NULL;
3060	}
3061}
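
/*
 * Illustrative sketch (not part of this file): the common pattern is to let
 * namei() do a lookup, free the pathname buffer with
 * NDFREE(..., NDF_ONLY_PNBUF), and release the vnode when done.
 * example_lookup_path() and what it does with the vnode are hypothetical.
 */
#if 0
static int
example_lookup_path(path, p)
	char *path;
	struct proc *p;
{
	struct nameidata nd;
	int error;

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, path, p);
	if ((error = namei(&nd)) != 0)
		return (error);
	NDFREE(&nd, NDF_ONLY_PNBUF);	/* keep the vnode, drop the name buffer */
	/* ... operate on the locked vnode nd.ni_vp here ... */
	vput(nd.ni_vp);			/* unlock and release */
	return (0);
}
#endif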
3062
3063/*
3064 * Common file system object access control check routine.  Accepts a
3065 * vnode's type, "mode", uid and gid, requested access mode, credentials,
3066 * and optional call-by-reference privused argument allowing vaccess()
3067 * to indicate to the caller whether privilege was used to satisfy the
3068 * request.  Returns 0 on success, or an errno on failure.
3069 */
3070int
3071vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
3072	enum vtype type;
3073	mode_t file_mode;
3074	uid_t file_uid;
3075	gid_t file_gid;
3076	mode_t acc_mode;
3077	struct ucred *cred;
3078	int *privused;
3079{
3080	mode_t dac_granted;
3081#ifdef CAPABILITIES
3082	mode_t cap_granted;
3083#endif
3084
3085	/*
3086	 * Look for a normal, non-privileged way to access the file/directory
3087	 * as requested.  If it exists, go with that.
3088	 */
3089
3090	if (privused != NULL)
3091		*privused = 0;
3092
3093	dac_granted = 0;
3094
3095	/* Check the owner. */
3096	if (cred->cr_uid == file_uid) {
3097		dac_granted |= VADMIN;
3098		if (file_mode & S_IXUSR)
3099			dac_granted |= VEXEC;
3100		if (file_mode & S_IRUSR)
3101			dac_granted |= VREAD;
3102		if (file_mode & S_IWUSR)
3103			dac_granted |= VWRITE;
3104
3105		if ((acc_mode & dac_granted) == acc_mode)
3106			return (0);
3107
3108		goto privcheck;
3109	}
3110
3111	/* Otherwise, check the groups (first match) */
3112	if (groupmember(file_gid, cred)) {
3113		if (file_mode & S_IXGRP)
3114			dac_granted |= VEXEC;
3115		if (file_mode & S_IRGRP)
3116			dac_granted |= VREAD;
3117		if (file_mode & S_IWGRP)
3118			dac_granted |= VWRITE;
3119
3120		if ((acc_mode & dac_granted) == acc_mode)
3121			return (0);
3122
3123		goto privcheck;
3124	}
3125
3126	/* Otherwise, check everyone else. */
3127	if (file_mode & S_IXOTH)
3128		dac_granted |= VEXEC;
3129	if (file_mode & S_IROTH)
3130		dac_granted |= VREAD;
3131	if (file_mode & S_IWOTH)
3132		dac_granted |= VWRITE;
3133	if ((acc_mode & dac_granted) == acc_mode)
3134		return (0);
3135
3136privcheck:
3137	if (!suser_xxx(cred, NULL, PRISON_ROOT)) {
3138		/* XXX audit: privilege used */
3139		if (privused != NULL)
3140			*privused = 1;
3141		return (0);
3142	}
3143
3144#ifdef CAPABILITIES
3145	/*
3146	 * Build a capability mask to determine if the set of capabilities
3147	 * satisfies the requirements when combined with the granted mask
3148	 * from above.
3149	 * For each capability, if the capability is required, bitwise
3150	 * or the request type onto the cap_granted mask.
3151	 */
3152	cap_granted = 0;
3153	if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3154	    !cap_check_xxx(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT))
3155	    cap_granted |= VEXEC;
3156
3157	if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
3158	    !cap_check_xxx(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3159		cap_granted |= VREAD;
3160
3161	if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3162	    !cap_check_xxx(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT))
3163		cap_granted |= VWRITE;
3164
3165	if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3166	    !cap_check_xxx(cred, NULL, CAP_FOWNER, PRISON_ROOT))
3167		cap_granted |= VADMIN;
3168
3169	if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
3170		/* XXX audit: privilege used */
3171		if (privused != NULL)
3172			*privused = 1;
3173		return (0);
3174	}
3175#endif
3176
3177	return ((acc_mode & VADMIN) ? EPERM : EACCES);
3178}
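
/*
 * Illustrative sketch (not part of this file): a filesystem's VOP_ACCESS
 * routine typically finishes by handing its ownership and mode bits to
 * vaccess().  example_access() and the example_node fields are
 * hypothetical; the argument order matches the function above.
 */
#if 0
static int
example_access(ap)
	struct vop_access_args /* {
		struct vnode *a_vp;
		int a_mode;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct example_node *np = (struct example_node *)vp->v_data;

	/* (Read-only filesystem and immutable-flag checks would go here.) */
	return (vaccess(vp->v_type, np->n_mode, np->n_uid, np->n_gid,
	    ap->a_mode, ap->a_cred, (int *)0));
}
#endif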
3179