vfs_subr.c revision 184554
1/*-
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
35 */
36
37/*
38 * External virtual filesystem routines
39 */
40
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD: head/sys/kern/vfs_subr.c 184554 2008-11-02 10:15:42Z attilio $");
43
44#include "opt_ddb.h"
45#include "opt_mac.h"
46
47#include <sys/param.h>
48#include <sys/systm.h>
49#include <sys/bio.h>
50#include <sys/buf.h>
51#include <sys/condvar.h>
52#include <sys/conf.h>
53#include <sys/dirent.h>
54#include <sys/event.h>
55#include <sys/eventhandler.h>
56#include <sys/extattr.h>
57#include <sys/file.h>
58#include <sys/fcntl.h>
59#include <sys/jail.h>
60#include <sys/kdb.h>
61#include <sys/kernel.h>
62#include <sys/kthread.h>
63#include <sys/lockf.h>
64#include <sys/malloc.h>
65#include <sys/mount.h>
66#include <sys/namei.h>
67#include <sys/priv.h>
68#include <sys/reboot.h>
69#include <sys/sleepqueue.h>
70#include <sys/stat.h>
71#include <sys/sysctl.h>
72#include <sys/syslog.h>
73#include <sys/vmmeter.h>
74#include <sys/vnode.h>
75
76#include <machine/stdarg.h>
77
78#include <security/mac/mac_framework.h>
79
80#include <vm/vm.h>
81#include <vm/vm_object.h>
82#include <vm/vm_extern.h>
83#include <vm/pmap.h>
84#include <vm/vm_map.h>
85#include <vm/vm_page.h>
86#include <vm/vm_kern.h>
87#include <vm/uma.h>
88
89#ifdef DDB
90#include <ddb/ddb.h>
91#endif
92
93#define	WI_MPSAFEQ	0
94#define	WI_GIANTQ	1
95
96static MALLOC_DEFINE(M_NETADDR, "subr_export_host", "Export host address structure");
97
98static void	delmntque(struct vnode *vp);
99static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
100		    int slpflag, int slptimeo);
101static void	syncer_shutdown(void *arg, int howto);
102static int	vtryrecycle(struct vnode *vp);
103static void	vbusy(struct vnode *vp);
104static void	vinactive(struct vnode *, struct thread *);
105static void	v_incr_usecount(struct vnode *);
106static void	v_decr_usecount(struct vnode *);
107static void	v_decr_useonly(struct vnode *);
108static void	v_upgrade_usecount(struct vnode *);
109static void	vfree(struct vnode *);
110static void	vnlru_free(int);
111static void	vdestroy(struct vnode *);
112static void	vgonel(struct vnode *);
113static void	vfs_knllock(void *arg);
114static void	vfs_knlunlock(void *arg);
115static int	vfs_knllocked(void *arg);
116static void	destroy_vpollinfo(struct vpollinfo *vi);
117
118/*
119 * Enable Giant pushdown based on whether or not the vm is mpsafe in this
120 * build.  Without mpsafevm the buffer cache can not run Giant free.
121 */
122int mpsafe_vfs = 1;
123TUNABLE_INT("debug.mpsafevfs", &mpsafe_vfs);
124SYSCTL_INT(_debug, OID_AUTO, mpsafevfs, CTLFLAG_RD, &mpsafe_vfs, 0,
125    "MPSAFE VFS");
126
127/*
128 * Number of vnodes in existence.  Increased whenever getnewvnode()
129 * allocates a new vnode, decreased when vdestroy() is called on a
130 * VI_DOOMED vnode.
131 */
132static unsigned long	numvnodes;
133
134SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
135
136/*
137 * Conversion tables for conversion from vnode types to inode formats
138 * and back.
139 */
140enum vtype iftovt_tab[16] = {
141	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
142	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
143};
144int vttoif_tab[10] = {
145	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
146	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
147};
148
149/*
150 * List of vnodes that are ready for recycling.
151 */
152static TAILQ_HEAD(freelst, vnode) vnode_free_list;
153
154/*
155 * Free vnode target.  Free vnodes may simply be files which have been stat'd
156 * but not read.  This is somewhat common, and a small cache of such files
157 * should be kept to avoid recreation costs.
158 */
159static u_long wantfreevnodes;
160SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
161/* Number of vnodes in the free list. */
162static u_long freevnodes;
163SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
164
165/*
166 * Various variables used for debugging the new implementation of
167 * reassignbuf().
168 * XXX these are probably of (very) limited utility now.
169 */
170static int reassignbufcalls;
171SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
172
173/*
174 * Cache for the mount type id assigned to NFS.  This is used for
175 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
176 */
177int	nfs_mount_type = -1;
178
179/* To keep more than one thread at a time from running vfs_getnewfsid */
180static struct mtx mntid_mtx;
181
182/*
183 * Lock for any access to the following:
184 *	vnode_free_list
185 *	numvnodes
186 *	freevnodes
187 */
188static struct mtx vnode_free_list_mtx;
189
190/* Publicly exported FS */
191struct nfs_public nfs_pub;
192
193/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
194static uma_zone_t vnode_zone;
195static uma_zone_t vnodepoll_zone;
196
197/* Set to 1 to print out reclaim of active vnodes */
198int	prtactive;
199
200/*
201 * The workitem queue.
202 *
203 * It is useful to delay writes of file data and filesystem metadata
204 * for tens of seconds so that quickly created and deleted files need
205 * not waste disk bandwidth being created and removed. To realize this,
206 * we append vnodes to a "workitem" queue. When running with a soft
207 * updates implementation, most pending metadata dependencies should
208 * not wait for more than a few seconds. Thus, metadata written to block
209 * devices is delayed only about half the time that file data is delayed.
210 * Similarly, directory updates are more critical, so they are delayed only
211 * about a third of the time that file data is delayed. Thus, there are
212 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
213 * one each second (driven off the filesystem syncer process). The
214 * syncer_delayno variable indicates the next queue that is to be processed.
215 * Items that need to be processed soon are placed in this queue:
216 *
217 *	syncer_workitem_pending[syncer_delayno]
218 *
219 * A delay of fifteen seconds is done by placing the request fifteen
220 * entries later in the queue:
221 *
222 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
223 *
224 */
225static int syncer_delayno;
226static long syncer_mask;
227LIST_HEAD(synclist, bufobj);
228static struct synclist *syncer_workitem_pending[2];
229/*
230 * The sync_mtx protects:
231 *	bo->bo_synclist
232 *	sync_vnode_count
233 *	syncer_delayno
234 *	syncer_state
235 *	syncer_workitem_pending
236 *	syncer_worklist_len
237 *	rushjob
238 */
239static struct mtx sync_mtx;
240static struct cv sync_wakeup;
241
242#define SYNCER_MAXDELAY		32
243static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
244static int syncdelay = 30;		/* max time to delay syncing data */
245static int filedelay = 30;		/* time to delay syncing files */
246SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
247static int dirdelay = 29;		/* time to delay syncing directories */
248SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
249static int metadelay = 28;		/* time to delay syncing metadata */
250SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
251static int rushjob;		/* number of slots to run ASAP */
252static int stat_rush_requests;	/* number of times I/O speeded up */
253SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
254
255/*
256 * When shutting down the syncer, run it at four times normal speed.
257 */
258#define SYNCER_SHUTDOWN_SPEEDUP		4
259static int sync_vnode_count;
260static int syncer_worklist_len;
261static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
262    syncer_state;
263
264/*
265 * Number of vnodes we want to exist at any one time.  This is mostly used
266 * to size hash tables in vnode-related code.  It is normally not used in
267 * getnewvnode(), as wantfreevnodes is normally nonzero.
268 *
269 * XXX desiredvnodes is historical cruft and should not exist.
270 */
271int desiredvnodes;
272SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
273    &desiredvnodes, 0, "Maximum number of vnodes");
274SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
275    &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
276static int vnlru_nowhere;
277SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
278    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
279
280/*
281 * Macros to control when a vnode is freed and recycled.  All require
282 * the vnode interlock.
283 */
284#define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
285#define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
286#define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)
287
288
289/*
290 * Initialize the vnode management data structures.
291 */
292#ifndef	MAXVNODES_MAX
293#define	MAXVNODES_MAX	100000
294#endif
295static void
296vntblinit(void *dummy __unused)
297{
298
299	/*
300	 * Desiredvnodes is a function of the physical memory size and
301	 * the kernel's heap size.  Specifically, desiredvnodes scales
302	 * in proportion to the physical memory size until two fifths
303	 * of the kernel's heap size is consumed by vnodes and vm
304	 * objects.
305	 */
306	desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
307	    (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
308	if (desiredvnodes > MAXVNODES_MAX) {
309		if (bootverbose)
310			printf("Reducing kern.maxvnodes %d -> %d\n",
311			    desiredvnodes, MAXVNODES_MAX);
312		desiredvnodes = MAXVNODES_MAX;
313	}
314	wantfreevnodes = desiredvnodes / 4;
315	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
316	TAILQ_INIT(&vnode_free_list);
317	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
318	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
319	    NULL, NULL, UMA_ALIGN_PTR, 0);
320	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
321	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
322	/*
323	 * Initialize the filesystem syncer.
324	 */
325	syncer_workitem_pending[WI_MPSAFEQ] = hashinit(syncer_maxdelay, M_VNODE,
326	    &syncer_mask);
327	syncer_workitem_pending[WI_GIANTQ] = hashinit(syncer_maxdelay, M_VNODE,
328	    &syncer_mask);
329	syncer_maxdelay = syncer_mask + 1;
330	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
331	cv_init(&sync_wakeup, "syncer");
332}
333SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
334
335
336/*
337 * Mark a mount point as busy. Used to synchronize access and to delay
338 * unmounting. Note that mountlist_mtx is not released on failure.
339 */
340int
341vfs_busy(struct mount *mp, int flags)
342{
343
344	MPASS((flags & ~MBF_MASK) == 0);
345
346	MNT_ILOCK(mp);
347	MNT_REF(mp);
348	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
349		if (flags & MBF_NOWAIT) {
350			MNT_REL(mp);
351			MNT_IUNLOCK(mp);
352			return (ENOENT);
353		}
354		if (flags & MBF_MNTLSTLOCK)
355			mtx_unlock(&mountlist_mtx);
356		mp->mnt_kern_flag |= MNTK_MWAIT;
357		msleep(mp, MNT_MTX(mp), PVFS, "vfs_busy", 0);
358		MNT_REL(mp);
359		MNT_IUNLOCK(mp);
360		if (flags & MBF_MNTLSTLOCK)
361			mtx_lock(&mountlist_mtx);
362		return (ENOENT);
363	}
364	if (flags & MBF_MNTLSTLOCK)
365		mtx_unlock(&mountlist_mtx);
366	mp->mnt_lockref++;
367	MNT_IUNLOCK(mp);
368	return (0);
369}
370
371/*
372 * Free a busy filesystem.
373 */
374void
375vfs_unbusy(struct mount *mp)
376{
377
378	MNT_ILOCK(mp);
379	MNT_REL(mp);
380	mp->mnt_lockref--;
381	if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
382		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
383		mp->mnt_kern_flag &= ~MNTK_DRAINING;
384		wakeup(&mp->mnt_lockref);
385	}
386	MNT_IUNLOCK(mp);
387}
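
/*
 * Illustrative sketch, not compiled into this file: the usual pattern for
 * keeping a mount point busy across an operation.  The traversal mirrors
 * what vnlru_proc() does below; process_one_mount() is a hypothetical
 * helper used only for illustration.
 */
#if 0
static void
process_all_mounts(void)
{
	struct mount *mp, *nmp;

	mtx_lock(&mountlist_mtx);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		/* Skip mount points that are being unmounted. */
		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		/* vfs_busy() dropped mountlist_mtx for us on success. */
		process_one_mount(mp);
		mtx_lock(&mountlist_mtx);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	mtx_unlock(&mountlist_mtx);
}
#endif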
388
389/*
390 * Lookup a mount point by filesystem identifier.
391 */
392struct mount *
393vfs_getvfs(fsid_t *fsid)
394{
395	struct mount *mp;
396
397	mtx_lock(&mountlist_mtx);
398	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
399		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
400		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
401			vfs_ref(mp);
402			mtx_unlock(&mountlist_mtx);
403			return (mp);
404		}
405	}
406	mtx_unlock(&mountlist_mtx);
407	return ((struct mount *) 0);
408}
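
/*
 * Illustrative sketch, not compiled into this file: vfs_getvfs() returns the
 * mount with an extra reference taken via vfs_ref(), so the caller must drop
 * it with vfs_rel() when done, exactly as vfs_getnewfsid() does below.
 */
#if 0
static int
fsid_is_mounted(fsid_t *fsid)
{
	struct mount *mp;

	if ((mp = vfs_getvfs(fsid)) == NULL)
		return (0);
	/* ... inspect mp->mnt_stat here if needed ... */
	vfs_rel(mp);
	return (1);
}
#endif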
409
410/*
411 * Check if a user can access privileged mount options.
412 */
413int
414vfs_suser(struct mount *mp, struct thread *td)
415{
416	int error;
417
418	/*
419	 * If the thread is jailed, but this is not a jail-friendly file
420	 * system, deny immediately.
421	 */
422	if (jailed(td->td_ucred) && !(mp->mnt_vfc->vfc_flags & VFCF_JAIL))
423		return (EPERM);
424
425	/*
426	 * If the file system was mounted outside a jail and a jailed thread
427	 * tries to access it, deny immediately.
428	 */
429	if (!jailed(mp->mnt_cred) && jailed(td->td_ucred))
430		return (EPERM);
431
432	/*
433	 * If the file system was mounted inside a different jail than that of
434	 * the calling thread, deny immediately.
435	 */
436	if (jailed(mp->mnt_cred) && jailed(td->td_ucred) &&
437	    mp->mnt_cred->cr_prison != td->td_ucred->cr_prison) {
438		return (EPERM);
439	}
440
441	if ((mp->mnt_flag & MNT_USER) == 0 ||
442	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
443		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
444			return (error);
445	}
446	return (0);
447}
448
449/*
450 * Get a new unique fsid.  Try to make its val[0] unique, since this value
451 * will be used to create fake device numbers for stat().  Also try (but
452 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
453 * support 16-bit device numbers.  We end up with unique val[0]'s for the
454 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
455 *
456 * Keep in mind that several mounts may be running in parallel.  Starting
457 * the search one past where the previous search terminated is both a
458 * micro-optimization and a defense against returning the same fsid to
459 * different mounts.
460 */
461void
462vfs_getnewfsid(struct mount *mp)
463{
464	static u_int16_t mntid_base;
465	struct mount *nmp;
466	fsid_t tfsid;
467	int mtype;
468
469	mtx_lock(&mntid_mtx);
470	mtype = mp->mnt_vfc->vfc_typenum;
471	tfsid.val[1] = mtype;
472	mtype = (mtype & 0xFF) << 24;
473	for (;;) {
474		tfsid.val[0] = makedev(255,
475		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
476		mntid_base++;
477		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
478			break;
479		vfs_rel(nmp);
480	}
481	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
482	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
483	mtx_unlock(&mntid_mtx);
484}
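
/*
 * Worked example of the packing above (illustrative only): with
 * vfc_typenum == 5 and mntid_base == 0x1234 the loop computes
 *
 *	mtype = (5 & 0xFF) << 24                        = 0x05000000
 *	arg   = mtype | ((0x1234 & 0xFF00) << 8)
 *		      | (0x1234 & 0xFF)                 = 0x05120034
 *	val[0] = makedev(255, 0x05120034)
 *
 * Only the low byte of mntid_base reaches the low byte of the minor number,
 * which is why val[0] stays unique mod 2^16 just for the first 2^8 calls,
 * as noted in the comment above.
 */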
485
486/*
487 * Knob to control the precision of file timestamps:
488 *
489 *   0 = seconds only; nanoseconds zeroed.
490 *   1 = seconds and nanoseconds, accurate within 1/HZ.
491 *   2 = seconds and nanoseconds, truncated to microseconds.
492 * >=3 = seconds and nanoseconds, maximum precision.
493 */
494enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
495
496static int timestamp_precision = TSP_SEC;
497SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
498    &timestamp_precision, 0, "");
499
500/*
501 * Get a current timestamp.
502 */
503void
504vfs_timestamp(struct timespec *tsp)
505{
506	struct timeval tv;
507
508	switch (timestamp_precision) {
509	case TSP_SEC:
510		tsp->tv_sec = time_second;
511		tsp->tv_nsec = 0;
512		break;
513	case TSP_HZ:
514		getnanotime(tsp);
515		break;
516	case TSP_USEC:
517		microtime(&tv);
518		TIMEVAL_TO_TIMESPEC(&tv, tsp);
519		break;
520	case TSP_NSEC:
521	default:
522		nanotime(tsp);
523		break;
524	}
525}
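
/*
 * Illustrative sketch, not compiled into this file: a filesystem that wants
 * its timestamps to honor the vfs.timestamp_precision knob calls
 * vfs_timestamp() rather than nanotime()/getnanotime() directly.  The
 * "foonode" structure and its fields are hypothetical.
 */
#if 0
static void
foofs_mark_modified(struct foonode *ip)
{
	struct timespec ts;

	vfs_timestamp(&ts);
	ip->i_mtime = ts;
	ip->i_ctime = ts;
}
#endif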
526
527/*
528 * Set vnode attributes to VNOVAL
529 */
530void
531vattr_null(struct vattr *vap)
532{
533
534	vap->va_type = VNON;
535	vap->va_size = VNOVAL;
536	vap->va_bytes = VNOVAL;
537	vap->va_mode = VNOVAL;
538	vap->va_nlink = VNOVAL;
539	vap->va_uid = VNOVAL;
540	vap->va_gid = VNOVAL;
541	vap->va_fsid = VNOVAL;
542	vap->va_fileid = VNOVAL;
543	vap->va_blocksize = VNOVAL;
544	vap->va_rdev = VNOVAL;
545	vap->va_atime.tv_sec = VNOVAL;
546	vap->va_atime.tv_nsec = VNOVAL;
547	vap->va_mtime.tv_sec = VNOVAL;
548	vap->va_mtime.tv_nsec = VNOVAL;
549	vap->va_ctime.tv_sec = VNOVAL;
550	vap->va_ctime.tv_nsec = VNOVAL;
551	vap->va_birthtime.tv_sec = VNOVAL;
552	vap->va_birthtime.tv_nsec = VNOVAL;
553	vap->va_flags = VNOVAL;
554	vap->va_gen = VNOVAL;
555	vap->va_vaflags = 0;
556}
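
/*
 * Illustrative sketch, not compiled into this file: callers initialize a
 * struct vattr with vattr_null() (or the VATTR_NULL() wrapper) and then set
 * only the fields they mean to change, so filesystems can treat every field
 * still holding VNOVAL as "leave alone".  The three-argument VOP_SETATTR()
 * of this era is assumed here.
 */
#if 0
static int
foofs_set_size(struct vnode *vp, struct ucred *cred, off_t newsize)
{
	struct vattr va;

	vattr_null(&va);
	va.va_size = newsize;
	/* All fields other than va_size remain VNOVAL. */
	return (VOP_SETATTR(vp, &va, cred));
}
#endif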
557
558/*
559 * This routine is called when we have too many vnodes.  It attempts
560 * to free <count> vnodes and will potentially free vnodes that still
561 * have VM backing store (VM backing store is typically the cause
562 * of a vnode blowout so we want to do this).  Therefore, this operation
563 * is not considered cheap.
564 *
565 * A number of conditions may prevent a vnode from being reclaimed.
566 * The buffer cache may have references on the vnode, a directory
567 * vnode may still have references due to the namei cache representing
568 * underlying files, or the vnode may be in active use.   It is not
569 * desirable to reuse such vnodes.  These conditions may cause the
570 * number of vnodes to reach some minimum value regardless of what
571 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
572 */
573static int
574vlrureclaim(struct mount *mp)
575{
576	struct vnode *vp;
577	int done;
578	int trigger;
579	int usevnodes;
580	int count;
581
582	/*
583	 * Calculate the trigger point, don't allow user
584	 * screwups to blow us up.   This prevents us from
585	 * recycling vnodes with lots of resident pages.  We
586	 * aren't trying to free memory, we are trying to
587	 * free vnodes.
588	 */
589	usevnodes = desiredvnodes;
590	if (usevnodes <= 0)
591		usevnodes = 1;
592	trigger = cnt.v_page_count * 2 / usevnodes;
593	done = 0;
594	vn_start_write(NULL, &mp, V_WAIT);
595	MNT_ILOCK(mp);
596	count = mp->mnt_nvnodelistsize / 10 + 1;
597	while (count != 0) {
598		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
599		while (vp != NULL && vp->v_type == VMARKER)
600			vp = TAILQ_NEXT(vp, v_nmntvnodes);
601		if (vp == NULL)
602			break;
603		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
604		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
605		--count;
606		if (!VI_TRYLOCK(vp))
607			goto next_iter;
608		/*
609		 * If it's been deconstructed already, it's still
610		 * referenced, or it exceeds the trigger, skip it.
611		 */
612		if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) ||
613		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
614		    vp->v_object->resident_page_count > trigger)) {
615			VI_UNLOCK(vp);
616			goto next_iter;
617		}
618		MNT_IUNLOCK(mp);
619		vholdl(vp);
620		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
621			vdrop(vp);
622			goto next_iter_mntunlocked;
623		}
624		VI_LOCK(vp);
625		/*
626		 * v_usecount may have been bumped after VOP_LOCK() dropped
627		 * the vnode interlock and before it was locked again.
628		 *
629		 * It is not necessary to recheck VI_DOOMED because it can
630		 * only be set by another thread that holds both the vnode
631		 * lock and vnode interlock.  If another thread has the
632		 * vnode lock before we get to VOP_LOCK() and obtains the
633		 * vnode interlock after VOP_LOCK() drops the vnode
634		 * interlock, the other thread will be unable to drop the
635		 * vnode lock before our VOP_LOCK() call fails.
636		 */
637		if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) ||
638		    (vp->v_object != NULL &&
639		    vp->v_object->resident_page_count > trigger)) {
640			VOP_UNLOCK(vp, LK_INTERLOCK);
641			goto next_iter_mntunlocked;
642		}
643		KASSERT((vp->v_iflag & VI_DOOMED) == 0,
644		    ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
645		vgonel(vp);
646		VOP_UNLOCK(vp, 0);
647		vdropl(vp);
648		done++;
649next_iter_mntunlocked:
650		if ((count % 256) != 0)
651			goto relock_mnt;
652		goto yield;
653next_iter:
654		if ((count % 256) != 0)
655			continue;
656		MNT_IUNLOCK(mp);
657yield:
658		uio_yield();
659relock_mnt:
660		MNT_ILOCK(mp);
661	}
662	MNT_IUNLOCK(mp);
663	vn_finished_write(mp);
664	return done;
665}
666
667/*
668 * Attempt to keep the free list at wantfreevnodes length.
669 */
670static void
671vnlru_free(int count)
672{
673	struct vnode *vp;
674	int vfslocked;
675
676	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
677	for (; count > 0; count--) {
678		vp = TAILQ_FIRST(&vnode_free_list);
679		/*
680		 * The list can be modified while the free_list_mtx
681		 * has been dropped and vp could be NULL here.
682		 */
683		if (!vp)
684			break;
685		VNASSERT(vp->v_op != NULL, vp,
686		    ("vnlru_free: vnode already reclaimed."));
687		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
688		/*
689		 * Don't recycle if we can't get the interlock.
690		 */
691		if (!VI_TRYLOCK(vp)) {
692			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
693			continue;
694		}
695		VNASSERT(VCANRECYCLE(vp), vp,
696		    ("vp inconsistent on freelist"));
697		freevnodes--;
698		vp->v_iflag &= ~VI_FREE;
699		vholdl(vp);
700		mtx_unlock(&vnode_free_list_mtx);
701		VI_UNLOCK(vp);
702		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
703		vtryrecycle(vp);
704		VFS_UNLOCK_GIANT(vfslocked);
705		/*
706		 * If the recycle succeeded, this vdrop will actually free
707		 * the vnode.  If not it will simply place it back on
708		 * the free list.
709		 */
710		vdrop(vp);
711		mtx_lock(&vnode_free_list_mtx);
712	}
713}
714/*
715 * Attempt to recycle vnodes in a context that is always safe to block.
716 * Calling vlrureclaim() from the bowels of filesystem code has some
717 * interesting deadlock problems.
718 */
719static struct proc *vnlruproc;
720static int vnlruproc_sig;
721
722static void
723vnlru_proc(void)
724{
725	struct mount *mp, *nmp;
726	int done;
727	struct proc *p = vnlruproc;
728
729	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
730	    SHUTDOWN_PRI_FIRST);
731
732	mtx_lock(&Giant);
733
734	for (;;) {
735		kproc_suspend_check(p);
736		mtx_lock(&vnode_free_list_mtx);
737		if (freevnodes > wantfreevnodes)
738			vnlru_free(freevnodes - wantfreevnodes);
739		if (numvnodes <= desiredvnodes * 9 / 10) {
740			vnlruproc_sig = 0;
741			wakeup(&vnlruproc_sig);
742			msleep(vnlruproc, &vnode_free_list_mtx,
743			    PVFS|PDROP, "vlruwt", hz);
744			continue;
745		}
746		mtx_unlock(&vnode_free_list_mtx);
747		done = 0;
748		mtx_lock(&mountlist_mtx);
749		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
750			int vfsunlocked;
751			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
752				nmp = TAILQ_NEXT(mp, mnt_list);
753				continue;
754			}
755			if (!VFS_NEEDSGIANT(mp)) {
756				mtx_unlock(&Giant);
757				vfsunlocked = 1;
758			} else
759				vfsunlocked = 0;
760			done += vlrureclaim(mp);
761			if (vfsunlocked)
762				mtx_lock(&Giant);
763			mtx_lock(&mountlist_mtx);
764			nmp = TAILQ_NEXT(mp, mnt_list);
765			vfs_unbusy(mp);
766		}
767		mtx_unlock(&mountlist_mtx);
768		if (done == 0) {
769			EVENTHANDLER_INVOKE(vfs_lowvnodes, desiredvnodes / 10);
770#if 0
771			/* These messages are temporary debugging aids */
772			if (vnlru_nowhere < 5)
773				printf("vnlru process getting nowhere..\n");
774			else if (vnlru_nowhere == 5)
775				printf("vnlru process messages stopped.\n");
776#endif
777			vnlru_nowhere++;
778			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
779		} else
780			uio_yield();
781	}
782}
783
784static struct kproc_desc vnlru_kp = {
785	"vnlru",
786	vnlru_proc,
787	&vnlruproc
788};
789SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
790    &vnlru_kp);
791
792/*
793 * Routines having to do with the management of the vnode table.
794 */
795
796static void
797vdestroy(struct vnode *vp)
798{
799	struct bufobj *bo;
800
801	CTR1(KTR_VFS, "vdestroy vp %p", vp);
802	mtx_lock(&vnode_free_list_mtx);
803	numvnodes--;
804	mtx_unlock(&vnode_free_list_mtx);
805	bo = &vp->v_bufobj;
806	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
807	    ("cleaned vnode still on the free list."));
808	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
809	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
810	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
811	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
812	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
813	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
814	VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL"));
815	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
816	VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL"));
817	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
818	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
819	VI_UNLOCK(vp);
820#ifdef MAC
821	mac_vnode_destroy(vp);
822#endif
823	if (vp->v_pollinfo != NULL)
824		destroy_vpollinfo(vp->v_pollinfo);
825#ifdef INVARIANTS
826	/* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */
827	vp->v_op = NULL;
828#endif
829	lockdestroy(vp->v_vnlock);
830	mtx_destroy(&vp->v_interlock);
831	mtx_destroy(BO_MTX(bo));
832	uma_zfree(vnode_zone, vp);
833}
834
835/*
836 * Try to recycle a freed vnode.  We abort if anyone picks up a reference
837 * before we actually vgone().  This function must be called with the vnode
838 * held to prevent the vnode from being returned to the free list midway
839 * through vgone().
840 */
841static int
842vtryrecycle(struct vnode *vp)
843{
844	struct mount *vnmp;
845
846	CTR1(KTR_VFS, "vtryrecycle: trying vp %p", vp);
847	VNASSERT(vp->v_holdcnt, vp,
848	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
849	/*
850	 * This vnode may be found and locked via some other list; if so we
851	 * can't recycle it yet.
852	 */
853	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0)
854		return (EWOULDBLOCK);
855	/*
856	 * Don't recycle if its filesystem is being suspended.
857	 */
858	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
859		VOP_UNLOCK(vp, 0);
860		return (EBUSY);
861	}
862	/*
863	 * If we got this far, we need to acquire the interlock and see if
864	 * anyone picked up this vnode from another list.  If not, we will
865	 * mark it with DOOMED via vgonel() so that anyone who does find it
866	 * will skip over it.
867	 */
868	VI_LOCK(vp);
869	if (vp->v_usecount) {
870		VOP_UNLOCK(vp, LK_INTERLOCK);
871		vn_finished_write(vnmp);
872		return (EBUSY);
873	}
874	if ((vp->v_iflag & VI_DOOMED) == 0)
875		vgonel(vp);
876	VOP_UNLOCK(vp, LK_INTERLOCK);
877	vn_finished_write(vnmp);
878	CTR1(KTR_VFS, "vtryrecycle: recycled vp %p", vp);
879	return (0);
880}
881
882/*
883 * Return the next vnode from the free list.
884 */
885int
886getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
887    struct vnode **vpp)
888{
889	struct vnode *vp = NULL;
890	struct bufobj *bo;
891
892	mtx_lock(&vnode_free_list_mtx);
893	/*
894	 * Lend our context to reclaim vnodes if they've exceeded the max.
895	 */
896	if (freevnodes > wantfreevnodes)
897		vnlru_free(1);
898	/*
899	 * Wait for available vnodes.
900	 */
901	if (numvnodes > desiredvnodes) {
902		if (mp != NULL && (mp->mnt_kern_flag & MNTK_SUSPEND)) {
903			/*
904			 * The file system is being suspended; we cannot risk a
905			 * deadlock here, so allocate a new vnode anyway.
906			 */
907			if (freevnodes > wantfreevnodes)
908				vnlru_free(freevnodes - wantfreevnodes);
909			goto alloc;
910		}
911		if (vnlruproc_sig == 0) {
912			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
913			wakeup(vnlruproc);
914		}
915		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
916		    "vlruwk", hz);
917#if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
918		if (numvnodes > desiredvnodes) {
919			mtx_unlock(&vnode_free_list_mtx);
920			return (ENFILE);
921		}
922#endif
923	}
924alloc:
925	numvnodes++;
926	mtx_unlock(&vnode_free_list_mtx);
927	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
928	/*
929	 * Setup locks.
930	 */
931	vp->v_vnlock = &vp->v_lock;
932	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
933	/*
934	 * By default, don't allow shared locks unless filesystems
935	 * opt-in.
936	 */
937	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE);
938	/*
939	 * Initialize bufobj.
940	 */
941	bo = &vp->v_bufobj;
942	bo->__bo_vnode = vp;
943	mtx_init(BO_MTX(bo), "bufobj interlock", NULL, MTX_DEF);
944	bo->bo_ops = &buf_ops_bio;
945	bo->bo_private = vp;
946	TAILQ_INIT(&bo->bo_clean.bv_hd);
947	TAILQ_INIT(&bo->bo_dirty.bv_hd);
948	/*
949	 * Initialize namecache.
950	 */
951	LIST_INIT(&vp->v_cache_src);
952	TAILQ_INIT(&vp->v_cache_dst);
953	/*
954	 * Finalize various vnode identity bits.
955	 */
956	vp->v_type = VNON;
957	vp->v_tag = tag;
958	vp->v_op = vops;
959	v_incr_usecount(vp);
960	vp->v_data = 0;
961#ifdef MAC
962	mac_vnode_init(vp);
963	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
964		mac_vnode_associate_singlelabel(mp, vp);
965	else if (mp == NULL && vops != &dead_vnodeops)
966		printf("NULL mp in getnewvnode()\n");
967#endif
968	if (mp != NULL) {
969		bo->bo_bsize = mp->mnt_stat.f_iosize;
970		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
971			vp->v_vflag |= VV_NOKNOTE;
972	}
973
974	CTR2(KTR_VFS, "getnewvnode: mp %p vp %p", mp, vp);
975	*vpp = vp;
976	return (0);
977}
978
979/*
980 * Delete from old mount point vnode list, if on one.
981 */
982static void
983delmntque(struct vnode *vp)
984{
985	struct mount *mp;
986
987	mp = vp->v_mount;
988	if (mp == NULL)
989		return;
990	MNT_ILOCK(mp);
991	vp->v_mount = NULL;
992	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
993		("bad mount point vnode list size"));
994	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
995	mp->mnt_nvnodelistsize--;
996	MNT_REL(mp);
997	MNT_IUNLOCK(mp);
998}
999
1000static void
1001insmntque_stddtr(struct vnode *vp, void *dtr_arg)
1002{
1003
1004	vp->v_data = NULL;
1005	vp->v_op = &dead_vnodeops;
1006	/* XXX a non-MPSAFE fs may still call insmntque() with the vnode
1007	   unlocked */
1008	if (!VOP_ISLOCKED(vp))
1009		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1010	vgone(vp);
1011	vput(vp);
1012}
1013
1014/*
1015 * Insert into list of vnodes for the new mount point, if available.
1016 */
1017int
1018insmntque1(struct vnode *vp, struct mount *mp,
1019	void (*dtr)(struct vnode *, void *), void *dtr_arg)
1020{
1021	int locked;
1022
1023	KASSERT(vp->v_mount == NULL,
1024		("insmntque: vnode already on per mount vnode list"));
1025	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
1026#ifdef DEBUG_VFS_LOCKS
1027	if (!VFS_NEEDSGIANT(mp))
1028		ASSERT_VOP_ELOCKED(vp,
1029		    "insmntque: mp-safe fs and non-locked vp");
1030#endif
1031	MNT_ILOCK(mp);
1032	if ((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
1033	    ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
1034	     mp->mnt_nvnodelistsize == 0)) {
1035		locked = VOP_ISLOCKED(vp);
1036		if (!locked || (locked == LK_EXCLUSIVE &&
1037		     (vp->v_vflag & VV_FORCEINSMQ) == 0)) {
1038			MNT_IUNLOCK(mp);
1039			if (dtr != NULL)
1040				dtr(vp, dtr_arg);
1041			return (EBUSY);
1042		}
1043	}
1044	vp->v_mount = mp;
1045	MNT_REF(mp);
1046	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1047	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
1048		("neg mount point vnode list size"));
1049	mp->mnt_nvnodelistsize++;
1050	MNT_IUNLOCK(mp);
1051	return (0);
1052}
1053
1054int
1055insmntque(struct vnode *vp, struct mount *mp)
1056{
1057
1058	return (insmntque1(vp, mp, insmntque_stddtr, NULL));
1059}
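
/*
 * Illustrative sketch, not compiled into this file: the usual allocation
 * sequence in a filesystem's VFS_VGET()/create path.  getnewvnode() returns
 * a referenced but unlocked vnode; insmntque() expects it exclusively locked
 * and, on failure, destroys it through the standard destructor above.  The
 * "foofs" vop vector is hypothetical.
 */
#if 0
static int
foofs_alloc_vnode(struct mount *mp, struct vnode **vpp)
{
	struct vnode *vp;
	int error;

	error = getnewvnode("foofs", mp, &foofs_vnodeops, &vp);
	if (error != 0)
		return (error);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = insmntque(vp, mp);
	if (error != 0) {
		/* insmntque() already called vgone()/vput() for us. */
		*vpp = NULL;
		return (error);
	}
	/* ... attach per-filesystem data to vp->v_data here ... */
	*vpp = vp;
	return (0);
}
#endif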
1060
1061/*
1062 * Flush out and invalidate all buffers associated with a bufobj
1063 * Called with the underlying object locked.
1064 */
1065int
1066bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
1067{
1068	int error;
1069
1070	BO_LOCK(bo);
1071	if (flags & V_SAVE) {
1072		error = bufobj_wwait(bo, slpflag, slptimeo);
1073		if (error) {
1074			BO_UNLOCK(bo);
1075			return (error);
1076		}
1077		if (bo->bo_dirty.bv_cnt > 0) {
1078			BO_UNLOCK(bo);
1079			if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
1080				return (error);
1081			/*
1082			 * XXX We could save a lock/unlock if this was only
1083			 * enabled under INVARIANTS
1084			 */
1085			BO_LOCK(bo);
1086			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
1087				panic("vinvalbuf: dirty bufs");
1088		}
1089	}
1090	/*
1091	 * If you alter this loop please notice that interlock is dropped and
1092	 * reacquired in flushbuflist.  Special care is needed to ensure that
1093	 * no race conditions occur from this.
1094	 */
1095	do {
1096		error = flushbuflist(&bo->bo_clean,
1097		    flags, bo, slpflag, slptimeo);
1098		if (error == 0)
1099			error = flushbuflist(&bo->bo_dirty,
1100			    flags, bo, slpflag, slptimeo);
1101		if (error != 0 && error != EAGAIN) {
1102			BO_UNLOCK(bo);
1103			return (error);
1104		}
1105	} while (error != 0);
1106
1107	/*
1108	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
1109	 * have write I/O in-progress but if there is a VM object then the
1110	 * VM object can also have read-I/O in-progress.
1111	 */
1112	do {
1113		bufobj_wwait(bo, 0, 0);
1114		BO_UNLOCK(bo);
1115		if (bo->bo_object != NULL) {
1116			VM_OBJECT_LOCK(bo->bo_object);
1117			vm_object_pip_wait(bo->bo_object, "bovlbx");
1118			VM_OBJECT_UNLOCK(bo->bo_object);
1119		}
1120		BO_LOCK(bo);
1121	} while (bo->bo_numoutput > 0);
1122	BO_UNLOCK(bo);
1123
1124	/*
1125	 * Destroy the copy in the VM cache, too.
1126	 */
1127	if (bo->bo_object != NULL) {
1128		VM_OBJECT_LOCK(bo->bo_object);
1129		vm_object_page_remove(bo->bo_object, 0, 0,
1130			(flags & V_SAVE) ? TRUE : FALSE);
1131		VM_OBJECT_UNLOCK(bo->bo_object);
1132	}
1133
1134#ifdef INVARIANTS
1135	BO_LOCK(bo);
1136	if ((flags & (V_ALT | V_NORMAL)) == 0 &&
1137	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
1138		panic("vinvalbuf: flush failed");
1139	BO_UNLOCK(bo);
1140#endif
1141	return (0);
1142}
1143
1144/*
1145 * Flush out and invalidate all buffers associated with a vnode.
1146 * Called with the underlying object locked.
1147 */
1148int
1149vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
1150{
1151
1152	CTR2(KTR_VFS, "vinvalbuf vp %p flags %d", vp, flags);
1153	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
1154	return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
1155}
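
/*
 * Illustrative sketch, not compiled into this file: a typical caller flushes
 * (and, with V_SAVE, first writes back) everything cached for a vnode before
 * reclaiming or revoking it.  The vnode lock must be held, as asserted above;
 * foofs_flush_vnode() is a hypothetical wrapper.
 */
#if 0
static int
foofs_flush_vnode(struct vnode *vp, int save)
{

	ASSERT_VOP_LOCKED(vp, "foofs_flush_vnode");
	return (vinvalbuf(vp, save ? V_SAVE : 0, 0, 0));
}
#endif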
1156
1157/*
1158 * Flush out buffers on the specified list.
1159 *
1160 */
1161static int
1162flushbuflist( struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
1163    int slptimeo)
1164{
1165	struct buf *bp, *nbp;
1166	int retval, error;
1167	daddr_t lblkno;
1168	b_xflags_t xflags;
1169
1170	ASSERT_BO_LOCKED(bo);
1171
1172	retval = 0;
1173	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
1174		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1175		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1176			continue;
1177		}
1178		lblkno = 0;
1179		xflags = 0;
1180		if (nbp != NULL) {
1181			lblkno = nbp->b_lblkno;
1182			xflags = nbp->b_xflags &
1183				(BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN);
1184		}
1185		retval = EAGAIN;
1186		error = BUF_TIMELOCK(bp,
1187		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo),
1188		    "flushbuf", slpflag, slptimeo);
1189		if (error) {
1190			BO_LOCK(bo);
1191			return (error != ENOLCK ? error : EAGAIN);
1192		}
1193		KASSERT(bp->b_bufobj == bo,
1194		    ("bp %p wrong b_bufobj %p should be %p",
1195		    bp, bp->b_bufobj, bo));
1196		if (bp->b_bufobj != bo) {	/* XXX: necessary ? */
1197			BUF_UNLOCK(bp);
1198			BO_LOCK(bo);
1199			return (EAGAIN);
1200		}
1201		/*
1202		 * XXX Since there are no node locks for NFS, I
1203		 * believe there is a slight chance that a delayed
1204		 * write will occur while sleeping just above, so
1205		 * check for it.
1206		 */
1207		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1208		    (flags & V_SAVE)) {
1209			bremfree(bp);
1210			bp->b_flags |= B_ASYNC;
1211			bwrite(bp);
1212			BO_LOCK(bo);
1213			return (EAGAIN);	/* XXX: why not loop ? */
1214		}
1215		bremfree(bp);
1216		bp->b_flags |= (B_INVAL | B_RELBUF);
1217		bp->b_flags &= ~B_ASYNC;
1218		brelse(bp);
1219		BO_LOCK(bo);
1220		if (nbp != NULL &&
1221		    (nbp->b_bufobj != bo ||
1222		     nbp->b_lblkno != lblkno ||
1223		     (nbp->b_xflags &
1224		      (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags))
1225			break;			/* nbp invalid */
1226	}
1227	return (retval);
1228}
1229
1230/*
1231 * Truncate a file's buffers and pages to a specified length.  This
1232 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1233 * sync activity.
1234 */
1235int
1236vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td,
1237    off_t length, int blksize)
1238{
1239	struct buf *bp, *nbp;
1240	int anyfreed;
1241	int trunclbn;
1242	struct bufobj *bo;
1243
1244	CTR2(KTR_VFS, "vtruncbuf vp %p length %jd", vp, length);
1245	/*
1246	 * Round up to the *next* lbn.
1247	 */
1248	trunclbn = (length + blksize - 1) / blksize;
1249
1250	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1251restart:
1252	bo = &vp->v_bufobj;
1253	BO_LOCK(bo);
1254	anyfreed = 1;
1255	for (;anyfreed;) {
1256		anyfreed = 0;
1257		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
1258			if (bp->b_lblkno < trunclbn)
1259				continue;
1260			if (BUF_LOCK(bp,
1261			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1262			    BO_MTX(bo)) == ENOLCK)
1263				goto restart;
1264
1265			bremfree(bp);
1266			bp->b_flags |= (B_INVAL | B_RELBUF);
1267			bp->b_flags &= ~B_ASYNC;
1268			brelse(bp);
1269			anyfreed = 1;
1270
1271			if (nbp != NULL &&
1272			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1273			    (nbp->b_vp != vp) ||
1274			    (nbp->b_flags & B_DELWRI))) {
1275				goto restart;
1276			}
1277			BO_LOCK(bo);
1278		}
1279
1280		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1281			if (bp->b_lblkno < trunclbn)
1282				continue;
1283			if (BUF_LOCK(bp,
1284			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1285			    BO_MTX(bo)) == ENOLCK)
1286				goto restart;
1287			bremfree(bp);
1288			bp->b_flags |= (B_INVAL | B_RELBUF);
1289			bp->b_flags &= ~B_ASYNC;
1290			brelse(bp);
1291			anyfreed = 1;
1292			if (nbp != NULL &&
1293			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1294			    (nbp->b_vp != vp) ||
1295			    (nbp->b_flags & B_DELWRI) == 0)) {
1296				goto restart;
1297			}
1298			BO_LOCK(bo);
1299		}
1300	}
1301
1302	if (length > 0) {
1303restartsync:
1304		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1305			if (bp->b_lblkno > 0)
1306				continue;
1307			/*
1308			 * Since we hold the vnode lock this should only
1309			 * fail if we're racing with the buf daemon.
1310			 */
1311			if (BUF_LOCK(bp,
1312			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1313			    BO_MTX(bo)) == ENOLCK) {
1314				goto restart;
1315			}
1316			VNASSERT((bp->b_flags & B_DELWRI), vp,
1317			    ("buf(%p) on dirty queue without DELWRI", bp));
1318
1319			bremfree(bp);
1320			bawrite(bp);
1321			BO_LOCK(bo);
1322			goto restartsync;
1323		}
1324	}
1325
1326	bufobj_wwait(bo, 0, 0);
1327	BO_UNLOCK(bo);
1328	vnode_pager_setsize(vp, length);
1329
1330	return (0);
1331}
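
/*
 * Illustrative sketch, not compiled into this file: a filesystem truncate
 * path discards buffers beyond the new end of file with vtruncbuf(), which
 * also notifies the VM layer via vnode_pager_setsize(), before releasing the
 * on-disk blocks itself.  "foofs_blksize" is a hypothetical constant.
 */
#if 0
static int
foofs_truncate(struct vnode *vp, struct ucred *cred, struct thread *td,
    off_t length)
{
	int error;

	error = vtruncbuf(vp, cred, td, length, foofs_blksize);
	if (error != 0)
		return (error);
	/* ... free the filesystem blocks past "length" here ... */
	return (0);
}
#endif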
1332
1333/*
1334 * buf_splay() - splay tree core for the clean/dirty list of buffers in
1335 * 		 a vnode.
1336 *
1337 *	NOTE: We have to deal with the special case of a background bitmap
1338 *	buffer, a situation where two buffers will have the same logical
1339 *	block offset.  We want (1) only the foreground buffer to be accessed
1340 *	in a lookup and (2) must differentiate between the foreground and
1341 *	background buffer in the splay tree algorithm because the splay
1342 *	tree cannot normally handle multiple entities with the same 'index'.
1343 *	We accomplish this by adding differentiating flags to the splay tree's
1344 *	numerical domain.
1345 */
1346static
1347struct buf *
1348buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
1349{
1350	struct buf dummy;
1351	struct buf *lefttreemax, *righttreemin, *y;
1352
1353	if (root == NULL)
1354		return (NULL);
1355	lefttreemax = righttreemin = &dummy;
1356	for (;;) {
1357		if (lblkno < root->b_lblkno ||
1358		    (lblkno == root->b_lblkno &&
1359		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1360			if ((y = root->b_left) == NULL)
1361				break;
1362			if (lblkno < y->b_lblkno) {
1363				/* Rotate right. */
1364				root->b_left = y->b_right;
1365				y->b_right = root;
1366				root = y;
1367				if ((y = root->b_left) == NULL)
1368					break;
1369			}
1370			/* Link into the new root's right tree. */
1371			righttreemin->b_left = root;
1372			righttreemin = root;
1373		} else if (lblkno > root->b_lblkno ||
1374		    (lblkno == root->b_lblkno &&
1375		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
1376			if ((y = root->b_right) == NULL)
1377				break;
1378			if (lblkno > y->b_lblkno) {
1379				/* Rotate left. */
1380				root->b_right = y->b_left;
1381				y->b_left = root;
1382				root = y;
1383				if ((y = root->b_right) == NULL)
1384					break;
1385			}
1386			/* Link into the new root's left tree. */
1387			lefttreemax->b_right = root;
1388			lefttreemax = root;
1389		} else {
1390			break;
1391		}
1392		root = y;
1393	}
1394	/* Assemble the new root. */
1395	lefttreemax->b_right = root->b_left;
1396	righttreemin->b_left = root->b_right;
1397	root->b_left = dummy.b_right;
1398	root->b_right = dummy.b_left;
1399	return (root);
1400}
1401
1402static void
1403buf_vlist_remove(struct buf *bp)
1404{
1405	struct buf *root;
1406	struct bufv *bv;
1407
1408	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1409	ASSERT_BO_LOCKED(bp->b_bufobj);
1410	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
1411	    (BX_VNDIRTY|BX_VNCLEAN),
1412	    ("buf_vlist_remove: Buf %p is on two lists", bp));
1413	if (bp->b_xflags & BX_VNDIRTY)
1414		bv = &bp->b_bufobj->bo_dirty;
1415	else
1416		bv = &bp->b_bufobj->bo_clean;
1417	if (bp != bv->bv_root) {
1418		root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
1419		KASSERT(root == bp, ("splay lookup failed in remove"));
1420	}
1421	if (bp->b_left == NULL) {
1422		root = bp->b_right;
1423	} else {
1424		root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
1425		root->b_right = bp->b_right;
1426	}
1427	bv->bv_root = root;
1428	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
1429	bv->bv_cnt--;
1430	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1431}
1432
1433/*
1434 * Add the buffer to the sorted clean or dirty block list using a
1435 * splay tree algorithm.
1436 *
1437 * NOTE: xflags is passed as a constant, optimizing this inline function!
1438 */
1439static void
1440buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
1441{
1442	struct buf *root;
1443	struct bufv *bv;
1444
1445	ASSERT_BO_LOCKED(bo);
1446	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1447	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
1448	bp->b_xflags |= xflags;
1449	if (xflags & BX_VNDIRTY)
1450		bv = &bo->bo_dirty;
1451	else
1452		bv = &bo->bo_clean;
1453
1454	root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
1455	if (root == NULL) {
1456		bp->b_left = NULL;
1457		bp->b_right = NULL;
1458		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
1459	} else if (bp->b_lblkno < root->b_lblkno ||
1460	    (bp->b_lblkno == root->b_lblkno &&
1461	    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1462		bp->b_left = root->b_left;
1463		bp->b_right = root;
1464		root->b_left = NULL;
1465		TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
1466	} else {
1467		bp->b_right = root->b_right;
1468		bp->b_left = root;
1469		root->b_right = NULL;
1470		TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs);
1471	}
1472	bv->bv_cnt++;
1473	bv->bv_root = bp;
1474}
1475
1476/*
1477 * Lookup a buffer using the splay tree.  Note that we specifically avoid
1478 * shadow buffers used in background bitmap writes.
1479 *
1480 * This code isn't quite efficient as it could be because we are maintaining
1481 * two sorted lists and do not know which list the block resides in.
1482 *
1483 * During a "make buildworld" the desired buffer is found at one of
1484 * the roots more than 60% of the time.  Thus, checking both roots
1485 * before performing either splay eliminates unnecessary splays on the
1486 * first tree splayed.
1487 */
1488struct buf *
1489gbincore(struct bufobj *bo, daddr_t lblkno)
1490{
1491	struct buf *bp;
1492
1493	ASSERT_BO_LOCKED(bo);
1494	if ((bp = bo->bo_clean.bv_root) != NULL &&
1495	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1496		return (bp);
1497	if ((bp = bo->bo_dirty.bv_root) != NULL &&
1498	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1499		return (bp);
1500	if ((bp = bo->bo_clean.bv_root) != NULL) {
1501		bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp);
1502		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1503			return (bp);
1504	}
1505	if ((bp = bo->bo_dirty.bv_root) != NULL) {
1506		bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp);
1507		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1508			return (bp);
1509	}
1510	return (NULL);
1511}
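
/*
 * Illustrative sketch, not compiled into this file: gbincore() may only be
 * called with the bufobj lock held, and a non-NULL result is an unlocked
 * buffer, so a caller that intends to use it must lock the buffer before
 * dropping the bufobj lock.  This hypothetical wrapper only tests for
 * presence.
 */
#if 0
static int
foofs_block_is_cached(struct vnode *vp, daddr_t lblkno)
{
	struct bufobj *bo;
	struct buf *bp;

	bo = &vp->v_bufobj;
	BO_LOCK(bo);
	bp = gbincore(bo, lblkno);
	BO_UNLOCK(bo);
	return (bp != NULL);
}
#endif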
1512
1513/*
1514 * Associate a buffer with a vnode.
1515 */
1516void
1517bgetvp(struct vnode *vp, struct buf *bp)
1518{
1519	struct bufobj *bo;
1520
1521	bo = &vp->v_bufobj;
1522	ASSERT_BO_LOCKED(bo);
1523	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
1524
1525	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
1526	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
1527	    ("bgetvp: bp already attached! %p", bp));
1528
1529	vhold(vp);
1530	if (VFS_NEEDSGIANT(vp->v_mount) || bo->bo_flag & BO_NEEDSGIANT)
1531		bp->b_flags |= B_NEEDSGIANT;
1532	bp->b_vp = vp;
1533	bp->b_bufobj = bo;
1534	/*
1535	 * Insert onto list for new vnode.
1536	 */
1537	buf_vlist_add(bp, bo, BX_VNCLEAN);
1538}
1539
1540/*
1541 * Disassociate a buffer from a vnode.
1542 */
1543void
1544brelvp(struct buf *bp)
1545{
1546	struct bufobj *bo;
1547	struct vnode *vp;
1548
1549	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1550	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1551
1552	/*
1553	 * Delete from old vnode list, if on one.
1554	 */
1555	vp = bp->b_vp;		/* XXX */
1556	bo = bp->b_bufobj;
1557	BO_LOCK(bo);
1558	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1559		buf_vlist_remove(bp);
1560	else
1561		panic("brelvp: Buffer %p not on queue.", bp);
1562	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1563		bo->bo_flag &= ~BO_ONWORKLST;
1564		mtx_lock(&sync_mtx);
1565		LIST_REMOVE(bo, bo_synclist);
1566		syncer_worklist_len--;
1567		mtx_unlock(&sync_mtx);
1568	}
1569	bp->b_flags &= ~B_NEEDSGIANT;
1570	bp->b_vp = NULL;
1571	bp->b_bufobj = NULL;
1572	BO_UNLOCK(bo);
1573	vdrop(vp);
1574}
1575
1576/*
1577 * Add an item to the syncer work queue.
1578 */
1579static void
1580vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
1581{
1582	int queue, slot;
1583
1584	ASSERT_BO_LOCKED(bo);
1585
1586	mtx_lock(&sync_mtx);
1587	if (bo->bo_flag & BO_ONWORKLST)
1588		LIST_REMOVE(bo, bo_synclist);
1589	else {
1590		bo->bo_flag |= BO_ONWORKLST;
1591		syncer_worklist_len++;
1592	}
1593
1594	if (delay > syncer_maxdelay - 2)
1595		delay = syncer_maxdelay - 2;
1596	slot = (syncer_delayno + delay) & syncer_mask;
1597
1598	queue = VFS_NEEDSGIANT(bo->__bo_vnode->v_mount) ? WI_GIANTQ :
1599	    WI_MPSAFEQ;
1600	LIST_INSERT_HEAD(&syncer_workitem_pending[queue][slot], bo,
1601	    bo_synclist);
1602	mtx_unlock(&sync_mtx);
1603}
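
/*
 * Worked example (illustrative only): syncer_maxdelay is 32, so the hash
 * mask is 31.  Requesting a delay of 15 seconds while syncer_delayno is,
 * say, 27 places the bufobj in slot
 *
 *	(27 + 15) & 31 == 42 & 31 == 10
 *
 * which the once-a-second sweep reaches again fifteen slots later, after
 * wrapping past slot 31, as described in the workitem queue comment at the
 * top of this file.
 */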
1604
1605static int
1606sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
1607{
1608	int error, len;
1609
1610	mtx_lock(&sync_mtx);
1611	len = syncer_worklist_len - sync_vnode_count;
1612	mtx_unlock(&sync_mtx);
1613	error = SYSCTL_OUT(req, &len, sizeof(len));
1614	return (error);
1615}
1616
1617SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
1618    sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
1619
1620static struct proc *updateproc;
1621static void sched_sync(void);
1622static struct kproc_desc up_kp = {
1623	"syncer",
1624	sched_sync,
1625	&updateproc
1626};
1627SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
1628
1629static int
1630sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
1631{
1632	struct vnode *vp;
1633	struct mount *mp;
1634
1635	*bo = LIST_FIRST(slp);
1636	if (*bo == NULL)
1637		return (0);
1638	vp = (*bo)->__bo_vnode;	/* XXX */
1639	if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
1640		return (1);
1641	/*
1642	 * We use vhold in case the vnode does not
1643	 * successfully sync.  vhold prevents the vnode from
1644	 * going away when we unlock the sync_mtx so that
1645	 * we can acquire the vnode interlock.
1646	 */
1647	vholdl(vp);
1648	mtx_unlock(&sync_mtx);
1649	VI_UNLOCK(vp);
1650	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1651		vdrop(vp);
1652		mtx_lock(&sync_mtx);
1653		return (*bo == LIST_FIRST(slp));
1654	}
1655	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1656	(void) VOP_FSYNC(vp, MNT_LAZY, td);
1657	VOP_UNLOCK(vp, 0);
1658	vn_finished_write(mp);
1659	BO_LOCK(*bo);
1660	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
1661		/*
1662		 * Put us back on the worklist.  The worklist
1663		 * routine will remove us from our current
1664		 * position and then add us back in at a later
1665		 * position.
1666		 */
1667		vn_syncer_add_to_worklist(*bo, syncdelay);
1668	}
1669	BO_UNLOCK(*bo);
1670	vdrop(vp);
1671	mtx_lock(&sync_mtx);
1672	return (0);
1673}
1674
1675/*
1676 * System filesystem synchronizer daemon.
1677 */
1678static void
1679sched_sync(void)
1680{
1681	struct synclist *gnext, *next;
1682	struct synclist *gslp, *slp;
1683	struct bufobj *bo;
1684	long starttime;
1685	struct thread *td = curthread;
1686	int last_work_seen;
1687	int net_worklist_len;
1688	int syncer_final_iter;
1689	int first_printf;
1690	int error;
1691
1692	last_work_seen = 0;
1693	syncer_final_iter = 0;
1694	first_printf = 1;
1695	syncer_state = SYNCER_RUNNING;
1696	starttime = time_uptime;
1697	td->td_pflags |= TDP_NORUNNINGBUF;
1698
1699	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
1700	    SHUTDOWN_PRI_LAST);
1701
1702	mtx_lock(&sync_mtx);
1703	for (;;) {
1704		if (syncer_state == SYNCER_FINAL_DELAY &&
1705		    syncer_final_iter == 0) {
1706			mtx_unlock(&sync_mtx);
1707			kproc_suspend_check(td->td_proc);
1708			mtx_lock(&sync_mtx);
1709		}
1710		net_worklist_len = syncer_worklist_len - sync_vnode_count;
1711		if (syncer_state != SYNCER_RUNNING &&
1712		    starttime != time_uptime) {
1713			if (first_printf) {
1714				printf("\nSyncing disks, vnodes remaining...");
1715				first_printf = 0;
1716			}
1717			printf("%d ", net_worklist_len);
1718		}
1719		starttime = time_uptime;
1720
1721		/*
1722		 * Push files whose dirty time has expired.  Be careful
1723		 * of interrupt race on slp queue.
1724		 *
1725		 * Skip over empty worklist slots when shutting down.
1726		 */
1727		do {
1728			slp = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno];
1729			gslp = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno];
1730			syncer_delayno += 1;
1731			if (syncer_delayno == syncer_maxdelay)
1732				syncer_delayno = 0;
1733			next = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno];
1734			gnext = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno];
1735			/*
1736			 * If the worklist has wrapped since it was
1737			 * emptied of all but syncer vnodes,
1738			 * switch to the FINAL_DELAY state and run
1739			 * for one more second.
1740			 */
1741			if (syncer_state == SYNCER_SHUTTING_DOWN &&
1742			    net_worklist_len == 0 &&
1743			    last_work_seen == syncer_delayno) {
1744				syncer_state = SYNCER_FINAL_DELAY;
1745				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
1746			}
1747		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
1748		    LIST_EMPTY(gslp) && syncer_worklist_len > 0);
1749
1750		/*
1751		 * Keep track of the last time there was anything
1752		 * on the worklist other than syncer vnodes.
1753		 * Return to the SHUTTING_DOWN state if any
1754		 * new work appears.
1755		 */
1756		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
1757			last_work_seen = syncer_delayno;
1758		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
1759			syncer_state = SYNCER_SHUTTING_DOWN;
1760		while (!LIST_EMPTY(slp)) {
1761			error = sync_vnode(slp, &bo, td);
1762			if (error == 1) {
1763				LIST_REMOVE(bo, bo_synclist);
1764				LIST_INSERT_HEAD(next, bo, bo_synclist);
1765				continue;
1766			}
1767		}
1768		if (!LIST_EMPTY(gslp)) {
1769			mtx_unlock(&sync_mtx);
1770			mtx_lock(&Giant);
1771			mtx_lock(&sync_mtx);
1772			while (!LIST_EMPTY(gslp)) {
1773				error = sync_vnode(gslp, &bo, td);
1774				if (error == 1) {
1775					LIST_REMOVE(bo, bo_synclist);
1776					LIST_INSERT_HEAD(gnext, bo,
1777					    bo_synclist);
1778					continue;
1779				}
1780			}
1781			mtx_unlock(&Giant);
1782		}
1783		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
1784			syncer_final_iter--;
1785		/*
1786		 * The variable rushjob allows the kernel to speed up the
1787		 * processing of the filesystem syncer process. A rushjob
1788		 * value of N tells the filesystem syncer to process the next
1789		 * N seconds worth of work on its queue ASAP. Currently rushjob
1790		 * is used by the soft update code to speed up the filesystem
1791		 * syncer process when the incore state is getting so far
1792		 * ahead of the disk that the kernel memory pool is being
1793		 * threatened with exhaustion.
1794		 */
1795		if (rushjob > 0) {
1796			rushjob -= 1;
1797			continue;
1798		}
1799		/*
1800		 * Just sleep for a short period of time between
1801		 * iterations when shutting down to allow some I/O
1802		 * to happen.
1803		 *
1804		 * If it has taken us less than a second to process the
1805		 * current work, then wait. Otherwise start right over
1806		 * again. We can still lose time if any single round
1807		 * takes more than two seconds, but it does not really
1808		 * matter as we are just trying to generally pace the
1809		 * filesystem activity.
1810		 */
1811		if (syncer_state != SYNCER_RUNNING)
1812			cv_timedwait(&sync_wakeup, &sync_mtx,
1813			    hz / SYNCER_SHUTDOWN_SPEEDUP);
1814		else if (time_uptime == starttime)
1815			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
1816	}
1817}
1818
1819/*
1820 * Request the syncer daemon to speed up its work.
1821 * We never push it to speed up more than half of its
1822 * normal turn time; otherwise it could take over the CPU.
1823 */
1824int
1825speedup_syncer(void)
1826{
1827	int ret = 0;
1828
1829	mtx_lock(&sync_mtx);
1830	if (rushjob < syncdelay / 2) {
1831		rushjob += 1;
1832		stat_rush_requests += 1;
1833		ret = 1;
1834	}
1835	mtx_unlock(&sync_mtx);
1836	cv_broadcast(&sync_wakeup);
1837	return (ret);
1838}
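
/*
 * Illustrative sketch, not compiled into this file: per the rushjob comment
 * in sched_sync() above, the soft updates code nudges the syncer this way
 * when in-core state runs far ahead of the disk.  A caller can fall back to
 * flushing on its own when the syncer is already rushed as far as allowed;
 * foofs_flush_some_buffers() is a hypothetical helper.
 */
#if 0
static void
foofs_relieve_pressure(void)
{

	if (speedup_syncer() == 0)
		foofs_flush_some_buffers();
}
#endif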
1839
1840/*
1841 * Tell the syncer to speed up its work and run though its work
1842 * list several times, then tell it to shut down.
1843 */
1844static void
1845syncer_shutdown(void *arg, int howto)
1846{
1847
1848	if (howto & RB_NOSYNC)
1849		return;
1850	mtx_lock(&sync_mtx);
1851	syncer_state = SYNCER_SHUTTING_DOWN;
1852	rushjob = 0;
1853	mtx_unlock(&sync_mtx);
1854	cv_broadcast(&sync_wakeup);
1855	kproc_shutdown(arg, howto);
1856}
1857
1858/*
1859 * Reassign a buffer from one vnode to another.
1860 * Used to assign file specific control information
1861 * (indirect blocks) to the vnode to which they belong.
1862 */
1863void
1864reassignbuf(struct buf *bp)
1865{
1866	struct vnode *vp;
1867	struct bufobj *bo;
1868	int delay;
1869#ifdef INVARIANTS
1870	struct bufv *bv;
1871#endif
1872
1873	vp = bp->b_vp;
1874	bo = bp->b_bufobj;
1875	++reassignbufcalls;
1876
1877	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
1878	    bp, bp->b_vp, bp->b_flags);
1879	/*
1880	 * B_PAGING flagged buffers cannot be reassigned because their vp
1881	 * is not fully linked in.
1882	 */
1883	if (bp->b_flags & B_PAGING)
1884		panic("cannot reassign paging buffer");
1885
1886	/*
1887	 * Delete from old vnode list, if on one.
1888	 */
1889	BO_LOCK(bo);
1890	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1891		buf_vlist_remove(bp);
1892	else
1893		panic("reassignbuf: Buffer %p not on queue.", bp);
1894	/*
1895	 * If dirty, put on list of dirty buffers; otherwise insert onto list
1896	 * of clean buffers.
1897	 */
1898	if (bp->b_flags & B_DELWRI) {
1899		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
1900			switch (vp->v_type) {
1901			case VDIR:
1902				delay = dirdelay;
1903				break;
1904			case VCHR:
1905				delay = metadelay;
1906				break;
1907			default:
1908				delay = filedelay;
1909			}
1910			vn_syncer_add_to_worklist(bo, delay);
1911		}
1912		buf_vlist_add(bp, bo, BX_VNDIRTY);
1913	} else {
1914		buf_vlist_add(bp, bo, BX_VNCLEAN);
1915
1916		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1917			mtx_lock(&sync_mtx);
1918			LIST_REMOVE(bo, bo_synclist);
1919			syncer_worklist_len--;
1920			mtx_unlock(&sync_mtx);
1921			bo->bo_flag &= ~BO_ONWORKLST;
1922		}
1923	}
1924#ifdef INVARIANTS
1925	bv = &bo->bo_clean;
1926	bp = TAILQ_FIRST(&bv->bv_hd);
1927	KASSERT(bp == NULL || bp->b_bufobj == bo,
1928	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
1929	bp = TAILQ_LAST(&bv->bv_hd, buflists);
1930	KASSERT(bp == NULL || bp->b_bufobj == bo,
1931	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
1932	bv = &bo->bo_dirty;
1933	bp = TAILQ_FIRST(&bv->bv_hd);
1934	KASSERT(bp == NULL || bp->b_bufobj == bo,
1935	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
1936	bp = TAILQ_LAST(&bv->bv_hd, buflists);
1937	KASSERT(bp == NULL || bp->b_bufobj == bo,
1938	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
1939#endif
1940	BO_UNLOCK(bo);
1941}
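/*
 * Note (editorial): reassignbuf() is expected to run whenever a buffer's
 * B_DELWRI state changes -- e.g. from the buffer cache's bdirty()/bundirty()
 * paths -- so that the buffer migrates between the bufobj's clean and dirty
 * lists and the bufobj is added to the syncer worklist when it gains its
 * first dirty buffer.
 */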
1942
1943/*
1944 * Increment the use and hold counts on the vnode, taking care to reference
1945 * the driver's usecount if this is a chardev.  The vholdl() will remove
1946 * the vnode from the free list if it is presently free.  Requires the
1947 * vnode interlock and returns with it held.
1948 */
1949static void
1950v_incr_usecount(struct vnode *vp)
1951{
1952
1953	CTR3(KTR_VFS, "v_incr_usecount: vp %p holdcnt %d usecount %d\n",
1954	    vp, vp->v_holdcnt, vp->v_usecount);
1955	vp->v_usecount++;
1956	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
1957		dev_lock();
1958		vp->v_rdev->si_usecount++;
1959		dev_unlock();
1960	}
1961	vholdl(vp);
1962}
1963
1964/*
1965 * Turn a holdcnt into a use+holdcnt such that only one call to
1966 * v_decr_usecount is needed.
1967 */
1968static void
1969v_upgrade_usecount(struct vnode *vp)
1970{
1971
1972	CTR3(KTR_VFS, "v_upgrade_usecount: vp %p holdcnt %d usecount %d\n",
1973	    vp, vp->v_holdcnt, vp->v_usecount);
1974	vp->v_usecount++;
1975	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
1976		dev_lock();
1977		vp->v_rdev->si_usecount++;
1978		dev_unlock();
1979	}
1980}
1981
1982/*
1983 * Decrement the vnode use and hold count along with the driver's usecount
1984 * if this is a chardev.  The vdropl() below releases the vnode interlock
1985 * as it may free the vnode.
1986 */
1987static void
1988v_decr_usecount(struct vnode *vp)
1989{
1990
1991	CTR3(KTR_VFS, "v_decr_usecount: vp %p holdcnt %d usecount %d\n",
1992	    vp, vp->v_holdcnt, vp->v_usecount);
1993	ASSERT_VI_LOCKED(vp, __FUNCTION__);
1994	VNASSERT(vp->v_usecount > 0, vp,
1995	    ("v_decr_usecount: negative usecount"));
1996	vp->v_usecount--;
1997	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
1998		dev_lock();
1999		vp->v_rdev->si_usecount--;
2000		dev_unlock();
2001	}
2002	vdropl(vp);
2003}
2004
2005/*
2006 * Decrement only the use count and driver use count.  This is intended to
2007 * be paired with a follow on vdropl() to release the remaining hold count.
2008 * In this way we may vgone() a vnode with a 0 usecount without risk of
2009 * having it end up on a free list because the hold count is kept above 0.
2010 */
2011static void
2012v_decr_useonly(struct vnode *vp)
2013{
2014
2015	CTR3(KTR_VFS, "v_decr_useonly: vp %p holdcnt %d usecount %d\n",
2016	    vp, vp->v_holdcnt, vp->v_usecount);
2017	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2018	VNASSERT(vp->v_usecount > 0, vp,
2019	    ("v_decr_useonly: negative usecount"));
2020	vp->v_usecount--;
2021	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2022		dev_lock();
2023		vp->v_rdev->si_usecount--;
2024		dev_unlock();
2025	}
2026}
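/*
 * Summary of the counting helpers above (editorial note): every use
 * reference is backed by a hold reference.  v_incr_usecount() takes both,
 * v_upgrade_usecount() converts a hold the caller already owns into a
 * use+hold, v_decr_usecount() drops both (and may free the vnode via
 * vdropl()), and v_decr_useonly() drops only the use so that a separate
 * vdropl() can release the final hold once inactive processing is done.
 */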
2027
2028/*
2029 * Grab a particular vnode from the free list, increment its
2030 * reference count and lock it.  VI_DOOMED is set if the vnode
2031 * is being destroyed.  Only callers who specify LK_RETRY will
2032 * see doomed vnodes.  If inactive processing was delayed in
2033 * vput try to do it here.
2034 */
2035int
2036vget(struct vnode *vp, int flags, struct thread *td)
2037{
2038	int error;
2039
2040	error = 0;
2041	VFS_ASSERT_GIANT(vp->v_mount);
2042	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
2043	    ("vget: invalid lock operation"));
2044	if ((flags & LK_INTERLOCK) == 0)
2045		VI_LOCK(vp);
2046	vholdl(vp);
2047	if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) {
2048		vdrop(vp);
2049		return (error);
2050	}
2051	if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
2052		panic("vget: vn_lock failed to return ENOENT\n");
2053	VI_LOCK(vp);
2054	/* Upgrade our holdcnt to a usecount. */
2055	v_upgrade_usecount(vp);
2056	/*
2057	 * We don't guarantee that any particular close will
2058	 * trigger inactive processing so just make a best effort
2059	 * here at preventing a reference to a removed file.  If
2060	 * we don't succeed no harm is done.
2061	 */
2062	if (vp->v_iflag & VI_OWEINACT) {
2063		if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
2064		    (flags & LK_NOWAIT) == 0)
2065			vinactive(vp, td);
2066		vp->v_iflag &= ~VI_OWEINACT;
2067	}
2068	VI_UNLOCK(vp);
2069	return (0);
2070}
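/*
 * Typical usage (sketch drawn from callers later in this file, e.g.
 * vfs_msync()): take the interlock, then let vget() both lock and reference
 * the vnode in one step:
 *
 *	VI_LOCK(vp);
 *	if (vget(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, curthread) == 0) {
 *		... use the locked, referenced vnode ...
 *		vput(vp);
 *	}
 */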
2071
2072/*
2073 * Increase the reference count of a vnode.
2074 */
2075void
2076vref(struct vnode *vp)
2077{
2078
2079	VI_LOCK(vp);
2080	v_incr_usecount(vp);
2081	VI_UNLOCK(vp);
2082}
2083
2084/*
2085 * Return reference count of a vnode.
2086 *
2087 * The results of this call are only guaranteed when some mechanism other
2088 * than the VI lock is used to stop other processes from gaining references
2089 * to the vnode.  This may be the case if the caller holds the only reference.
2090 * This is also useful when stale data is acceptable as race conditions may
2091 * be accounted for by some other means.
2092 */
2093int
2094vrefcnt(struct vnode *vp)
2095{
2096	int usecnt;
2097
2098	VI_LOCK(vp);
2099	usecnt = vp->v_usecount;
2100	VI_UNLOCK(vp);
2101
2102	return (usecnt);
2103}
2104
2105
2106/*
2107 * Vnode put/release.
2108 * If count drops to zero, call inactive routine and return to freelist.
2109 */
2110void
2111vrele(struct vnode *vp)
2112{
2113	struct thread *td = curthread;	/* XXX */
2114
2115	KASSERT(vp != NULL, ("vrele: null vp"));
2116	VFS_ASSERT_GIANT(vp->v_mount);
2117
2118	VI_LOCK(vp);
2119
2120	/* Skip this v_writecount check if we're going to panic below. */
2121	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
2122	    ("vrele: missed vn_close"));
2123
2124	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2125	    vp->v_usecount == 1)) {
2126		v_decr_usecount(vp);
2127		return;
2128	}
2129	if (vp->v_usecount != 1) {
2130#ifdef DIAGNOSTIC
2131		vprint("vrele: negative ref count", vp);
2132#endif
2133		VI_UNLOCK(vp);
2134		panic("vrele: negative ref cnt");
2135	}
2136	/*
2137	 * We want to hold the vnode until the inactive finishes to
2138	 * prevent vgone() races.  We drop the use count here and the
2139	 * hold count below when we're done.
2140	 */
2141	v_decr_useonly(vp);
2142	/*
2143	 * We must call VOP_INACTIVE with the node locked. Mark
2144	 * as VI_DOINGINACT to avoid recursion.
2145	 */
2146	vp->v_iflag |= VI_OWEINACT;
2147	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0) {
2148		VI_LOCK(vp);
2149		if (vp->v_usecount > 0)
2150			vp->v_iflag &= ~VI_OWEINACT;
2151		if (vp->v_iflag & VI_OWEINACT)
2152			vinactive(vp, td);
2153		VOP_UNLOCK(vp, 0);
2154	} else {
2155		VI_LOCK(vp);
2156		if (vp->v_usecount > 0)
2157			vp->v_iflag &= ~VI_OWEINACT;
2158	}
2159	vdropl(vp);
2160}
2161
2162/*
2163 * Release an already locked vnode.  This gives the same effect as
2164 * unlock+vrele(), but takes less time and avoids releasing and
2165 * re-acquiring the lock (as vrele() acquires the lock internally.)
2166 */
2167void
2168vput(struct vnode *vp)
2169{
2170	struct thread *td = curthread;	/* XXX */
2171	int error;
2172
2173	KASSERT(vp != NULL, ("vput: null vp"));
2174	ASSERT_VOP_LOCKED(vp, "vput");
2175	VFS_ASSERT_GIANT(vp->v_mount);
2176	VI_LOCK(vp);
2177	/* Skip this v_writecount check if we're going to panic below. */
2178	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
2179	    ("vput: missed vn_close"));
2180	error = 0;
2181
2182	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2183	    vp->v_usecount == 1)) {
2184		VOP_UNLOCK(vp, 0);
2185		v_decr_usecount(vp);
2186		return;
2187	}
2188
2189	if (vp->v_usecount != 1) {
2190#ifdef DIAGNOSTIC
2191		vprint("vput: negative ref count", vp);
2192#endif
2193		panic("vput: negative ref cnt");
2194	}
2195	/*
2196	 * We want to hold the vnode until the inactive finishes to
2197	 * prevent vgone() races.  We drop the use count here and the
2198	 * hold count below when we're done.
2199	 */
2200	v_decr_useonly(vp);
2201	vp->v_iflag |= VI_OWEINACT;
2202	if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2203		error = VOP_LOCK(vp, LK_UPGRADE|LK_INTERLOCK|LK_NOWAIT);
2204		VI_LOCK(vp);
2205		if (error) {
2206			if (vp->v_usecount > 0)
2207				vp->v_iflag &= ~VI_OWEINACT;
2208			goto done;
2209		}
2210	}
2211	if (vp->v_usecount > 0)
2212		vp->v_iflag &= ~VI_OWEINACT;
2213	if (vp->v_iflag & VI_OWEINACT)
2214		vinactive(vp, td);
2215	VOP_UNLOCK(vp, 0);
2216done:
2217	vdropl(vp);
2218}
2219
2220/*
2221 * Somebody doesn't want the vnode recycled.
2222 */
2223void
2224vhold(struct vnode *vp)
2225{
2226
2227	VI_LOCK(vp);
2228	vholdl(vp);
2229	VI_UNLOCK(vp);
2230}
2231
2232void
2233vholdl(struct vnode *vp)
2234{
2235
2236	vp->v_holdcnt++;
2237	if (VSHOULDBUSY(vp))
2238		vbusy(vp);
2239}
2240
2241/*
2242 * Note that there is one less who cares about this vnode.  vdrop() is the
2243 * opposite of vhold().
2244 */
2245void
2246vdrop(struct vnode *vp)
2247{
2248
2249	VI_LOCK(vp);
2250	vdropl(vp);
2251}
2252
2253/*
2254 * Drop the hold count of the vnode.  If this is the last reference to
2255 * the vnode we will free it if it has been vgone'd otherwise it is
2256 * placed on the free list.
2257 */
2258void
2259vdropl(struct vnode *vp)
2260{
2261
2262	ASSERT_VI_LOCKED(vp, "vdropl");
2263	if (vp->v_holdcnt <= 0)
2264		panic("vdrop: holdcnt %d", vp->v_holdcnt);
2265	vp->v_holdcnt--;
2266	if (vp->v_holdcnt == 0) {
2267		if (vp->v_iflag & VI_DOOMED) {
2268			vdestroy(vp);
2269			return;
2270		} else
2271			vfree(vp);
2272	}
2273	VI_UNLOCK(vp);
2274}
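/*
 * Usage note (editorial): a hold is the lightweight way to keep a vnode from
 * being recycled across a sleep when no use reference is wanted, e.g. the
 * pattern used by vflush() below:
 *
 *	VI_LOCK(vp);
 *	vholdl(vp);
 *	... drop other locks, possibly sleep in vn_lock() ...
 *	vdrop(vp);
 */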
2275
2276/*
2277 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
2278 * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
2279 * OWEINACT tracks whether a vnode missed a call to inactive due to a
2280 * failed lock upgrade.
2281 */
2282static void
2283vinactive(struct vnode *vp, struct thread *td)
2284{
2285
2286	ASSERT_VOP_ELOCKED(vp, "vinactive");
2287	ASSERT_VI_LOCKED(vp, "vinactive");
2288	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
2289	    ("vinactive: recursed on VI_DOINGINACT"));
2290	vp->v_iflag |= VI_DOINGINACT;
2291	vp->v_iflag &= ~VI_OWEINACT;
2292	VI_UNLOCK(vp);
2293	VOP_INACTIVE(vp, td);
2294	VI_LOCK(vp);
2295	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
2296	    ("vinactive: lost VI_DOINGINACT"));
2297	vp->v_iflag &= ~VI_DOINGINACT;
2298}
2299
2300/*
2301 * Remove any vnodes in the vnode table belonging to mount point mp.
2302 *
2303 * If FORCECLOSE is not specified, there should not be any active ones,
2304 * return error if any are found (nb: this is a user error, not a
2305 * system error). If FORCECLOSE is specified, detach any active vnodes
2306 * that are found.
2307 *
2308 * If WRITECLOSE is set, only flush out regular file vnodes open for
2309 * writing.
2310 *
2311 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2312 *
2313 * `rootrefs' specifies the base reference count for the root vnode
2314 * of this filesystem. The root vnode is considered busy if its
2315 * v_usecount exceeds this value. On a successful return, vflush(, td)
2316 * will call vrele() on the root vnode exactly rootrefs times.
2317 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2318 * be zero.
2319 */
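/*
 * Illustrative call (editorial sketch, not taken from this file): a
 * filesystem unmount routine that keeps a single reference on its root
 * vnode might use
 *
 *	error = vflush(mp, 1, flags, td);
 *
 * where flags may include FORCECLOSE for a forced unmount, while one holding
 * no extra root reference would pass rootrefs == 0.  The caller shown here
 * is an assumption for illustration only.
 */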
2320#ifdef DIAGNOSTIC
2321static int busyprt = 0;		/* print out busy vnodes */
2322SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
2323#endif
2324
2325int
2326vflush( struct mount *mp, int rootrefs, int flags, struct thread *td)
2327{
2328	struct vnode *vp, *mvp, *rootvp = NULL;
2329	struct vattr vattr;
2330	int busy = 0, error;
2331
2332	CTR1(KTR_VFS, "vflush: mp %p", mp);
2333	if (rootrefs > 0) {
2334		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2335		    ("vflush: bad args"));
2336		/*
2337		 * Get the filesystem root vnode. We can vput() it
2338		 * immediately, since with rootrefs > 0, it won't go away.
2339		 */
2340		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp, td)) != 0)
2341			return (error);
2342		vput(rootvp);
2343
2344	}
2345	MNT_ILOCK(mp);
2346loop:
2347	MNT_VNODE_FOREACH(vp, mp, mvp) {
2348
2349		VI_LOCK(vp);
2350		vholdl(vp);
2351		MNT_IUNLOCK(mp);
2352		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
2353		if (error) {
2354			vdrop(vp);
2355			MNT_ILOCK(mp);
2356			MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
2357			goto loop;
2358		}
2359		/*
2360		 * Skip over any vnodes marked VV_SYSTEM.
2361		 */
2362		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2363			VOP_UNLOCK(vp, 0);
2364			vdrop(vp);
2365			MNT_ILOCK(mp);
2366			continue;
2367		}
2368		/*
2369		 * If WRITECLOSE is set, flush out unlinked but still open
2370		 * files (even if open only for reading) and regular file
2371		 * vnodes open for writing.
2372		 */
2373		if (flags & WRITECLOSE) {
2374			error = VOP_GETATTR(vp, &vattr, td->td_ucred);
2375			VI_LOCK(vp);
2376
2377			if ((vp->v_type == VNON ||
2378			    (error == 0 && vattr.va_nlink > 0)) &&
2379			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
2380				VOP_UNLOCK(vp, 0);
2381				vdropl(vp);
2382				MNT_ILOCK(mp);
2383				continue;
2384			}
2385		} else
2386			VI_LOCK(vp);
2387		/*
2388		 * With v_usecount == 0, all we need to do is clear out the
2389		 * vnode data structures and we are done.
2390		 *
2391		 * If FORCECLOSE is set, forcibly close the vnode.
2392		 */
2393		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
2394			VNASSERT(vp->v_usecount == 0 ||
2395			    (vp->v_type != VCHR && vp->v_type != VBLK), vp,
2396			    ("device VNODE %p is FORCECLOSED", vp));
2397			vgonel(vp);
2398		} else {
2399			busy++;
2400#ifdef DIAGNOSTIC
2401			if (busyprt)
2402				vprint("vflush: busy vnode", vp);
2403#endif
2404		}
2405		VOP_UNLOCK(vp, 0);
2406		vdropl(vp);
2407		MNT_ILOCK(mp);
2408	}
2409	MNT_IUNLOCK(mp);
2410	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2411		/*
2412		 * If just the root vnode is busy, and if its refcount
2413		 * is equal to `rootrefs', then go ahead and kill it.
2414		 */
2415		VI_LOCK(rootvp);
2416		KASSERT(busy > 0, ("vflush: not busy"));
2417		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
2418		    ("vflush: usecount %d < rootrefs %d",
2419		     rootvp->v_usecount, rootrefs));
2420		if (busy == 1 && rootvp->v_usecount == rootrefs) {
2421			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
2422			vgone(rootvp);
2423			VOP_UNLOCK(rootvp, 0);
2424			busy = 0;
2425		} else
2426			VI_UNLOCK(rootvp);
2427	}
2428	if (busy)
2429		return (EBUSY);
2430	for (; rootrefs > 0; rootrefs--)
2431		vrele(rootvp);
2432	return (0);
2433}
2434
2435/*
2436 * Recycle an unused vnode to the front of the free list.
2437 */
2438int
2439vrecycle(struct vnode *vp, struct thread *td)
2440{
2441	int recycled;
2442
2443	ASSERT_VOP_ELOCKED(vp, "vrecycle");
2444	recycled = 0;
2445	VI_LOCK(vp);
2446	if (vp->v_usecount == 0) {
2447		recycled = 1;
2448		vgonel(vp);
2449	}
2450	VI_UNLOCK(vp);
2451	return (recycled);
2452}
2453
2454/*
2455 * Eliminate all activity associated with a vnode
2456 * in preparation for reuse.
2457 */
2458void
2459vgone(struct vnode *vp)
2460{
2461	VI_LOCK(vp);
2462	vgonel(vp);
2463	VI_UNLOCK(vp);
2464}
2465
2466/*
2467 * vgone, with the vp interlock held.
2468 */
2469void
2470vgonel(struct vnode *vp)
2471{
2472	struct thread *td;
2473	int oweinact;
2474	int active;
2475	struct mount *mp;
2476
2477	CTR1(KTR_VFS, "vgonel: vp %p", vp);
2478	ASSERT_VOP_ELOCKED(vp, "vgonel");
2479	ASSERT_VI_LOCKED(vp, "vgonel");
2480	VNASSERT(vp->v_holdcnt, vp,
2481	    ("vgonel: vp %p has no reference.", vp));
2482	td = curthread;
2483
2484	/*
2485	 * Don't vgonel if we're already doomed.
2486	 */
2487	if (vp->v_iflag & VI_DOOMED)
2488		return;
2489	vp->v_iflag |= VI_DOOMED;
2490	/*
2491	 * Check to see if the vnode is in use.  If so, we have to call
2492	 * VOP_CLOSE() and VOP_INACTIVE().
2493	 */
2494	active = vp->v_usecount;
2495	oweinact = (vp->v_iflag & VI_OWEINACT);
2496	VI_UNLOCK(vp);
2497	/*
2498	 * Clean out any buffers associated with the vnode.
2499	 * If the flush fails, just toss the buffers.
2500	 */
2501	mp = NULL;
2502	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
2503		(void) vn_start_secondary_write(vp, &mp, V_WAIT);
2504	if (vinvalbuf(vp, V_SAVE, 0, 0) != 0)
2505		vinvalbuf(vp, 0, 0, 0);
2506
2507	/*
2508	 * If purging an active vnode, it must be closed and
2509	 * deactivated before being reclaimed.
2510	 */
2511	if (active)
2512		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2513	if (oweinact || active) {
2514		VI_LOCK(vp);
2515		if ((vp->v_iflag & VI_DOINGINACT) == 0)
2516			vinactive(vp, td);
2517		VI_UNLOCK(vp);
2518	}
2519	/*
2520	 * Reclaim the vnode.
2521	 */
2522	if (VOP_RECLAIM(vp, td))
2523		panic("vgone: cannot reclaim");
2524	if (mp != NULL)
2525		vn_finished_secondary_write(mp);
2526	VNASSERT(vp->v_object == NULL, vp,
2527	    ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
2528	/*
2529	 * Clear the advisory locks and wake up waiting threads.
2530	 */
2531	lf_purgelocks(vp, &(vp->v_lockf));
2532	/*
2533	 * Delete from old mount point vnode list.
2534	 */
2535	delmntque(vp);
2536	cache_purge(vp);
2537	/*
2538	 * Done with purge, reset to the standard lock and invalidate
2539	 * the vnode.
2540	 */
2541	VI_LOCK(vp);
2542	vp->v_vnlock = &vp->v_lock;
2543	vp->v_op = &dead_vnodeops;
2544	vp->v_tag = "none";
2545	vp->v_type = VBAD;
2546}
2547
2548/*
2549 * Calculate the total number of references to a special device.
2550 */
2551int
2552vcount(struct vnode *vp)
2553{
2554	int count;
2555
2556	dev_lock();
2557	count = vp->v_rdev->si_usecount;
2558	dev_unlock();
2559	return (count);
2560}
2561
2562/*
2563 * Same as above, but using the struct cdev *as argument
2564 */
2565int
2566count_dev(struct cdev *dev)
2567{
2568	int count;
2569
2570	dev_lock();
2571	count = dev->si_usecount;
2572	dev_unlock();
2573	return(count);
2574}
2575
2576/*
2577 * Print out a description of a vnode.
2578 */
2579static char *typename[] =
2580{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
2581 "VMARKER"};
2582
2583void
2584vn_printf(struct vnode *vp, const char *fmt, ...)
2585{
2586	va_list ap;
2587	char buf[256], buf2[16];
2588	u_long flags;
2589
2590	va_start(ap, fmt);
2591	vprintf(fmt, ap);
2592	va_end(ap);
2593	printf("%p: ", (void *)vp);
2594	printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
2595	printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
2596	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
2597	buf[0] = '\0';
2598	buf[1] = '\0';
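	/*
	 * Editorial note: each strlcat() below prepends a '|' to the flag
	 * name, so the final printf() uses buf + 1 to skip the leading '|'.
	 * Clearing buf[1] as well keeps buf + 1 a valid empty string when
	 * no flag bits are set.
	 */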
2599	if (vp->v_vflag & VV_ROOT)
2600		strlcat(buf, "|VV_ROOT", sizeof(buf));
2601	if (vp->v_vflag & VV_ISTTY)
2602		strlcat(buf, "|VV_ISTTY", sizeof(buf));
2603	if (vp->v_vflag & VV_NOSYNC)
2604		strlcat(buf, "|VV_NOSYNC", sizeof(buf));
2605	if (vp->v_vflag & VV_CACHEDLABEL)
2606		strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
2607	if (vp->v_vflag & VV_TEXT)
2608		strlcat(buf, "|VV_TEXT", sizeof(buf));
2609	if (vp->v_vflag & VV_COPYONWRITE)
2610		strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
2611	if (vp->v_vflag & VV_SYSTEM)
2612		strlcat(buf, "|VV_SYSTEM", sizeof(buf));
2613	if (vp->v_vflag & VV_PROCDEP)
2614		strlcat(buf, "|VV_PROCDEP", sizeof(buf));
2615	if (vp->v_vflag & VV_NOKNOTE)
2616		strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
2617	if (vp->v_vflag & VV_DELETED)
2618		strlcat(buf, "|VV_DELETED", sizeof(buf));
2619	if (vp->v_vflag & VV_MD)
2620		strlcat(buf, "|VV_MD", sizeof(buf));
2621	flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC |
2622	    VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
2623	    VV_NOKNOTE | VV_DELETED | VV_MD);
2624	if (flags != 0) {
2625		snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
2626		strlcat(buf, buf2, sizeof(buf));
2627	}
2628	if (vp->v_iflag & VI_MOUNT)
2629		strlcat(buf, "|VI_MOUNT", sizeof(buf));
2630	if (vp->v_iflag & VI_AGE)
2631		strlcat(buf, "|VI_AGE", sizeof(buf));
2632	if (vp->v_iflag & VI_DOOMED)
2633		strlcat(buf, "|VI_DOOMED", sizeof(buf));
2634	if (vp->v_iflag & VI_FREE)
2635		strlcat(buf, "|VI_FREE", sizeof(buf));
2636	if (vp->v_iflag & VI_OBJDIRTY)
2637		strlcat(buf, "|VI_OBJDIRTY", sizeof(buf));
2638	if (vp->v_iflag & VI_DOINGINACT)
2639		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
2640	if (vp->v_iflag & VI_OWEINACT)
2641		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
2642	flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
2643	    VI_OBJDIRTY | VI_DOINGINACT | VI_OWEINACT);
2644	if (flags != 0) {
2645		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
2646		strlcat(buf, buf2, sizeof(buf));
2647	}
2648	printf("    flags (%s)\n", buf + 1);
2649	if (mtx_owned(VI_MTX(vp)))
2650		printf(" VI_LOCKed");
2651	if (vp->v_object != NULL)
2652		printf("    v_object %p ref %d pages %d\n",
2653		    vp->v_object, vp->v_object->ref_count,
2654		    vp->v_object->resident_page_count);
2655	printf("    ");
2656	lockmgr_printinfo(vp->v_vnlock);
2657	printf("\n");
2658	if (vp->v_data != NULL)
2659		VOP_PRINT(vp);
2660}
2661
2662#ifdef DDB
2663/*
2664 * List all of the locked vnodes in the system.
2665 * Called when debugging the kernel.
2666 */
2667DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
2668{
2669	struct mount *mp, *nmp;
2670	struct vnode *vp;
2671
2672	/*
2673	 * Note: because this is DDB, we can't obey the locking semantics
2674	 * for these structures, which means we could catch an inconsistent
2675	 * state and dereference a nasty pointer.  Not much to be done
2676	 * about that.
2677	 */
2678	db_printf("Locked vnodes\n");
2679	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2680		nmp = TAILQ_NEXT(mp, mnt_list);
2681		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2682			if (vp->v_type != VMARKER &&
2683			    VOP_ISLOCKED(vp))
2684				vprint("", vp);
2685		}
2686		nmp = TAILQ_NEXT(mp, mnt_list);
2687	}
2688}
2689
2690/*
2691 * Show details about the given vnode.
2692 */
2693DB_SHOW_COMMAND(vnode, db_show_vnode)
2694{
2695	struct vnode *vp;
2696
2697	if (!have_addr)
2698		return;
2699	vp = (struct vnode *)addr;
2700	vn_printf(vp, "vnode ");
2701}
2702
2703/*
2704 * Show details about the given mount point.
2705 */
2706DB_SHOW_COMMAND(mount, db_show_mount)
2707{
2708	struct mount *mp;
2709	struct statfs *sp;
2710	struct vnode *vp;
2711	char buf[512];
2712	u_int flags;
2713
2714	if (!have_addr) {
2715		/* No address given, print short info about all mount points. */
2716		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2717			db_printf("%p %s on %s (%s)\n", mp,
2718			    mp->mnt_stat.f_mntfromname,
2719			    mp->mnt_stat.f_mntonname,
2720			    mp->mnt_stat.f_fstypename);
2721			if (db_pager_quit)
2722				break;
2723		}
2724		db_printf("\nMore info: show mount <addr>\n");
2725		return;
2726	}
2727
2728	mp = (struct mount *)addr;
2729	db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
2730	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
2731
2732	buf[0] = '\0';
2733	flags = mp->mnt_flag;
2734#define	MNT_FLAG(flag)	do {						\
2735	if (flags & (flag)) {						\
2736		if (buf[0] != '\0')					\
2737			strlcat(buf, ", ", sizeof(buf));		\
2738		strlcat(buf, (#flag) + 4, sizeof(buf));			\
2739		flags &= ~(flag);					\
2740	}								\
2741} while (0)
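/*
 * Editorial note: (#flag) + 4 above skips the "MNT_" prefix so only the
 * short name (e.g. "RDONLY") is accumulated; MNT_KERN_FLAG below does the
 * same with an offset of 5 for the "MNTK_" prefix.  Any bits left over after
 * the known flags are masked off are printed in hex afterwards.
 */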
2742	MNT_FLAG(MNT_RDONLY);
2743	MNT_FLAG(MNT_SYNCHRONOUS);
2744	MNT_FLAG(MNT_NOEXEC);
2745	MNT_FLAG(MNT_NOSUID);
2746	MNT_FLAG(MNT_UNION);
2747	MNT_FLAG(MNT_ASYNC);
2748	MNT_FLAG(MNT_SUIDDIR);
2749	MNT_FLAG(MNT_SOFTDEP);
2750	MNT_FLAG(MNT_NOSYMFOLLOW);
2751	MNT_FLAG(MNT_GJOURNAL);
2752	MNT_FLAG(MNT_MULTILABEL);
2753	MNT_FLAG(MNT_ACLS);
2754	MNT_FLAG(MNT_NOATIME);
2755	MNT_FLAG(MNT_NOCLUSTERR);
2756	MNT_FLAG(MNT_NOCLUSTERW);
2757	MNT_FLAG(MNT_EXRDONLY);
2758	MNT_FLAG(MNT_EXPORTED);
2759	MNT_FLAG(MNT_DEFEXPORTED);
2760	MNT_FLAG(MNT_EXPORTANON);
2761	MNT_FLAG(MNT_EXKERB);
2762	MNT_FLAG(MNT_EXPUBLIC);
2763	MNT_FLAG(MNT_LOCAL);
2764	MNT_FLAG(MNT_QUOTA);
2765	MNT_FLAG(MNT_ROOTFS);
2766	MNT_FLAG(MNT_USER);
2767	MNT_FLAG(MNT_IGNORE);
2768	MNT_FLAG(MNT_UPDATE);
2769	MNT_FLAG(MNT_DELEXPORT);
2770	MNT_FLAG(MNT_RELOAD);
2771	MNT_FLAG(MNT_FORCE);
2772	MNT_FLAG(MNT_SNAPSHOT);
2773	MNT_FLAG(MNT_BYFSID);
2774#undef MNT_FLAG
2775	if (flags != 0) {
2776		if (buf[0] != '\0')
2777			strlcat(buf, ", ", sizeof(buf));
2778		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
2779		    "0x%08x", flags);
2780	}
2781	db_printf("    mnt_flag = %s\n", buf);
2782
2783	buf[0] = '\0';
2784	flags = mp->mnt_kern_flag;
2785#define	MNT_KERN_FLAG(flag)	do {					\
2786	if (flags & (flag)) {						\
2787		if (buf[0] != '\0')					\
2788			strlcat(buf, ", ", sizeof(buf));		\
2789		strlcat(buf, (#flag) + 5, sizeof(buf));			\
2790		flags &= ~(flag);					\
2791	}								\
2792} while (0)
2793	MNT_KERN_FLAG(MNTK_UNMOUNTF);
2794	MNT_KERN_FLAG(MNTK_ASYNC);
2795	MNT_KERN_FLAG(MNTK_SOFTDEP);
2796	MNT_KERN_FLAG(MNTK_NOINSMNTQ);
2797	MNT_KERN_FLAG(MNTK_UNMOUNT);
2798	MNT_KERN_FLAG(MNTK_MWAIT);
2799	MNT_KERN_FLAG(MNTK_SUSPEND);
2800	MNT_KERN_FLAG(MNTK_SUSPEND2);
2801	MNT_KERN_FLAG(MNTK_SUSPENDED);
2802	MNT_KERN_FLAG(MNTK_MPSAFE);
2803	MNT_KERN_FLAG(MNTK_NOKNOTE);
2804	MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
2805#undef MNT_KERN_FLAG
2806	if (flags != 0) {
2807		if (buf[0] != '\0')
2808			strlcat(buf, ", ", sizeof(buf));
2809		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
2810		    "0x%08x", flags);
2811	}
2812	db_printf("    mnt_kern_flag = %s\n", buf);
2813
2814	sp = &mp->mnt_stat;
2815	db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
2816	    "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
2817	    "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
2818	    "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
2819	    (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
2820	    (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
2821	    (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
2822	    (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
2823	    (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
2824	    (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
2825	    (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
2826	    (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
2827
2828	db_printf("    mnt_cred = { uid=%u ruid=%u",
2829	    (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
2830	if (mp->mnt_cred->cr_prison != NULL)
2831		db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
2832	db_printf(" }\n");
2833	db_printf("    mnt_ref = %d\n", mp->mnt_ref);
2834	db_printf("    mnt_gen = %d\n", mp->mnt_gen);
2835	db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
2836	db_printf("    mnt_writeopcount = %d\n", mp->mnt_writeopcount);
2837	db_printf("    mnt_noasync = %u\n", mp->mnt_noasync);
2838	db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
2839	db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
2840	db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
2841	db_printf("    mnt_holdcnt = %d\n", mp->mnt_holdcnt);
2842	db_printf("    mnt_holdcntwaiters = %d\n", mp->mnt_holdcntwaiters);
2843	db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
2844	db_printf("    mnt_secondary_accwrites = %d\n",
2845	    mp->mnt_secondary_accwrites);
2846	db_printf("    mnt_gjprovider = %s\n",
2847	    mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
2848	db_printf("\n");
2849
2850	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2851		if (vp->v_type != VMARKER) {
2852			vn_printf(vp, "vnode ");
2853			if (db_pager_quit)
2854				break;
2855		}
2856	}
2857}
2858#endif	/* DDB */
2859
2860/*
2861 * Fill in a struct xvfsconf based on a struct vfsconf.
2862 */
2863static void
2864vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
2865{
2866
2867	strcpy(xvfsp->vfc_name, vfsp->vfc_name);
2868	xvfsp->vfc_typenum = vfsp->vfc_typenum;
2869	xvfsp->vfc_refcount = vfsp->vfc_refcount;
2870	xvfsp->vfc_flags = vfsp->vfc_flags;
2871	/*
2872	 * These are unused in userland; we keep them
2873	 * to avoid breaking binary compatibility.
2874	 */
2875	xvfsp->vfc_vfsops = NULL;
2876	xvfsp->vfc_next = NULL;
2877}
2878
2879/*
2880 * Top level filesystem related information gathering.
2881 */
2882static int
2883sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
2884{
2885	struct vfsconf *vfsp;
2886	struct xvfsconf xvfsp;
2887	int error;
2888
2889	error = 0;
2890	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
2891		bzero(&xvfsp, sizeof(xvfsp));
2892		vfsconf2x(vfsp, &xvfsp);
2893		error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp);
2894		if (error)
2895			break;
2896	}
2897	return (error);
2898}
2899
2900SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist,
2901    "S,xvfsconf", "List of all configured filesystems");
2902
2903#ifndef BURN_BRIDGES
2904static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
2905
2906static int
2907vfs_sysctl(SYSCTL_HANDLER_ARGS)
2908{
2909	int *name = (int *)arg1 - 1;	/* XXX */
2910	u_int namelen = arg2 + 1;	/* XXX */
2911	struct vfsconf *vfsp;
2912	struct xvfsconf xvfsp;
2913
2914	printf("WARNING: userland calling deprecated sysctl, "
2915	    "please rebuild world\n");
2916
2917#if 1 || defined(COMPAT_PRELITE2)
2918	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2919	if (namelen == 1)
2920		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2921#endif
2922
2923	switch (name[1]) {
2924	case VFS_MAXTYPENUM:
2925		if (namelen != 2)
2926			return (ENOTDIR);
2927		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2928	case VFS_CONF:
2929		if (namelen != 3)
2930			return (ENOTDIR);	/* overloaded */
2931		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
2932			if (vfsp->vfc_typenum == name[2])
2933				break;
2934		if (vfsp == NULL)
2935			return (EOPNOTSUPP);
2936		bzero(&xvfsp, sizeof(xvfsp));
2937		vfsconf2x(vfsp, &xvfsp);
2938		return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
2939	}
2940	return (EOPNOTSUPP);
2941}
2942
2943static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP,
2944	vfs_sysctl, "Generic filesystem");
2945
2946#if 1 || defined(COMPAT_PRELITE2)
2947
2948static int
2949sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
2950{
2951	int error;
2952	struct vfsconf *vfsp;
2953	struct ovfsconf ovfs;
2954
2955	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
2956		bzero(&ovfs, sizeof(ovfs));
2957		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
2958		strcpy(ovfs.vfc_name, vfsp->vfc_name);
2959		ovfs.vfc_index = vfsp->vfc_typenum;
2960		ovfs.vfc_refcount = vfsp->vfc_refcount;
2961		ovfs.vfc_flags = vfsp->vfc_flags;
2962		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2963		if (error)
2964			return error;
2965	}
2966	return 0;
2967}
2968
2969#endif /* 1 || COMPAT_PRELITE2 */
2970#endif /* !BURN_BRIDGES */
2971
2972#define KINFO_VNODESLOP		10
2973#ifdef notyet
2974/*
2975 * Dump vnode list (via sysctl).
2976 */
2977/* ARGSUSED */
2978static int
2979sysctl_vnode(SYSCTL_HANDLER_ARGS)
2980{
2981	struct xvnode *xvn;
2982	struct mount *mp;
2983	struct vnode *vp;
2984	int error, len, n;
2985
2986	/*
2987	 * Stale numvnodes access is not fatal here.
2988	 */
2989	req->lock = 0;
2990	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
2991	if (!req->oldptr)
2992		/* Make an estimate */
2993		return (SYSCTL_OUT(req, 0, len));
2994
2995	error = sysctl_wire_old_buffer(req, 0);
2996	if (error != 0)
2997		return (error);
2998	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
2999	n = 0;
3000	mtx_lock(&mountlist_mtx);
3001	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3002		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
3003			continue;
3004		MNT_ILOCK(mp);
3005		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3006			if (n == len)
3007				break;
3008			vref(vp);
3009			xvn[n].xv_size = sizeof *xvn;
3010			xvn[n].xv_vnode = vp;
3011			xvn[n].xv_id = 0;	/* XXX compat */
3012#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
3013			XV_COPY(usecount);
3014			XV_COPY(writecount);
3015			XV_COPY(holdcnt);
3016			XV_COPY(mount);
3017			XV_COPY(numoutput);
3018			XV_COPY(type);
3019#undef XV_COPY
3020			xvn[n].xv_flag = vp->v_vflag;
3021
3022			switch (vp->v_type) {
3023			case VREG:
3024			case VDIR:
3025			case VLNK:
3026				break;
3027			case VBLK:
3028			case VCHR:
3029				if (vp->v_rdev == NULL) {
3030					vrele(vp);
3031					continue;
3032				}
3033				xvn[n].xv_dev = dev2udev(vp->v_rdev);
3034				break;
3035			case VSOCK:
3036				xvn[n].xv_socket = vp->v_socket;
3037				break;
3038			case VFIFO:
3039				xvn[n].xv_fifo = vp->v_fifoinfo;
3040				break;
3041			case VNON:
3042			case VBAD:
3043			default:
3044				/* shouldn't happen? */
3045				vrele(vp);
3046				continue;
3047			}
3048			vrele(vp);
3049			++n;
3050		}
3051		MNT_IUNLOCK(mp);
3052		mtx_lock(&mountlist_mtx);
3053		vfs_unbusy(mp);
3054		if (n == len)
3055			break;
3056	}
3057	mtx_unlock(&mountlist_mtx);
3058
3059	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
3060	free(xvn, M_TEMP);
3061	return (error);
3062}
3063
3064SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
3065	0, 0, sysctl_vnode, "S,xvnode", "");
3066#endif
3067
3068/*
3069 * Unmount all filesystems. The list is traversed in reverse order
3070 * of mounting to avoid dependencies.
3071 */
3072void
3073vfs_unmountall(void)
3074{
3075	struct mount *mp;
3076	struct thread *td;
3077	int error;
3078
3079	KASSERT(curthread != NULL, ("vfs_unmountall: NULL curthread"));
3080	td = curthread;
3081	/*
3082	 * Since this only runs when rebooting, it is not interlocked.
3083	 */
3084	while(!TAILQ_EMPTY(&mountlist)) {
3085		mp = TAILQ_LAST(&mountlist, mntlist);
3086		error = dounmount(mp, MNT_FORCE, td);
3087		if (error) {
3088			TAILQ_REMOVE(&mountlist, mp, mnt_list);
3089			/*
3090			 * XXX: Due to the way in which we mount the root
3091			 * file system off of devfs, devfs will generate a
3092			 * "busy" warning when we try to unmount it before
3093			 * the root.  Don't print a warning as a result in
3094			 * order to avoid false positive errors that may
3095			 * cause needless upset.
3096			 */
3097			if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) {
3098				printf("unmount of %s failed (",
3099				    mp->mnt_stat.f_mntonname);
3100				if (error == EBUSY)
3101					printf("BUSY)\n");
3102				else
3103					printf("%d)\n", error);
3104			}
3105		} else {
3106			/* The unmount has removed mp from the mountlist */
3107		}
3108	}
3109}
3110
3111/*
3112 * Perform msync on all vnodes under a mount point.
3113 * The mount point must be locked.
3114 */
3115void
3116vfs_msync(struct mount *mp, int flags)
3117{
3118	struct vnode *vp, *mvp;
3119	struct vm_object *obj;
3120
3121	MNT_ILOCK(mp);
3122	MNT_VNODE_FOREACH(vp, mp, mvp) {
3123		VI_LOCK(vp);
3124		if ((vp->v_iflag & VI_OBJDIRTY) &&
3125		    (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
3126			MNT_IUNLOCK(mp);
3127			if (!vget(vp,
3128			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3129			    curthread)) {
3130				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
3131					vput(vp);
3132					MNT_ILOCK(mp);
3133					continue;
3134				}
3135
3136				obj = vp->v_object;
3137				if (obj != NULL) {
3138					VM_OBJECT_LOCK(obj);
3139					vm_object_page_clean(obj, 0, 0,
3140					    flags == MNT_WAIT ?
3141					    OBJPC_SYNC : OBJPC_NOSYNC);
3142					VM_OBJECT_UNLOCK(obj);
3143				}
3144				vput(vp);
3145			}
3146			MNT_ILOCK(mp);
3147		} else
3148			VI_UNLOCK(vp);
3149	}
3150	MNT_IUNLOCK(mp);
3151}
3152
3153/*
3154 * Mark a vnode as free, putting it up for recycling.
3155 */
3156static void
3157vfree(struct vnode *vp)
3158{
3159
3160	CTR1(KTR_VFS, "vfree vp %p", vp);
3161	ASSERT_VI_LOCKED(vp, "vfree");
3162	mtx_lock(&vnode_free_list_mtx);
3163	VNASSERT(vp->v_op != NULL, vp, ("vfree: vnode already reclaimed."));
3164	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free"));
3165	VNASSERT(VSHOULDFREE(vp), vp, ("vfree: freeing when we shouldn't"));
3166	VNASSERT((vp->v_iflag & VI_DOOMED) == 0, vp,
3167	    ("vfree: Freeing doomed vnode"));
3168	if (vp->v_iflag & VI_AGE) {
3169		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
3170	} else {
3171		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
3172	}
3173	freevnodes++;
3174	vp->v_iflag &= ~VI_AGE;
3175	vp->v_iflag |= VI_FREE;
3176	mtx_unlock(&vnode_free_list_mtx);
3177}
3178
3179/*
3180 * Opposite of vfree() - mark a vnode as in use.
3181 */
3182static void
3183vbusy(struct vnode *vp)
3184{
3185	CTR1(KTR_VFS, "vbusy vp %p", vp);
3186	ASSERT_VI_LOCKED(vp, "vbusy");
3187	VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free"));
3188	VNASSERT(vp->v_op != NULL, vp, ("vbusy: vnode already reclaimed."));
3189
3190	mtx_lock(&vnode_free_list_mtx);
3191	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
3192	freevnodes--;
3193	vp->v_iflag &= ~(VI_FREE|VI_AGE);
3194	mtx_unlock(&vnode_free_list_mtx);
3195}
3196
3197static void
3198destroy_vpollinfo(struct vpollinfo *vi)
3199{
3200	knlist_destroy(&vi->vpi_selinfo.si_note);
3201	mtx_destroy(&vi->vpi_lock);
3202	uma_zfree(vnodepoll_zone, vi);
3203}
3204
3205/*
3206 * Initialize per-vnode helper structure to hold poll-related state.
3207 */
3208void
3209v_addpollinfo(struct vnode *vp)
3210{
3211	struct vpollinfo *vi;
3212
3213	if (vp->v_pollinfo != NULL)
3214		return;
3215	vi = uma_zalloc(vnodepoll_zone, M_WAITOK);
3216	mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
3217	knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
3218	    vfs_knlunlock, vfs_knllocked);
3219	VI_LOCK(vp);
3220	if (vp->v_pollinfo != NULL) {
3221		VI_UNLOCK(vp);
3222		destroy_vpollinfo(vi);
3223		return;
3224	}
3225	vp->v_pollinfo = vi;
3226	VI_UNLOCK(vp);
3227}
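/*
 * Editorial note on the function above: the vpollinfo is allocated before
 * the vnode interlock is taken because uma_zalloc(..., M_WAITOK) may sleep;
 * the pointer is then re-checked under VI_LOCK() and the freshly allocated
 * structure is destroyed if another thread won the race.
 */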
3228
3229/*
3230 * Record a process's interest in events which might happen to
3231 * a vnode.  Because poll uses the historic select-style interface
3232 * internally, this routine serves as both the ``check for any
3233 * pending events'' and the ``record my interest in future events''
3234 * functions.  (These are done together, while the lock is held,
3235 * to avoid race conditions.)
3236 */
3237int
3238vn_pollrecord(struct vnode *vp, struct thread *td, int events)
3239{
3240
3241	v_addpollinfo(vp);
3242	mtx_lock(&vp->v_pollinfo->vpi_lock);
3243	if (vp->v_pollinfo->vpi_revents & events) {
3244		/*
3245		 * This leaves events we are not interested
3246		 * in available for the other process which
3247		 * presumably had requested them
3248		 * (otherwise they would never have been
3249		 * recorded).
3250		 */
3251		events &= vp->v_pollinfo->vpi_revents;
3252		vp->v_pollinfo->vpi_revents &= ~events;
3253
3254		mtx_unlock(&vp->v_pollinfo->vpi_lock);
3255		return (events);
3256	}
3257	vp->v_pollinfo->vpi_events |= events;
3258	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3259	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3260	return (0);
3261}
3262
3263/*
3264 * Routine to create and manage a filesystem syncer vnode.
3265 */
3266#define sync_close ((int (*)(struct  vop_close_args *))nullop)
3267static int	sync_fsync(struct  vop_fsync_args *);
3268static int	sync_inactive(struct  vop_inactive_args *);
3269static int	sync_reclaim(struct  vop_reclaim_args *);
3270
3271static struct vop_vector sync_vnodeops = {
3272	.vop_bypass =	VOP_EOPNOTSUPP,
3273	.vop_close =	sync_close,		/* close */
3274	.vop_fsync =	sync_fsync,		/* fsync */
3275	.vop_inactive =	sync_inactive,	/* inactive */
3276	.vop_reclaim =	sync_reclaim,	/* reclaim */
3277	.vop_lock1 =	vop_stdlock,	/* lock */
3278	.vop_unlock =	vop_stdunlock,	/* unlock */
3279	.vop_islocked =	vop_stdislocked,	/* islocked */
3280};
3281
3282/*
3283 * Create a new filesystem syncer vnode for the specified mount point.
3284 */
3285int
3286vfs_allocate_syncvnode(struct mount *mp)
3287{
3288	struct vnode *vp;
3289	struct bufobj *bo;
3290	static long start, incr, next;
3291	int error;
3292
3293	/* Allocate a new vnode */
3294	if ((error = getnewvnode("syncer", mp, &sync_vnodeops, &vp)) != 0) {
3295		mp->mnt_syncer = NULL;
3296		return (error);
3297	}
3298	vp->v_type = VNON;
3299	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3300	vp->v_vflag |= VV_FORCEINSMQ;
3301	error = insmntque(vp, mp);
3302	if (error != 0)
3303		panic("vfs_allocate_syncvnode: insmntque failed");
3304	vp->v_vflag &= ~VV_FORCEINSMQ;
3305	VOP_UNLOCK(vp, 0);
3306	/*
3307	 * Place the vnode onto the syncer worklist. We attempt to
3308	 * scatter them about on the list so that they will go off
3309	 * at evenly distributed times even if all the filesystems
3310	 * are mounted at once.
3311	 */
3312	next += incr;
3313	if (next == 0 || next > syncer_maxdelay) {
3314		start /= 2;
3315		incr /= 2;
3316		if (start == 0) {
3317			start = syncer_maxdelay / 2;
3318			incr = syncer_maxdelay;
3319		}
3320		next = start;
3321	}
3322	bo = &vp->v_bufobj;
3323	BO_LOCK(bo);
3324	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
3325	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
3326	mtx_lock(&sync_mtx);
3327	sync_vnode_count++;
3328	mtx_unlock(&sync_mtx);
3329	BO_UNLOCK(bo);
3330	mp->mnt_syncer = vp;
3331	return (0);
3332}
3333
3334/*
3335 * Do a lazy sync of the filesystem.
3336 */
3337static int
3338sync_fsync(struct vop_fsync_args *ap)
3339{
3340	struct vnode *syncvp = ap->a_vp;
3341	struct mount *mp = syncvp->v_mount;
3342	int error;
3343	struct bufobj *bo;
3344
3345	/*
3346	 * We only need to do something if this is a lazy evaluation.
3347	 */
3348	if (ap->a_waitfor != MNT_LAZY)
3349		return (0);
3350
3351	/*
3352	 * Move ourselves to the back of the sync list.
3353	 */
3354	bo = &syncvp->v_bufobj;
3355	BO_LOCK(bo);
3356	vn_syncer_add_to_worklist(bo, syncdelay);
3357	BO_UNLOCK(bo);
3358
3359	/*
3360	 * Walk the list of vnodes pushing all that are dirty and
3361	 * not already on the sync list.
3362	 */
3363	mtx_lock(&mountlist_mtx);
3364	if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
3365		mtx_unlock(&mountlist_mtx);
3366		return (0);
3367	}
3368	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3369		vfs_unbusy(mp);
3370		return (0);
3371	}
3372	MNT_ILOCK(mp);
3373	mp->mnt_noasync++;
3374	mp->mnt_kern_flag &= ~MNTK_ASYNC;
3375	MNT_IUNLOCK(mp);
3376	vfs_msync(mp, MNT_NOWAIT);
3377	error = VFS_SYNC(mp, MNT_LAZY, ap->a_td);
3378	MNT_ILOCK(mp);
3379	mp->mnt_noasync--;
3380	if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
3381		mp->mnt_kern_flag |= MNTK_ASYNC;
3382	MNT_IUNLOCK(mp);
3383	vn_finished_write(mp);
3384	vfs_unbusy(mp);
3385	return (error);
3386}
3387
3388/*
3389 * The syncer vnode is no longer referenced.
3390 */
3391static int
3392sync_inactive(struct vop_inactive_args *ap)
3393{
3394
3395	vgone(ap->a_vp);
3396	return (0);
3397}
3398
3399/*
3400 * The syncer vnode is no longer needed and is being decommissioned.
3401 *
3402 * Modifications to the worklist must be protected by sync_mtx.
3403 */
3404static int
3405sync_reclaim(struct vop_reclaim_args *ap)
3406{
3407	struct vnode *vp = ap->a_vp;
3408	struct bufobj *bo;
3409
3410	bo = &vp->v_bufobj;
3411	BO_LOCK(bo);
3412	vp->v_mount->mnt_syncer = NULL;
3413	if (bo->bo_flag & BO_ONWORKLST) {
3414		mtx_lock(&sync_mtx);
3415		LIST_REMOVE(bo, bo_synclist);
3416		syncer_worklist_len--;
3417		sync_vnode_count--;
3418		mtx_unlock(&sync_mtx);
3419		bo->bo_flag &= ~BO_ONWORKLST;
3420	}
3421	BO_UNLOCK(bo);
3422
3423	return (0);
3424}
3425
3426/*
3427 * Check if vnode represents a disk device
3428 */
3429int
3430vn_isdisk(struct vnode *vp, int *errp)
3431{
3432	int error;
3433
3434	error = 0;
3435	dev_lock();
3436	if (vp->v_type != VCHR)
3437		error = ENOTBLK;
3438	else if (vp->v_rdev == NULL)
3439		error = ENXIO;
3440	else if (vp->v_rdev->si_devsw == NULL)
3441		error = ENXIO;
3442	else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
3443		error = ENOTBLK;
3444	dev_unlock();
3445	if (errp != NULL)
3446		*errp = error;
3447	return (error == 0);
3448}
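/*
 * Example use (sketch; the caller and the devvp variable shown are
 * assumptions for illustration): a filesystem mount routine validating the
 * device vnode it was handed might do
 *
 *	if (!vn_isdisk(devvp, &error))
 *		return (error);
 *
 * where error is ENOTBLK or ENXIO on failure and 0 on success.
 */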
3449
3450/*
3451 * Common filesystem object access control check routine.  Accepts a
3452 * vnode's type, "mode", uid and gid, requested access mode, credentials,
3453 * and optional call-by-reference privused argument allowing vaccess()
3454 * to indicate to the caller whether privilege was used to satisfy the
3455 * request (obsoleted).  Returns 0 on success, or an errno on failure.
3456 *
3457 * The ifdef'd CAPABILITIES version is here for reference, but is not
3458 * actually used.
3459 */
3460int
3461vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
3462    accmode_t accmode, struct ucred *cred, int *privused)
3463{
3464	accmode_t dac_granted;
3465	accmode_t priv_granted;
3466
3467	/*
3468	 * Look for a normal, non-privileged way to access the file/directory
3469	 * as requested.  If it exists, go with that.
3470	 */
3471
3472	if (privused != NULL)
3473		*privused = 0;
3474
3475	dac_granted = 0;
3476
3477	/* Check the owner. */
3478	if (cred->cr_uid == file_uid) {
3479		dac_granted |= VADMIN;
3480		if (file_mode & S_IXUSR)
3481			dac_granted |= VEXEC;
3482		if (file_mode & S_IRUSR)
3483			dac_granted |= VREAD;
3484		if (file_mode & S_IWUSR)
3485			dac_granted |= (VWRITE | VAPPEND);
3486
3487		if ((accmode & dac_granted) == accmode)
3488			return (0);
3489
3490		goto privcheck;
3491	}
3492
3493	/* Otherwise, check the groups (first match) */
3494	if (groupmember(file_gid, cred)) {
3495		if (file_mode & S_IXGRP)
3496			dac_granted |= VEXEC;
3497		if (file_mode & S_IRGRP)
3498			dac_granted |= VREAD;
3499		if (file_mode & S_IWGRP)
3500			dac_granted |= (VWRITE | VAPPEND);
3501
3502		if ((accmode & dac_granted) == accmode)
3503			return (0);
3504
3505		goto privcheck;
3506	}
3507
3508	/* Otherwise, check everyone else. */
3509	if (file_mode & S_IXOTH)
3510		dac_granted |= VEXEC;
3511	if (file_mode & S_IROTH)
3512		dac_granted |= VREAD;
3513	if (file_mode & S_IWOTH)
3514		dac_granted |= (VWRITE | VAPPEND);
3515	if ((accmode & dac_granted) == accmode)
3516		return (0);
3517
3518privcheck:
3519	/*
3520	 * Build a privilege mask to determine if the set of privileges
3521	 * satisfies the requirements when combined with the granted mask
3522	 * from above.  For each privilege, if the privilege is required,
3523	 * bitwise or the request type onto the priv_granted mask.
3524	 */
3525	priv_granted = 0;
3526
3527	if (type == VDIR) {
3528		/*
3529		 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
3530		 * requests, instead of PRIV_VFS_EXEC.
3531		 */
3532		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3533		    !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
3534			priv_granted |= VEXEC;
3535	} else {
3536		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3537		    !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
3538			priv_granted |= VEXEC;
3539	}
3540
3541	if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
3542	    !priv_check_cred(cred, PRIV_VFS_READ, 0))
3543		priv_granted |= VREAD;
3544
3545	if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3546	    !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
3547		priv_granted |= (VWRITE | VAPPEND);
3548
3549	if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3550	    !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
3551		priv_granted |= VADMIN;
3552
3553	if ((accmode & (priv_granted | dac_granted)) == accmode) {
3554		/* XXX audit: privilege used */
3555		if (privused != NULL)
3556			*privused = 1;
3557		return (0);
3558	}
3559
3560	return ((accmode & VADMIN) ? EPERM : EACCES);
3561}
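/*
 * Worked example (editorial): for a regular file with mode 0640 owned by
 * uid 1001 / gid 20, a caller whose cred is not uid 1001 but is a member of
 * gid 20 requesting VREAD | VWRITE collects only VREAD in dac_granted from
 * the group bits, falls through to privcheck:, and succeeds only if
 * priv_check_cred(cred, PRIV_VFS_WRITE, 0) grants VWRITE; otherwise the
 * result is EACCES (EPERM is reserved for failed VADMIN requests).
 */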
3562
3563/*
3564 * Credential check based on process requesting service, and per-attribute
3565 * permissions.
3566 */
3567int
3568extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
3569    struct thread *td, accmode_t accmode)
3570{
3571
3572	/*
3573	 * Kernel-invoked always succeeds.
3574	 */
3575	if (cred == NOCRED)
3576		return (0);
3577
3578	/*
3579	 * Do not allow privileged processes in jail to directly manipulate
3580	 * system attributes.
3581	 */
3582	switch (attrnamespace) {
3583	case EXTATTR_NAMESPACE_SYSTEM:
3584		/* Potentially should be: return (EPERM); */
3585		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
3586	case EXTATTR_NAMESPACE_USER:
3587		return (VOP_ACCESS(vp, accmode, cred, td));
3588	default:
3589		return (EPERM);
3590	}
3591}
3592
3593#ifdef DEBUG_VFS_LOCKS
3594/*
3595 * This only exists to suppress warnings from unlocked specfs accesses.  It is
3596 * no longer ok to have an unlocked VFS.
3597 */
3598#define	IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL ||		\
3599	(vp)->v_type == VCHR ||	(vp)->v_type == VBAD)
3600
3601int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
3602SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "");
3603
3604int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
3605SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "");
3606
3607int vfs_badlock_print = 1;	/* Print lock violations. */
3608SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "");
3609
3610#ifdef KDB
3611int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
3612SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "");
3613#endif
3614
3615static void
3616vfs_badlock(const char *msg, const char *str, struct vnode *vp)
3617{
3618
3619#ifdef KDB
3620	if (vfs_badlock_backtrace)
3621		kdb_backtrace();
3622#endif
3623	if (vfs_badlock_print)
3624		printf("%s: %p %s\n", str, (void *)vp, msg);
3625	if (vfs_badlock_ddb)
3626		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
3627}
3628
3629void
3630assert_vi_locked(struct vnode *vp, const char *str)
3631{
3632
3633	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
3634		vfs_badlock("interlock is not locked but should be", str, vp);
3635}
3636
3637void
3638assert_vi_unlocked(struct vnode *vp, const char *str)
3639{
3640
3641	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
3642		vfs_badlock("interlock is locked but should not be", str, vp);
3643}
3644
3645void
3646assert_vop_locked(struct vnode *vp, const char *str)
3647{
3648
3649	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == 0)
3650		vfs_badlock("is not locked but should be", str, vp);
3651}
3652
3653void
3654assert_vop_unlocked(struct vnode *vp, const char *str)
3655{
3656
3657	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
3658		vfs_badlock("is locked but should not be", str, vp);
3659}
3660
3661void
3662assert_vop_elocked(struct vnode *vp, const char *str)
3663{
3664
3665	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
3666		vfs_badlock("is not exclusive locked but should be", str, vp);
3667}
3668
3669#if 0
3670void
3671assert_vop_elocked_other(struct vnode *vp, const char *str)
3672{
3673
3674	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER)
3675		vfs_badlock("is not exclusive locked by another thread",
3676		    str, vp);
3677}
3678
3679void
3680assert_vop_slocked(struct vnode *vp, const char *str)
3681{
3682
3683	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED)
3684		vfs_badlock("is not locked shared but should be", str, vp);
3685}
3686#endif /* 0 */
3687#endif /* DEBUG_VFS_LOCKS */
3688
3689void
3690vop_rename_pre(void *ap)
3691{
3692	struct vop_rename_args *a = ap;
3693
3694#ifdef DEBUG_VFS_LOCKS
3695	if (a->a_tvp)
3696		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
3697	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
3698	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
3699	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
3700
3701	/* Check the source (from). */
3702	if (a->a_tdvp != a->a_fdvp && a->a_tvp != a->a_fdvp)
3703		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
3704	if (a->a_tvp != a->a_fvp)
3705		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
3706
3707	/* Check the target. */
3708	if (a->a_tvp)
3709		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
3710	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
3711#endif
3712	if (a->a_tdvp != a->a_fdvp)
3713		vhold(a->a_fdvp);
3714	if (a->a_tvp != a->a_fvp)
3715		vhold(a->a_fvp);
3716	vhold(a->a_tdvp);
3717	if (a->a_tvp)
3718		vhold(a->a_tvp);
3719}
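/*
 * Editorial note: the vhold() references taken above are paired with the
 * vdrop() calls in vop_rename_post() below, so the vnodes involved in the
 * rename cannot be recycled while the operation is in flight.
 */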
3720
3721void
3722vop_strategy_pre(void *ap)
3723{
3724#ifdef DEBUG_VFS_LOCKS
3725	struct vop_strategy_args *a;
3726	struct buf *bp;
3727
3728	a = ap;
3729	bp = a->a_bp;
3730
3731	/*
3732	 * Cluster ops lock their component buffers but not the IO container.
3733	 */
3734	if ((bp->b_flags & B_CLUSTER) != 0)
3735		return;
3736
3737	if (!BUF_ISLOCKED(bp)) {
3738		if (vfs_badlock_print)
3739			printf(
3740			    "VOP_STRATEGY: bp is not locked but should be\n");
3741		if (vfs_badlock_ddb)
3742			kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
3743	}
3744#endif
3745}
3746
3747void
3748vop_lookup_pre(void *ap)
3749{
3750#ifdef DEBUG_VFS_LOCKS
3751	struct vop_lookup_args *a;
3752	struct vnode *dvp;
3753
3754	a = ap;
3755	dvp = a->a_dvp;
3756	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
3757	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
3758#endif
3759}
3760
3761void
3762vop_lookup_post(void *ap, int rc)
3763{
3764#ifdef DEBUG_VFS_LOCKS
3765	struct vop_lookup_args *a;
3766	struct vnode *dvp;
3767	struct vnode *vp;
3768
3769	a = ap;
3770	dvp = a->a_dvp;
3771	vp = *(a->a_vpp);
3772
3773	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
3774	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
3775
3776	if (!rc)
3777		ASSERT_VOP_LOCKED(vp, "VOP_LOOKUP (child)");
3778#endif
3779}
3780
3781void
3782vop_lock_pre(void *ap)
3783{
3784#ifdef DEBUG_VFS_LOCKS
3785	struct vop_lock1_args *a = ap;
3786
3787	if ((a->a_flags & LK_INTERLOCK) == 0)
3788		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
3789	else
3790		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
3791#endif
3792}
3793
3794void
3795vop_lock_post(void *ap, int rc)
3796{
3797#ifdef DEBUG_VFS_LOCKS
3798	struct vop_lock1_args *a = ap;
3799
3800	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
3801	if (rc == 0)
3802		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
3803#endif
3804}
3805
3806void
3807vop_unlock_pre(void *ap)
3808{
3809#ifdef DEBUG_VFS_LOCKS
3810	struct vop_unlock_args *a = ap;
3811
3812	if (a->a_flags & LK_INTERLOCK)
3813		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
3814	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
3815#endif
3816}
3817
3818void
3819vop_unlock_post(void *ap, int rc)
3820{
3821#ifdef DEBUG_VFS_LOCKS
3822	struct vop_unlock_args *a = ap;
3823
3824	if (a->a_flags & LK_INTERLOCK)
3825		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
3826#endif
3827}
3828
3829void
3830vop_create_post(void *ap, int rc)
3831{
3832	struct vop_create_args *a = ap;
3833
3834	if (!rc)
3835		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
3836}
3837
3838void
3839vop_link_post(void *ap, int rc)
3840{
3841	struct vop_link_args *a = ap;
3842
3843	if (!rc) {
3844		VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
3845		VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
3846	}
3847}
3848
3849void
3850vop_mkdir_post(void *ap, int rc)
3851{
3852	struct vop_mkdir_args *a = ap;
3853
3854	if (!rc)
3855		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
3856}
3857
3858void
3859vop_mknod_post(void *ap, int rc)
3860{
3861	struct vop_mknod_args *a = ap;
3862
3863	if (!rc)
3864		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
3865}
3866
3867void
3868vop_remove_post(void *ap, int rc)
3869{
3870	struct vop_remove_args *a = ap;
3871
3872	if (!rc) {
3873		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
3874		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
3875	}
3876}
3877
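/*
 * Post the rename notes and drop the vnode holds acquired in
 * vop_rename_pre().
 */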
3878void
3879vop_rename_post(void *ap, int rc)
3880{
3881	struct vop_rename_args *a = ap;
3882
3883	if (!rc) {
3884		VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
3885		VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
3886		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
3887		if (a->a_tvp)
3888			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
3889	}
3890	if (a->a_tdvp != a->a_fdvp)
3891		vdrop(a->a_fdvp);
3892	if (a->a_tvp != a->a_fvp)
3893		vdrop(a->a_fvp);
3894	vdrop(a->a_tdvp);
3895	if (a->a_tvp)
3896		vdrop(a->a_tvp);
3897}
3898
3899void
3900vop_rmdir_post(void *ap, int rc)
3901{
3902	struct vop_rmdir_args *a = ap;
3903
3904	if (!rc) {
3905		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
3906		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
3907	}
3908}
3909
3910void
3911vop_setattr_post(void *ap, int rc)
3912{
3913	struct vop_setattr_args *a = ap;
3914
3915	if (!rc)
3916		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
3917}
3918
3919void
3920vop_symlink_post(void *ap, int rc)
3921{
3922	struct vop_symlink_args *a = ap;
3923
3924	if (!rc)
3925		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
3926}
3927
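/*
 * Knote list for filesystem-wide events.  vfs_event_signal() posts
 * events to it; fs_filtops implements the matching kevent filter.
 */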
3928static struct knlist fs_knlist;
3929
3930static void
3931vfs_event_init(void *arg)
3932{
3933	knlist_init(&fs_knlist, NULL, NULL, NULL, NULL);
3934}
3935/* XXX - correct order? */
3936SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
3937
3938void
3939vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data __unused)
3940{
3941
3942	KNOTE_UNLOCKED(&fs_knlist, event);
3943}
3944
3945static int	filt_fsattach(struct knote *kn);
3946static void	filt_fsdetach(struct knote *kn);
3947static int	filt_fsevent(struct knote *kn, long hint);
3948
3949struct filterops fs_filtops =
3950	{ 0, filt_fsattach, filt_fsdetach, filt_fsevent };
3951
3952static int
3953filt_fsattach(struct knote *kn)
3954{
3955
3956	kn->kn_flags |= EV_CLEAR;
3957	knlist_add(&fs_knlist, kn, 0);
3958	return (0);
3959}
3960
3961static void
3962filt_fsdetach(struct knote *kn)
3963{
3964
3965	knlist_remove(&fs_knlist, kn, 0);
3966}
3967
3968static int
3969filt_fsevent(struct knote *kn, long hint)
3970{
3971
3972	kn->kn_fflags |= hint;
3973	return (kn->kn_fflags != 0);
3974}
3975
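/*
 * Handler for the vfs.ctl sysctl: look up the mount point by fsid,
 * verify the filesystem type name and forward the request to the
 * filesystem itself via VFS_SYSCTL().
 */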
3976static int
3977sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
3978{
3979	struct vfsidctl vc;
3980	int error;
3981	struct mount *mp;
3982
3983	error = SYSCTL_IN(req, &vc, sizeof(vc));
3984	if (error)
3985		return (error);
3986	if (vc.vc_vers != VFS_CTL_VERS1)
3987		return (EINVAL);
3988	mp = vfs_getvfs(&vc.vc_fsid);
3989	if (mp == NULL)
3990		return (ENOENT);
3991	/* Ensure that a specific sysctl goes to the right filesystem. */
3992	if (strcmp(vc.vc_fstypename, "*") != 0 &&
3993	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
3994		vfs_rel(mp);
3995		return (EINVAL);
3996	}
3997	VCTLTOREQ(&vc, req);
3998	error = VFS_SYSCTL(mp, vc.vc_op, req);
3999	vfs_rel(mp);
4000	return (error);
4001}
4002
4003SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "",
4004    "Sysctl by fsid");
4005
4006/*
4007 * Function to initialize a va_filerev field sensibly.
4008 * XXX: Wouldn't a random number make a lot more sense?
4009 */
4010u_quad_t
4011init_va_filerev(void)
4012{
4013	struct bintime bt;
4014
4015	getbinuptime(&bt);
4016	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
4017}
4018
4019static int	filt_vfsread(struct knote *kn, long hint);
4020static int	filt_vfswrite(struct knote *kn, long hint);
4021static int	filt_vfsvnode(struct knote *kn, long hint);
4022static void	filt_vfsdetach(struct knote *kn);
4023static struct filterops vfsread_filtops =
4024	{ 1, NULL, filt_vfsdetach, filt_vfsread };
4025static struct filterops vfswrite_filtops =
4026	{ 1, NULL, filt_vfsdetach, filt_vfswrite };
4027static struct filterops vfsvnode_filtops =
4028	{ 1, NULL, filt_vfsdetach, filt_vfsvnode };
4029
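/*
 * Lock, unlock and assert-locked callbacks for the per-vnode knote
 * list; knote activity is serialized on the exclusive vnode lock.
 */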
4030static void
4031vfs_knllock(void *arg)
4032{
4033	struct vnode *vp = arg;
4034
4035	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4036}
4037
4038static void
4039vfs_knlunlock(void *arg)
4040{
4041	struct vnode *vp = arg;
4042
4043	VOP_UNLOCK(vp, 0);
4044}
4045
4046static int
4047vfs_knllocked(void *arg)
4048{
4049	struct vnode *vp = arg;
4050
4051	return (VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
4052}
4053
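/*
 * Attach a knote to a vnode for VOP_KQFILTER(): select the filter ops
 * matching the requested filter and add the knote to the vnode's
 * pollinfo knote list.
 */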
4054int
4055vfs_kqfilter(struct vop_kqfilter_args *ap)
4056{
4057	struct vnode *vp = ap->a_vp;
4058	struct knote *kn = ap->a_kn;
4059	struct knlist *knl;
4060
4061	switch (kn->kn_filter) {
4062	case EVFILT_READ:
4063		kn->kn_fop = &vfsread_filtops;
4064		break;
4065	case EVFILT_WRITE:
4066		kn->kn_fop = &vfswrite_filtops;
4067		break;
4068	case EVFILT_VNODE:
4069		kn->kn_fop = &vfsvnode_filtops;
4070		break;
4071	default:
4072		return (EINVAL);
4073	}
4074
4075	kn->kn_hook = (caddr_t)vp;
4076
4077	v_addpollinfo(vp);
4078	if (vp->v_pollinfo == NULL)
4079		return (ENOMEM);
4080	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
4081	knlist_add(knl, kn, 0);
4082
4083	return (0);
4084}
4085
4086/*
4087 * Detach a knote from its vnode.
4088 */
4089static void
4090filt_vfsdetach(struct knote *kn)
4091{
4092	struct vnode *vp = (struct vnode *)kn->kn_hook;
4093
4094	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
4095	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
4096}
4097
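/*
 * EVFILT_READ filter: report the number of readable bytes between the
 * current file offset and the end of the file.
 */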
4098/*ARGSUSED*/
4099static int
4100filt_vfsread(struct knote *kn, long hint)
4101{
4102	struct vnode *vp = (struct vnode *)kn->kn_hook;
4103	struct vattr va;
4104
4105	/*
4106	 * The filesystem is gone, so set the EOF flag and schedule
4107	 * the knote for deletion.
4108	 */
4109	if (hint == NOTE_REVOKE) {
4110		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4111		return (1);
4112	}
4113
4114	if (VOP_GETATTR(vp, &va, curthread->td_ucred))
4115		return (0);
4116
4117	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
4118	return (kn->kn_data != 0);
4119}
4120
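/*
 * EVFILT_WRITE filter: a vnode is always considered writable.
 */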
4121/*ARGSUSED*/
4122static int
4123filt_vfswrite(struct knote *kn, long hint)
4124{
4125	/*
4126	 * The filesystem is gone, so set the EOF flag and schedule
4127	 * the knote for deletion.
4128	 */
4129	if (hint == NOTE_REVOKE)
4130		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4131
4132	kn->kn_data = 0;
4133	return (1);
4134}
4135
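/*
 * EVFILT_VNODE filter: latch the hints the knote subscribed to and fire
 * once any of them is pending; NOTE_REVOKE also sets EV_EOF.
 */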
4136static int
4137filt_vfsvnode(struct knote *kn, long hint)
4138{
4139	if (kn->kn_sfflags & hint)
4140		kn->kn_fflags |= hint;
4141	if (hint == NOTE_REVOKE) {
4142		kn->kn_flags |= EV_EOF;
4143		return (1);
4144	}
4145	return (kn->kn_fflags != 0);
4146}
4147
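/*
 * Helper for VOP_READDIR() implementations: copy a single dirent into
 * the caller's uio and, if cookies were requested, grow the cookie
 * array and append the entry's cookie.
 */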
4148int
4149vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
4150{
4151	int error;
4152
4153	if (dp->d_reclen > ap->a_uio->uio_resid)
4154		return (ENAMETOOLONG);
4155	error = uiomove(dp, dp->d_reclen, ap->a_uio);
4156	if (error) {
4157		if (ap->a_ncookies != NULL) {
4158			if (ap->a_cookies != NULL)
4159				free(ap->a_cookies, M_TEMP);
4160			ap->a_cookies = NULL;
4161			*ap->a_ncookies = 0;
4162		}
4163		return (error);
4164	}
4165	if (ap->a_ncookies == NULL)
4166		return (0);
4167
4168	KASSERT(ap->a_cookies,
4169	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
4170
4171	*ap->a_cookies = realloc(*ap->a_cookies,
4172	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
4173	(*ap->a_cookies)[*ap->a_ncookies] = off;
4174	return (0);
4175}
4176
4177/*
4178 * Mark the file's access time for update if the filesystem supports
4179 * VA_MARK_ATIME.  This is used by execve() and mmap(), so we want to
4180 * avoid the synchronous I/O implied by setting va_atime directly,
4181 * for the sake of efficiency.
4182 */
4183void
4184vfs_mark_atime(struct vnode *vp, struct ucred *cred)
4185{
4186	struct vattr atimeattr;
4187
4188	if ((vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) {
4189		VATTR_NULL(&atimeattr);
4190		atimeattr.va_vaflags |= VA_MARK_ATIME;
4191		(void)VOP_SETATTR(vp, &atimeattr, cred);
4192	}
4193}
4194