vfs_subr.c revision 249218
1/*-
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
35 */
36
37/*
38 * External virtual filesystem routines
39 */
40
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD: head/sys/kern/vfs_subr.c 249218 2013-04-06 22:21:23Z jeff $");
43
44#include "opt_compat.h"
45#include "opt_ddb.h"
46#include "opt_watchdog.h"
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/bio.h>
51#include <sys/buf.h>
52#include <sys/condvar.h>
53#include <sys/conf.h>
54#include <sys/dirent.h>
55#include <sys/event.h>
56#include <sys/eventhandler.h>
57#include <sys/extattr.h>
58#include <sys/file.h>
59#include <sys/fcntl.h>
60#include <sys/jail.h>
61#include <sys/kdb.h>
62#include <sys/kernel.h>
63#include <sys/kthread.h>
64#include <sys/lockf.h>
65#include <sys/malloc.h>
66#include <sys/mount.h>
67#include <sys/namei.h>
68#include <sys/priv.h>
69#include <sys/reboot.h>
70#include <sys/rwlock.h>
71#include <sys/sched.h>
72#include <sys/sleepqueue.h>
73#include <sys/smp.h>
74#include <sys/stat.h>
75#include <sys/sysctl.h>
76#include <sys/syslog.h>
77#include <sys/vmmeter.h>
78#include <sys/vnode.h>
79#include <sys/watchdog.h>
80
81#include <machine/stdarg.h>
82
83#include <security/mac/mac_framework.h>
84
85#include <vm/vm.h>
86#include <vm/vm_object.h>
87#include <vm/vm_extern.h>
88#include <vm/pmap.h>
89#include <vm/vm_map.h>
90#include <vm/vm_page.h>
91#include <vm/vm_kern.h>
92#include <vm/uma.h>
93
94#ifdef DDB
95#include <ddb/ddb.h>
96#endif
97
98static void	delmntque(struct vnode *vp);
99static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
100		    int slpflag, int slptimeo);
101static void	syncer_shutdown(void *arg, int howto);
102static int	vtryrecycle(struct vnode *vp);
103static void	v_incr_usecount(struct vnode *);
104static void	v_decr_usecount(struct vnode *);
105static void	v_decr_useonly(struct vnode *);
106static void	v_upgrade_usecount(struct vnode *);
107static void	vnlru_free(int);
108static void	vgonel(struct vnode *);
109static void	vfs_knllock(void *arg);
110static void	vfs_knlunlock(void *arg);
111static void	vfs_knl_assert_locked(void *arg);
112static void	vfs_knl_assert_unlocked(void *arg);
113static void	destroy_vpollinfo(struct vpollinfo *vi);
114
115/*
116 * Number of vnodes in existence.  Increased whenever getnewvnode()
117 * allocates a new vnode, decreased in vdropl() for a VI_DOOMED vnode.
118 */
119static unsigned long	numvnodes;
120
121SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
122    "Number of vnodes in existence");
123
124/*
125 * Conversion tables for conversion from vnode types to inode formats
126 * and back.
127 */
128enum vtype iftovt_tab[16] = {
129	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
130	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
131};
132int vttoif_tab[10] = {
133	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
134	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
135};
136
137/*
138 * List of vnodes that are ready for recycling.
139 */
140static TAILQ_HEAD(freelst, vnode) vnode_free_list;
141
142/*
143 * Free vnode target.  Free vnodes may simply be files which have been stat'd
144 * but not read.  This is somewhat common, and a small cache of such files
145 * should be kept to avoid recreation costs.
146 */
147static u_long wantfreevnodes;
148SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
149/* Number of vnodes in the free list. */
150static u_long freevnodes;
151SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0,
152    "Number of vnodes in the free list");
153
154static int vlru_allow_cache_src;
155SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
156    &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");
157
158/*
159 * Various variables used for debugging the new implementation of
160 * reassignbuf().
161 * XXX these are probably of (very) limited utility now.
162 */
163static int reassignbufcalls;
164SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
165    "Number of calls to reassignbuf");
166
167/*
168 * Cache for the mount type id assigned to NFS.  This is used for
169 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
170 */
171int	nfs_mount_type = -1;
172
173/* To keep more than one thread at a time from running vfs_getnewfsid */
174static struct mtx mntid_mtx;
175
176/*
177 * Lock for any access to the following:
178 *	vnode_free_list
179 *	numvnodes
180 *	freevnodes
181 */
182static struct mtx vnode_free_list_mtx;
183
184/* Publicly exported FS */
185struct nfs_public nfs_pub;
186
187/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
188static uma_zone_t vnode_zone;
189static uma_zone_t vnodepoll_zone;
190
191/*
192 * The workitem queue.
193 *
194 * It is useful to delay writes of file data and filesystem metadata
195 * for tens of seconds so that quickly created and deleted files need
196 * not waste disk bandwidth being created and removed. To realize this,
197 * we append vnodes to a "workitem" queue. When running with a soft
198 * updates implementation, most pending metadata dependencies should
199 * not wait for more than a few seconds. Thus, buffers for mounted block
200 * devices are delayed only about half the time that file data is delayed.
201 * Similarly, directory updates are more critical, so they are delayed
202 * only about a third of the time that file data is delayed. Thus, there are
203 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
204 * one each second (driven off the filesystem syncer process). The
205 * syncer_delayno variable indicates the next queue that is to be processed.
206 * Items that need to be processed soon are placed in this queue:
207 *
208 *	syncer_workitem_pending[syncer_delayno]
209 *
210 * A delay of fifteen seconds is done by placing the request fifteen
211 * entries later in the queue:
212 *
213 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
214 *
215 */
216static int syncer_delayno;
217static long syncer_mask;
218LIST_HEAD(synclist, bufobj);
219static struct synclist *syncer_workitem_pending;
220/*
221 * The sync_mtx protects:
222 *	bo->bo_synclist
223 *	sync_vnode_count
224 *	syncer_delayno
225 *	syncer_state
226 *	syncer_workitem_pending
227 *	syncer_worklist_len
228 *	rushjob
229 */
230static struct mtx sync_mtx;
231static struct cv sync_wakeup;
232
233#define SYNCER_MAXDELAY		32
234static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
235static int syncdelay = 30;		/* max time to delay syncing data */
236static int filedelay = 30;		/* time to delay syncing files */
237SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
238    "Time to delay syncing files (in seconds)");
239static int dirdelay = 29;		/* time to delay syncing directories */
240SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
241    "Time to delay syncing directories (in seconds)");
242static int metadelay = 28;		/* time to delay syncing metadata */
243SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
244    "Time to delay syncing metadata (in seconds)");
245static int rushjob;		/* number of slots to run ASAP */
246static int stat_rush_requests;	/* number of times I/O speeded up */
247SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
248    "Number of times I/O speeded up (rush requests)");
249
250/*
251 * When shutting down the syncer, run it at four times normal speed.
252 */
253#define SYNCER_SHUTDOWN_SPEEDUP		4
254static int sync_vnode_count;
255static int syncer_worklist_len;
256static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
257    syncer_state;
258
259/*
260 * Number of vnodes we want to exist at any one time.  This is mostly used
261 * to size hash tables in vnode-related code.  It is normally not used in
262 * getnewvnode(), as wantfreevnodes is normally nonzero.
263 *
264 * XXX desiredvnodes is historical cruft and should not exist.
265 */
266int desiredvnodes;
267SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
268    &desiredvnodes, 0, "Maximum number of vnodes");
269SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
270    &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
271static int vnlru_nowhere;
272SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
273    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
274
275/*
276 * Macros to control when a vnode is freed and recycled.  All require
277 * the vnode interlock.
278 */
279#define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
280#define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
281#define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)
282
283/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
284static int vnsz2log;
285
286/*
287 * Initialize the vnode management data structures.
288 *
289 * Reevaluate the following cap on the number of vnodes after the physical
290 * memory size exceeds 512GB.  In the limit, as the physical memory size
291 * grows, the ratio of physical pages to vnodes approaches sixteen to one.
292 */
293#ifndef	MAXVNODES_MAX
294#define	MAXVNODES_MAX	(512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
295#endif
296static void
297vntblinit(void *dummy __unused)
298{
299	u_int i;
300	int physvnodes, virtvnodes;
301
302	/*
303	 * Desiredvnodes is a function of the physical memory size and the
304	 * kernel's heap size.  Generally speaking, it scales with the
305	 * physical memory size.  The ratio of desiredvnodes to physical pages
306	 * is one to four until desiredvnodes exceeds 98,304.  Thereafter, the
307	 * marginal ratio of desiredvnodes to physical pages is one to
308	 * sixteen.  However, desiredvnodes is limited by the kernel's heap
309	 * size.  The memory required by desiredvnodes vnodes and vm objects
310	 * may not exceed one seventh of the kernel's heap size.
311	 */
312	physvnodes = maxproc + cnt.v_page_count / 16 + 3 * min(98304 * 4,
313	    cnt.v_page_count) / 16;
314	virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
315	    sizeof(struct vnode)));
316	desiredvnodes = min(physvnodes, virtvnodes);
317	if (desiredvnodes > MAXVNODES_MAX) {
318		if (bootverbose)
319			printf("Reducing kern.maxvnodes %d -> %d\n",
320			    desiredvnodes, MAXVNODES_MAX);
321		desiredvnodes = MAXVNODES_MAX;
322	}
323	wantfreevnodes = desiredvnodes / 4;
324	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
325	TAILQ_INIT(&vnode_free_list);
326	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
327	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
328	    NULL, NULL, UMA_ALIGN_PTR, 0);
329	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
330	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
331	/*
332	 * Initialize the filesystem syncer.
333	 */
334	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
335	    &syncer_mask);
336	syncer_maxdelay = syncer_mask + 1;
337	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
338	cv_init(&sync_wakeup, "syncer");
339	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
340		vnsz2log++;
341	vnsz2log--;
342}
343SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
344
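/*
 * Illustrative sketch (not from the original file, compiled out): the loop
 * at the end of vntblinit() above computes floor(log2(sizeof(struct vnode))).
 * The concrete size used below is only an assumption for the arithmetic;
 * getnewvnode() later uses the result as vp->v_hash = (uintptr_t)vp >> vnsz2log.
 */
#if 0
	int log2sz, i;

	log2sz = 0;
	for (i = 1; i <= 496 /* assumed sizeof(struct vnode) */; i <<= 1)
		log2sz++;
	log2sz--;			/* i walked 1..256, so log2sz == 8 */
#endif
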
345
346/*
347 * Mark a mount point as busy. Used to synchronize access and to delay
348 * unmounting. Note that mountlist_mtx is not released on failure.
349 *
350 * vfs_busy() is a custom lock; it can block the caller.
351 * vfs_busy() only sleeps if an unmount is active on the mount point.
352 * For a mountpoint mp, the vfs_busy-enforced lock is ordered before the
353 * lock of any vnode belonging to mp.
354 *
355 * Lookup uses vfs_busy() to traverse mount points.
356 * root fs			var fs
357 * / vnode lock		A	/ vnode lock (/var)		D
358 * /var vnode lock	B	/log vnode lock(/var/log)	E
359 * vfs_busy lock	C	vfs_busy lock			F
360 *
361 * Within each file system, the lock order is C->A->B and F->D->E.
362 *
363 * When traversing across mounts, the system follows that lock order:
364 *
365 *        C->A->B
366 *              |
367 *              +->F->D->E
368 *
369 * The lookup() process for namei("/var") illustrates the process:
370 *  VOP_LOOKUP() obtains B while A is held
371 *  vfs_busy() obtains a shared lock on F while A and B are held
372 *  vput() releases lock on B
373 *  vput() releases lock on A
374 *  VFS_ROOT() obtains lock on D while shared lock on F is held
375 *  vfs_unbusy() releases shared lock on F
376 *  vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
377 *    An attempt to lock A (instead of vp_crossmp) while D is held would
378 *    violate the global order, causing deadlocks.
379 *
380 * dounmount() locks B while F is drained.
381 */
382int
383vfs_busy(struct mount *mp, int flags)
384{
385
386	MPASS((flags & ~MBF_MASK) == 0);
387	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
388
389	MNT_ILOCK(mp);
390	MNT_REF(mp);
391	/*
392	 * If the mount point is currently being unmounted, sleep until its
393	 * fate is decided.  If the thread doing the unmounting fails, it will
394	 * clear the MNTK_UNMOUNT flag before waking us up, indicating that
395	 * this mount point has survived the unmount attempt and vfs_busy
396	 * should retry.  Otherwise the unmounting thread will set the
397	 * MNTK_REFEXPIRE flag in addition to MNTK_UNMOUNT, indicating that the
398	 * mount point is about to be really destroyed.  vfs_busy needs to
399	 * release its reference on the mount point in this case and return
400	 * with ENOENT, telling the caller that the mount point it tried to
401	 * busy is no longer valid.
402	 */
403	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
404		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
405			MNT_REL(mp);
406			MNT_IUNLOCK(mp);
407			CTR1(KTR_VFS, "%s: failed busying before sleeping",
408			    __func__);
409			return (ENOENT);
410		}
411		if (flags & MBF_MNTLSTLOCK)
412			mtx_unlock(&mountlist_mtx);
413		mp->mnt_kern_flag |= MNTK_MWAIT;
414		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
415		if (flags & MBF_MNTLSTLOCK)
416			mtx_lock(&mountlist_mtx);
417		MNT_ILOCK(mp);
418	}
419	if (flags & MBF_MNTLSTLOCK)
420		mtx_unlock(&mountlist_mtx);
421	mp->mnt_lockref++;
422	MNT_IUNLOCK(mp);
423	return (0);
424}
425
426/*
427 * Free a busy filesystem.
428 */
429void
430vfs_unbusy(struct mount *mp)
431{
432
433	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
434	MNT_ILOCK(mp);
435	MNT_REL(mp);
436	KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
437	mp->mnt_lockref--;
438	if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
439		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
440		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
441		mp->mnt_kern_flag &= ~MNTK_DRAINING;
442		wakeup(&mp->mnt_lockref);
443	}
444	MNT_IUNLOCK(mp);
445}
446
447/*
448 * Lookup a mount point by filesystem identifier.
449 */
450struct mount *
451vfs_getvfs(fsid_t *fsid)
452{
453	struct mount *mp;
454
455	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
456	mtx_lock(&mountlist_mtx);
457	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
458		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
459		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
460			vfs_ref(mp);
461			mtx_unlock(&mountlist_mtx);
462			return (mp);
463		}
464	}
465	mtx_unlock(&mountlist_mtx);
466	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
467	return ((struct mount *) 0);
468}
469
470/*
471 * Lookup a mount point by filesystem identifier, busying it before
472 * returning.
473 */
474struct mount *
475vfs_busyfs(fsid_t *fsid)
476{
477	struct mount *mp;
478	int error;
479
480	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
481	mtx_lock(&mountlist_mtx);
482	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
483		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
484		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
485			error = vfs_busy(mp, MBF_MNTLSTLOCK);
486			if (error) {
487				mtx_unlock(&mountlist_mtx);
488				return (NULL);
489			}
490			return (mp);
491		}
492	}
493	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
494	mtx_unlock(&mountlist_mtx);
495	return ((struct mount *) 0);
496}
497
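/*
 * Illustrative sketch (not from the original file, compiled out): the
 * expected pairing of vfs_busyfs() with vfs_unbusy().  vfs_busy() takes a
 * reference in addition to the busy count and vfs_unbusy() drops both, so
 * no separate vfs_rel() is needed on this path.  The helper name is
 * hypothetical.
 */
#if 0
static int
example_with_busy_mount(fsid_t *fsid)
{
	struct mount *mp;

	mp = vfs_busyfs(fsid);		/* referenced and busied, or NULL */
	if (mp == NULL)
		return (ENOENT);
	/* ... the mount point cannot finish unmounting here ... */
	vfs_unbusy(mp);			/* drops the busy count and reference */
	return (0);
}
#endif
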
498/*
499 * Check if a user can access privileged mount options.
500 */
501int
502vfs_suser(struct mount *mp, struct thread *td)
503{
504	int error;
505
506	/*
507	 * If the thread is jailed, but this is not a jail-friendly file
508	 * system, deny immediately.
509	 */
510	if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
511		return (EPERM);
512
513	/*
514	 * If the file system was mounted outside the jail of the calling
515	 * thread, deny immediately.
516	 */
517	if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
518		return (EPERM);
519
520	/*
521	 * If file system supports delegated administration, we don't check
522	 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
523	 * by the file system itself.
524	 * If this is not the user that did original mount, we check for
525	 * the PRIV_VFS_MOUNT_OWNER privilege.
526	 */
527	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
528	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
529		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
530			return (error);
531	}
532	return (0);
533}
534
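/*
 * Illustrative sketch (not from the original file, compiled out): the usual
 * way callers such as the mount-update path gate a privileged operation on
 * vfs_suser().  The surrounding helper is hypothetical.
 */
#if 0
static int
example_mount_admin(struct mount *mp, struct thread *td)
{
	int error;

	error = vfs_suser(mp, td);
	if (error != 0)
		return (error);		/* EPERM for unprivileged callers */
	/* ... proceed with the privileged mount operation ... */
	return (0);
}
#endif
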
535/*
536 * Get a new unique fsid.  Try to make its val[0] unique, since this value
537 * will be used to create fake device numbers for stat().  Also try (but
538 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
539 * support 16-bit device numbers.  We end up with unique val[0]'s for the
540 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
541 *
542 * Keep in mind that several mounts may be running in parallel.  Starting
543 * the search one past where the previous search terminated is both a
544 * micro-optimization and a defense against returning the same fsid to
545 * different mounts.
546 */
547void
548vfs_getnewfsid(struct mount *mp)
549{
550	static uint16_t mntid_base;
551	struct mount *nmp;
552	fsid_t tfsid;
553	int mtype;
554
555	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
556	mtx_lock(&mntid_mtx);
557	mtype = mp->mnt_vfc->vfc_typenum;
558	tfsid.val[1] = mtype;
559	mtype = (mtype & 0xFF) << 24;
560	for (;;) {
561		tfsid.val[0] = makedev(255,
562		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
563		mntid_base++;
564		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
565			break;
566		vfs_rel(nmp);
567	}
568	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
569	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
570	mtx_unlock(&mntid_mtx);
571}
572
573/*
574 * Knob to control the precision of file timestamps:
575 *
576 *   0 = seconds only; nanoseconds zeroed.
577 *   1 = seconds and nanoseconds, accurate within 1/HZ.
578 *   2 = seconds and nanoseconds, truncated to microseconds.
579 * >=3 = seconds and nanoseconds, maximum precision.
580 */
581enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
582
583static int timestamp_precision = TSP_SEC;
584SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
585    &timestamp_precision, 0, "File timestamp precision (0: seconds, "
586    "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
587    "3+: sec + ns (max. precision))");
588
589/*
590 * Get a current timestamp.
591 */
592void
593vfs_timestamp(struct timespec *tsp)
594{
595	struct timeval tv;
596
597	switch (timestamp_precision) {
598	case TSP_SEC:
599		tsp->tv_sec = time_second;
600		tsp->tv_nsec = 0;
601		break;
602	case TSP_HZ:
603		getnanotime(tsp);
604		break;
605	case TSP_USEC:
606		microtime(&tv);
607		TIMEVAL_TO_TIMESPEC(&tv, tsp);
608		break;
609	case TSP_NSEC:
610	default:
611		nanotime(tsp);
612		break;
613	}
614}
615
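/*
 * Illustrative sketch (not from the original file, compiled out): how a
 * filesystem typically consumes vfs_timestamp() when updating file times;
 * the resolution of ts follows the vfs.timestamp_precision sysctl above.
 */
#if 0
	struct timespec ts;

	vfs_timestamp(&ts);
	/* ... copy ts into the file's access/modification timestamps ... */
#endif
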
616/*
617 * Set vnode attributes to VNOVAL
618 */
619void
620vattr_null(struct vattr *vap)
621{
622
623	vap->va_type = VNON;
624	vap->va_size = VNOVAL;
625	vap->va_bytes = VNOVAL;
626	vap->va_mode = VNOVAL;
627	vap->va_nlink = VNOVAL;
628	vap->va_uid = VNOVAL;
629	vap->va_gid = VNOVAL;
630	vap->va_fsid = VNOVAL;
631	vap->va_fileid = VNOVAL;
632	vap->va_blocksize = VNOVAL;
633	vap->va_rdev = VNOVAL;
634	vap->va_atime.tv_sec = VNOVAL;
635	vap->va_atime.tv_nsec = VNOVAL;
636	vap->va_mtime.tv_sec = VNOVAL;
637	vap->va_mtime.tv_nsec = VNOVAL;
638	vap->va_ctime.tv_sec = VNOVAL;
639	vap->va_ctime.tv_nsec = VNOVAL;
640	vap->va_birthtime.tv_sec = VNOVAL;
641	vap->va_birthtime.tv_nsec = VNOVAL;
642	vap->va_flags = VNOVAL;
643	vap->va_gen = VNOVAL;
644	vap->va_vaflags = 0;
645}
646
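/*
 * Illustrative sketch (not from the original file, compiled out): the usual
 * VOP_SETATTR() pattern, where every attribute starts out as VNOVAL and only
 * the fields being changed are filled in.  "vp", "td" and "newsize" are
 * assumed to be in scope.
 */
#if 0
	struct vattr va;
	int error;

	vattr_null(&va);		/* equivalently, VATTR_NULL(&va) */
	va.va_size = newsize;		/* change only the size */
	error = VOP_SETATTR(vp, &va, td->td_ucred);
#endif
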
647/*
648 * This routine is called when we have too many vnodes.  It attempts
649 * to free <count> vnodes and will potentially free vnodes that still
650 * have VM backing store (VM backing store is typically the cause
651 * of a vnode blowout so we want to do this).  Therefore, this operation
652 * is not considered cheap.
653 *
654 * A number of conditions may prevent a vnode from being reclaimed.
655 * the buffer cache may have references on the vnode, a directory
656 * vnode may still have references due to the namei cache representing
657 * underlying files, or the vnode may be in active use.   It is not
658 * desireable to reuse such vnodes.  These conditions may cause the
659 * number of vnodes to reach some minimum value regardless of what
660 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
661 */
662static int
663vlrureclaim(struct mount *mp)
664{
665	struct vnode *vp;
666	int done;
667	int trigger;
668	int usevnodes;
669	int count;
670
671	/*
672	 * Calculate the trigger point, don't allow user
673	 * screwups to blow us up.   This prevents us from
674	 * recycling vnodes with lots of resident pages.  We
675	 * aren't trying to free memory, we are trying to
676	 * free vnodes.
677	 */
678	usevnodes = desiredvnodes;
679	if (usevnodes <= 0)
680		usevnodes = 1;
681	trigger = cnt.v_page_count * 2 / usevnodes;
682	done = 0;
683	vn_start_write(NULL, &mp, V_WAIT);
684	MNT_ILOCK(mp);
685	count = mp->mnt_nvnodelistsize / 10 + 1;
686	while (count != 0) {
687		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
688		while (vp != NULL && vp->v_type == VMARKER)
689			vp = TAILQ_NEXT(vp, v_nmntvnodes);
690		if (vp == NULL)
691			break;
692		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
693		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
694		--count;
695		if (!VI_TRYLOCK(vp))
696			goto next_iter;
697		/*
698		 * If it's been deconstructed already, it's still
699		 * referenced, or it exceeds the trigger, skip it.
700		 */
701		if (vp->v_usecount ||
702		    (!vlru_allow_cache_src &&
703			!LIST_EMPTY(&(vp)->v_cache_src)) ||
704		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
705		    vp->v_object->resident_page_count > trigger)) {
706			VI_UNLOCK(vp);
707			goto next_iter;
708		}
709		MNT_IUNLOCK(mp);
710		vholdl(vp);
711		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
712			vdrop(vp);
713			goto next_iter_mntunlocked;
714		}
715		VI_LOCK(vp);
716		/*
717		 * v_usecount may have been bumped after VOP_LOCK() dropped
718		 * the vnode interlock and before it was locked again.
719		 *
720		 * It is not necessary to recheck VI_DOOMED because it can
721		 * only be set by another thread that holds both the vnode
722		 * lock and vnode interlock.  If another thread has the
723		 * vnode lock before we get to VOP_LOCK() and obtains the
724		 * vnode interlock after VOP_LOCK() drops the vnode
725		 * interlock, the other thread will be unable to drop the
726		 * vnode lock before our VOP_LOCK() call fails.
727		 */
728		if (vp->v_usecount ||
729		    (!vlru_allow_cache_src &&
730			!LIST_EMPTY(&(vp)->v_cache_src)) ||
731		    (vp->v_object != NULL &&
732		    vp->v_object->resident_page_count > trigger)) {
733			VOP_UNLOCK(vp, LK_INTERLOCK);
734			goto next_iter_mntunlocked;
735		}
736		KASSERT((vp->v_iflag & VI_DOOMED) == 0,
737		    ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
738		vgonel(vp);
739		VOP_UNLOCK(vp, 0);
740		vdropl(vp);
741		done++;
742next_iter_mntunlocked:
743		if (!should_yield())
744			goto relock_mnt;
745		goto yield;
746next_iter:
747		if (!should_yield())
748			continue;
749		MNT_IUNLOCK(mp);
750yield:
751		kern_yield(PRI_USER);
752relock_mnt:
753		MNT_ILOCK(mp);
754	}
755	MNT_IUNLOCK(mp);
756	vn_finished_write(mp);
757	return done;
758}
759
760/*
761 * Attempt to keep the free list at wantfreevnodes length.
762 */
763static void
764vnlru_free(int count)
765{
766	struct vnode *vp;
767
768	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
769	for (; count > 0; count--) {
770		vp = TAILQ_FIRST(&vnode_free_list);
771		/*
772		 * The list can be modified while the free_list_mtx
773		 * has been dropped and vp could be NULL here.
774		 */
775		if (!vp)
776			break;
777		VNASSERT(vp->v_op != NULL, vp,
778		    ("vnlru_free: vnode already reclaimed."));
779		KASSERT((vp->v_iflag & VI_FREE) != 0,
780		    ("Removing vnode not on freelist"));
781		KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
782		    ("Mangling active vnode"));
783		TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
784		/*
785		 * Don't recycle if we can't get the interlock.
786		 */
787		if (!VI_TRYLOCK(vp)) {
788			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
789			continue;
790		}
791		VNASSERT(VCANRECYCLE(vp), vp,
792		    ("vp inconsistent on freelist"));
793		freevnodes--;
794		vp->v_iflag &= ~VI_FREE;
795		vholdl(vp);
796		mtx_unlock(&vnode_free_list_mtx);
797		VI_UNLOCK(vp);
798		vtryrecycle(vp);
799		/*
800		 * If the recycle succeeded, this vdrop will actually free
801		 * the vnode.  If not, it will simply place it back on
802		 * the free list.
803		 */
804		vdrop(vp);
805		mtx_lock(&vnode_free_list_mtx);
806	}
807}
808/*
809 * Attempt to recycle vnodes in a context that is always safe to block.
810 * Calling vlrureclaim() from the bowels of filesystem code has some
811 * interesting deadlock problems.
812 */
813static struct proc *vnlruproc;
814static int vnlruproc_sig;
815
816static void
817vnlru_proc(void)
818{
819	struct mount *mp, *nmp;
820	int done;
821	struct proc *p = vnlruproc;
822
823	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
824	    SHUTDOWN_PRI_FIRST);
825
826	for (;;) {
827		kproc_suspend_check(p);
828		mtx_lock(&vnode_free_list_mtx);
829		if (freevnodes > wantfreevnodes)
830			vnlru_free(freevnodes - wantfreevnodes);
831		if (numvnodes <= desiredvnodes * 9 / 10) {
832			vnlruproc_sig = 0;
833			wakeup(&vnlruproc_sig);
834			msleep(vnlruproc, &vnode_free_list_mtx,
835			    PVFS|PDROP, "vlruwt", hz);
836			continue;
837		}
838		mtx_unlock(&vnode_free_list_mtx);
839		done = 0;
840		mtx_lock(&mountlist_mtx);
841		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
842			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
843				nmp = TAILQ_NEXT(mp, mnt_list);
844				continue;
845			}
846			done += vlrureclaim(mp);
847			mtx_lock(&mountlist_mtx);
848			nmp = TAILQ_NEXT(mp, mnt_list);
849			vfs_unbusy(mp);
850		}
851		mtx_unlock(&mountlist_mtx);
852		if (done == 0) {
853#if 0
854			/* These messages are temporary debugging aids */
855			if (vnlru_nowhere < 5)
856				printf("vnlru process getting nowhere..\n");
857			else if (vnlru_nowhere == 5)
858				printf("vnlru process messages stopped.\n");
859#endif
860			vnlru_nowhere++;
861			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
862		} else
863			kern_yield(PRI_USER);
864	}
865}
866
867static struct kproc_desc vnlru_kp = {
868	"vnlru",
869	vnlru_proc,
870	&vnlruproc
871};
872SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
873    &vnlru_kp);
874
875/*
876 * Routines having to do with the management of the vnode table.
877 */
878
879/*
880 * Try to recycle a freed vnode.  We abort if anyone picks up a reference
881 * before we actually vgone().  This function must be called with the vnode
882 * held to prevent the vnode from being returned to the free list midway
883 * through vgone().
884 */
885static int
886vtryrecycle(struct vnode *vp)
887{
888	struct mount *vnmp;
889
890	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
891	VNASSERT(vp->v_holdcnt, vp,
892	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
893	/*
894	 * This vnode may be found and locked via some other list; if so, we
895	 * can't recycle it yet.
896	 */
897	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
898		CTR2(KTR_VFS,
899		    "%s: impossible to recycle, vp %p lock is already held",
900		    __func__, vp);
901		return (EWOULDBLOCK);
902	}
903	/*
904	 * Don't recycle if its filesystem is being suspended.
905	 */
906	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
907		VOP_UNLOCK(vp, 0);
908		CTR2(KTR_VFS,
909		    "%s: impossible to recycle, cannot start the write for %p",
910		    __func__, vp);
911		return (EBUSY);
912	}
913	/*
914	 * If we got this far, we need to acquire the interlock and see if
915	 * anyone picked up this vnode from another list.  If not, we will
916	 * mark it with DOOMED via vgonel() so that anyone who does find it
917	 * will skip over it.
918	 */
919	VI_LOCK(vp);
920	if (vp->v_usecount) {
921		VOP_UNLOCK(vp, LK_INTERLOCK);
922		vn_finished_write(vnmp);
923		CTR2(KTR_VFS,
924		    "%s: impossible to recycle, %p is already referenced",
925		    __func__, vp);
926		return (EBUSY);
927	}
928	if ((vp->v_iflag & VI_DOOMED) == 0)
929		vgonel(vp);
930	VOP_UNLOCK(vp, LK_INTERLOCK);
931	vn_finished_write(vnmp);
932	return (0);
933}
934
935/*
936 * Wait for available vnodes.
937 */
938static int
939getnewvnode_wait(int suspended)
940{
941
942	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
943	if (numvnodes > desiredvnodes) {
944		if (suspended) {
945			/*
946			 * The file system is being suspended; we cannot risk a
947			 * deadlock here, so allocate a new vnode anyway.
948			 */
949			if (freevnodes > wantfreevnodes)
950				vnlru_free(freevnodes - wantfreevnodes);
951			return (0);
952		}
953		if (vnlruproc_sig == 0) {
954			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
955			wakeup(vnlruproc);
956		}
957		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
958		    "vlruwk", hz);
959	}
960	return (numvnodes > desiredvnodes ? ENFILE : 0);
961}
962
963void
964getnewvnode_reserve(u_int count)
965{
966	struct thread *td;
967
968	td = curthread;
969	mtx_lock(&vnode_free_list_mtx);
970	while (count > 0) {
971		if (getnewvnode_wait(0) == 0) {
972			count--;
973			td->td_vp_reserv++;
974			numvnodes++;
975		}
976	}
977	mtx_unlock(&vnode_free_list_mtx);
978}
979
980void
981getnewvnode_drop_reserve(void)
982{
983	struct thread *td;
984
985	td = curthread;
986	mtx_lock(&vnode_free_list_mtx);
987	KASSERT(numvnodes >= td->td_vp_reserv, ("reserve too large"));
988	numvnodes -= td->td_vp_reserv;
989	mtx_unlock(&vnode_free_list_mtx);
990	td->td_vp_reserv = 0;
991}
992
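/*
 * Illustrative sketch (not from the original file, compiled out): a
 * filesystem that must not sleep waiting for vnodes inside a critical
 * section can pre-reserve them and drop any unused reservation afterwards.
 * The count of 4 is an arbitrary assumption.
 */
#if 0
	getnewvnode_reserve(4);
	/* ... up to 4 getnewvnode() calls will not wait in getnewvnode_wait() ... */
	getnewvnode_drop_reserve();
#endif
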
993/*
994 * Allocate and return a new vnode.
995 */
996int
997getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
998    struct vnode **vpp)
999{
1000	struct vnode *vp;
1001	struct bufobj *bo;
1002	struct thread *td;
1003	int error;
1004
1005	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
1006	vp = NULL;
1007	td = curthread;
1008	if (td->td_vp_reserv > 0) {
1009		td->td_vp_reserv -= 1;
1010		goto alloc;
1011	}
1012	mtx_lock(&vnode_free_list_mtx);
1013	/*
1014	 * Lend our context to reclaim vnodes if they've exceeded the max.
1015	 */
1016	if (freevnodes > wantfreevnodes)
1017		vnlru_free(1);
1018	error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
1019	    MNTK_SUSPEND));
1020#if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
1021	if (error != 0) {
1022		mtx_unlock(&vnode_free_list_mtx);
1023		return (error);
1024	}
1025#endif
1026	numvnodes++;
1027	mtx_unlock(&vnode_free_list_mtx);
1028alloc:
1029	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
1030	/*
1031	 * Setup locks.
1032	 */
1033	vp->v_vnlock = &vp->v_lock;
1034	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
1035	/*
1036	 * By default, don't allow shared locks unless filesystems
1037	 * opt-in.
1038	 */
1039	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE);
1040	/*
1041	 * Initialize bufobj.
1042	 */
1043	bo = &vp->v_bufobj;
1044	bo->__bo_vnode = vp;
1045	mtx_init(BO_MTX(bo), "bufobj interlock", NULL, MTX_DEF);
1046	bo->bo_ops = &buf_ops_bio;
1047	bo->bo_private = vp;
1048	TAILQ_INIT(&bo->bo_clean.bv_hd);
1049	TAILQ_INIT(&bo->bo_dirty.bv_hd);
1050	/*
1051	 * Initialize namecache.
1052	 */
1053	LIST_INIT(&vp->v_cache_src);
1054	TAILQ_INIT(&vp->v_cache_dst);
1055	/*
1056	 * Finalize various vnode identity bits.
1057	 */
1058	vp->v_type = VNON;
1059	vp->v_tag = tag;
1060	vp->v_op = vops;
1061	v_incr_usecount(vp);
1062	vp->v_data = NULL;
1063#ifdef MAC
1064	mac_vnode_init(vp);
1065	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
1066		mac_vnode_associate_singlelabel(mp, vp);
1067	else if (mp == NULL && vops != &dead_vnodeops)
1068		printf("NULL mp in getnewvnode()\n");
1069#endif
1070	if (mp != NULL) {
1071		bo->bo_bsize = mp->mnt_stat.f_iosize;
1072		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
1073			vp->v_vflag |= VV_NOKNOTE;
1074	}
1075	rangelock_init(&vp->v_rl);
1076
1077	/*
1078	 * For the filesystems which do not use vfs_hash_insert(),
1079	 * still initialize v_hash so that vfs_hash_index() remains useful.
1080	 * E.g., nullfs uses vfs_hash_index() on the lower vnode for
1081	 * its own hashing.
1082	 */
1083	vp->v_hash = (uintptr_t)vp >> vnsz2log;
1084
1085	*vpp = vp;
1086	return (0);
1087}
1088
1089/*
1090 * Delete from old mount point vnode list, if on one.
1091 */
1092static void
1093delmntque(struct vnode *vp)
1094{
1095	struct mount *mp;
1096	int active;
1097
1098	mp = vp->v_mount;
1099	if (mp == NULL)
1100		return;
1101	MNT_ILOCK(mp);
1102	VI_LOCK(vp);
1103	KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
1104	    ("Active vnode list size %d > Vnode list size %d",
1105	     mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
1106	active = vp->v_iflag & VI_ACTIVE;
1107	vp->v_iflag &= ~VI_ACTIVE;
1108	if (active) {
1109		mtx_lock(&vnode_free_list_mtx);
1110		TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
1111		mp->mnt_activevnodelistsize--;
1112		mtx_unlock(&vnode_free_list_mtx);
1113	}
1114	vp->v_mount = NULL;
1115	VI_UNLOCK(vp);
1116	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
1117		("bad mount point vnode list size"));
1118	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1119	mp->mnt_nvnodelistsize--;
1120	MNT_REL(mp);
1121	MNT_IUNLOCK(mp);
1122}
1123
1124static void
1125insmntque_stddtr(struct vnode *vp, void *dtr_arg)
1126{
1127
1128	vp->v_data = NULL;
1129	vp->v_op = &dead_vnodeops;
1130	vgone(vp);
1131	vput(vp);
1132}
1133
1134/*
1135 * Insert into list of vnodes for the new mount point, if available.
1136 */
1137int
1138insmntque1(struct vnode *vp, struct mount *mp,
1139	void (*dtr)(struct vnode *, void *), void *dtr_arg)
1140{
1141
1142	KASSERT(vp->v_mount == NULL,
1143		("insmntque: vnode already on per mount vnode list"));
1144	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
1145	ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
1146
1147	/*
1148	 * We acquire the vnode interlock early to ensure that the
1149	 * vnode cannot be recycled by another process releasing a
1150	 * holdcnt on it before we get it on both the vnode list
1151	 * and the active vnode list. The mount mutex protects only
1152	 * manipulation of the vnode list and the vnode freelist
1153	 * mutex protects only manipulation of the active vnode list.
1154	 * Hence the need to hold the vnode interlock throughout.
1155	 */
1156	MNT_ILOCK(mp);
1157	VI_LOCK(vp);
1158	if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
1159	    ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
1160	    mp->mnt_nvnodelistsize == 0)) &&
1161	    (vp->v_vflag & VV_FORCEINSMQ) == 0) {
1162		VI_UNLOCK(vp);
1163		MNT_IUNLOCK(mp);
1164		if (dtr != NULL)
1165			dtr(vp, dtr_arg);
1166		return (EBUSY);
1167	}
1168	vp->v_mount = mp;
1169	MNT_REF(mp);
1170	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1171	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
1172		("neg mount point vnode list size"));
1173	mp->mnt_nvnodelistsize++;
1174	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
1175	    ("Activating already active vnode"));
1176	vp->v_iflag |= VI_ACTIVE;
1177	mtx_lock(&vnode_free_list_mtx);
1178	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
1179	mp->mnt_activevnodelistsize++;
1180	mtx_unlock(&vnode_free_list_mtx);
1181	VI_UNLOCK(vp);
1182	MNT_IUNLOCK(mp);
1183	return (0);
1184}
1185
1186int
1187insmntque(struct vnode *vp, struct mount *mp)
1188{
1189
1190	return (insmntque1(vp, mp, insmntque_stddtr, NULL));
1191}
1192
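/*
 * Illustrative sketch (not from the original file, compiled out): the common
 * VFS_VGET-style sequence of allocating a vnode, locking it and placing it on
 * the mount's vnode list.  "examplefs_vnodeops", "ip" and "mp" are assumed
 * names, not part of this file.
 */
#if 0
	struct vnode *vp;
	int error;

	error = getnewvnode("examplefs", mp, &examplefs_vnodeops, &vp);
	if (error != 0)
		return (error);
	vp->v_data = ip;			/* hypothetical per-file data */
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);	/* insmntque() wants vp locked */
	error = insmntque(vp, mp);
	if (error != 0)
		return (error);		/* insmntque() vgone'd and vput vp */
#endif
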
1193/*
1194 * Flush out and invalidate all buffers associated with a bufobj
1195 * Called with the underlying object locked.
1196 */
1197int
1198bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
1199{
1200	int error;
1201
1202	BO_LOCK(bo);
1203	if (flags & V_SAVE) {
1204		error = bufobj_wwait(bo, slpflag, slptimeo);
1205		if (error) {
1206			BO_UNLOCK(bo);
1207			return (error);
1208		}
1209		if (bo->bo_dirty.bv_cnt > 0) {
1210			BO_UNLOCK(bo);
1211			if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
1212				return (error);
1213			/*
1214			 * XXX We could save a lock/unlock if this was only
1215			 * enabled under INVARIANTS
1216			 */
1217			BO_LOCK(bo);
1218			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
1219				panic("vinvalbuf: dirty bufs");
1220		}
1221	}
1222	/*
1223	 * If you alter this loop please notice that interlock is dropped and
1224	 * reacquired in flushbuflist.  Special care is needed to ensure that
1225	 * no race conditions occur from this.
1226	 */
1227	do {
1228		error = flushbuflist(&bo->bo_clean,
1229		    flags, bo, slpflag, slptimeo);
1230		if (error == 0 && !(flags & V_CLEANONLY))
1231			error = flushbuflist(&bo->bo_dirty,
1232			    flags, bo, slpflag, slptimeo);
1233		if (error != 0 && error != EAGAIN) {
1234			BO_UNLOCK(bo);
1235			return (error);
1236		}
1237	} while (error != 0);
1238
1239	/*
1240	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
1241	 * have write I/O in-progress but if there is a VM object then the
1242	 * VM object can also have read-I/O in-progress.
1243	 */
1244	do {
1245		bufobj_wwait(bo, 0, 0);
1246		BO_UNLOCK(bo);
1247		if (bo->bo_object != NULL) {
1248			VM_OBJECT_WLOCK(bo->bo_object);
1249			vm_object_pip_wait(bo->bo_object, "bovlbx");
1250			VM_OBJECT_WUNLOCK(bo->bo_object);
1251		}
1252		BO_LOCK(bo);
1253	} while (bo->bo_numoutput > 0);
1254	BO_UNLOCK(bo);
1255
1256	/*
1257	 * Destroy the copy in the VM cache, too.
1258	 */
1259	if (bo->bo_object != NULL &&
1260	    (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) {
1261		VM_OBJECT_WLOCK(bo->bo_object);
1262		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
1263		    OBJPR_CLEANONLY : 0);
1264		VM_OBJECT_WUNLOCK(bo->bo_object);
1265	}
1266
1267#ifdef INVARIANTS
1268	BO_LOCK(bo);
1269	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 &&
1270	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
1271		panic("vinvalbuf: flush failed");
1272	BO_UNLOCK(bo);
1273#endif
1274	return (0);
1275}
1276
1277/*
1278 * Flush out and invalidate all buffers associated with a vnode.
1279 * Called with the underlying object locked.
1280 */
1281int
1282vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
1283{
1284
1285	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
1286	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
1287	return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
1288}
1289
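/*
 * Illustrative sketch (not from the original file, compiled out): the common
 * way reclaim-style code throws away a vnode's buffers, writing dirty ones
 * out first via V_SAVE.  "vp" is assumed to be in scope.
 */
#if 0
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = vinvalbuf(vp, V_SAVE, 0, 0);	/* flush, then invalidate */
	VOP_UNLOCK(vp, 0);
#endif
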
1290/*
1291 * Flush out buffers on the specified list.
1292 *
1293 */
1294static int
1295flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
1296    int slptimeo)
1297{
1298	struct buf *bp, *nbp;
1299	int retval, error;
1300	daddr_t lblkno;
1301	b_xflags_t xflags;
1302
1303	ASSERT_BO_LOCKED(bo);
1304
1305	retval = 0;
1306	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
1307		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1308		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1309			continue;
1310		}
1311		lblkno = 0;
1312		xflags = 0;
1313		if (nbp != NULL) {
1314			lblkno = nbp->b_lblkno;
1315			xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
1316		}
1317		retval = EAGAIN;
1318		error = BUF_TIMELOCK(bp,
1319		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo),
1320		    "flushbuf", slpflag, slptimeo);
1321		if (error) {
1322			BO_LOCK(bo);
1323			return (error != ENOLCK ? error : EAGAIN);
1324		}
1325		KASSERT(bp->b_bufobj == bo,
1326		    ("bp %p wrong b_bufobj %p should be %p",
1327		    bp, bp->b_bufobj, bo));
1328		if (bp->b_bufobj != bo) {	/* XXX: necessary ? */
1329			BUF_UNLOCK(bp);
1330			BO_LOCK(bo);
1331			return (EAGAIN);
1332		}
1333		/*
1334		 * XXX Since there are no node locks for NFS, I
1335		 * believe there is a slight chance that a delayed
1336		 * write will occur while sleeping just above, so
1337		 * check for it.
1338		 */
1339		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1340		    (flags & V_SAVE)) {
1341			BO_LOCK(bo);
1342			bremfree(bp);
1343			BO_UNLOCK(bo);
1344			bp->b_flags |= B_ASYNC;
1345			bwrite(bp);
1346			BO_LOCK(bo);
1347			return (EAGAIN);	/* XXX: why not loop ? */
1348		}
1349		BO_LOCK(bo);
1350		bremfree(bp);
1351		BO_UNLOCK(bo);
1352		bp->b_flags |= (B_INVAL | B_RELBUF);
1353		bp->b_flags &= ~B_ASYNC;
1354		brelse(bp);
1355		BO_LOCK(bo);
1356		if (nbp != NULL &&
1357		    (nbp->b_bufobj != bo ||
1358		     nbp->b_lblkno != lblkno ||
1359		     (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags))
1360			break;			/* nbp invalid */
1361	}
1362	return (retval);
1363}
1364
1365/*
1366 * Truncate a file's buffer and pages to a specified length.  This
1367 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1368 * sync activity.
1369 */
1370int
1371vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize)
1372{
1373	struct buf *bp, *nbp;
1374	int anyfreed;
1375	int trunclbn;
1376	struct bufobj *bo;
1377
1378	CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__,
1379	    vp, cred, blksize, (uintmax_t)length);
1380
1381	/*
1382	 * Round up to the *next* lbn.
1383	 */
1384	trunclbn = (length + blksize - 1) / blksize;
1385
1386	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1387restart:
1388	bo = &vp->v_bufobj;
1389	BO_LOCK(bo);
1390	anyfreed = 1;
1391	for (;anyfreed;) {
1392		anyfreed = 0;
1393		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
1394			if (bp->b_lblkno < trunclbn)
1395				continue;
1396			if (BUF_LOCK(bp,
1397			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1398			    BO_MTX(bo)) == ENOLCK)
1399				goto restart;
1400
1401			BO_LOCK(bo);
1402			bremfree(bp);
1403			BO_UNLOCK(bo);
1404			bp->b_flags |= (B_INVAL | B_RELBUF);
1405			bp->b_flags &= ~B_ASYNC;
1406			brelse(bp);
1407			anyfreed = 1;
1408
1409			BO_LOCK(bo);
1410			if (nbp != NULL &&
1411			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1412			    (nbp->b_vp != vp) ||
1413			    (nbp->b_flags & B_DELWRI))) {
1414				BO_UNLOCK(bo);
1415				goto restart;
1416			}
1417		}
1418
1419		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1420			if (bp->b_lblkno < trunclbn)
1421				continue;
1422			if (BUF_LOCK(bp,
1423			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1424			    BO_MTX(bo)) == ENOLCK)
1425				goto restart;
1426			BO_LOCK(bo);
1427			bremfree(bp);
1428			BO_UNLOCK(bo);
1429			bp->b_flags |= (B_INVAL | B_RELBUF);
1430			bp->b_flags &= ~B_ASYNC;
1431			brelse(bp);
1432			anyfreed = 1;
1433
1434			BO_LOCK(bo);
1435			if (nbp != NULL &&
1436			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1437			    (nbp->b_vp != vp) ||
1438			    (nbp->b_flags & B_DELWRI) == 0)) {
1439				BO_UNLOCK(bo);
1440				goto restart;
1441			}
1442		}
1443	}
1444
1445	if (length > 0) {
1446restartsync:
1447		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1448			if (bp->b_lblkno > 0)
1449				continue;
1450			/*
1451			 * Since we hold the vnode lock this should only
1452			 * fail if we're racing with the buf daemon.
1453			 */
1454			if (BUF_LOCK(bp,
1455			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1456			    BO_MTX(bo)) == ENOLCK) {
1457				goto restart;
1458			}
1459			VNASSERT((bp->b_flags & B_DELWRI), vp,
1460			    ("buf(%p) on dirty queue without DELWRI", bp));
1461
1462			BO_LOCK(bo);
1463			bremfree(bp);
1464			BO_UNLOCK(bo);
1465			bawrite(bp);
1466			BO_LOCK(bo);
1467			goto restartsync;
1468		}
1469	}
1470
1471	bufobj_wwait(bo, 0, 0);
1472	BO_UNLOCK(bo);
1473	vnode_pager_setsize(vp, length);
1474
1475	return (0);
1476}
1477
1478/*
1479 * buf_splay() - splay tree core for the clean/dirty list of buffers in
1480 *		 a vnode.
1481 *
1482 *	NOTE: We have to deal with the special case of a background bitmap
1483 *	buffer, a situation where two buffers will have the same logical
1484 *	block offset.  We want (1) only the foreground buffer to be accessed
1485 *	in a lookup and (2) must differentiate between the foreground and
1486 *	background buffer in the splay tree algorithm because the splay
1487 *	tree cannot normally handle multiple entities with the same 'index'.
1488 *	We accomplish this by adding differentiating flags to the splay tree's
1489 *	numerical domain.
1490 */
1491static
1492struct buf *
1493buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
1494{
1495	struct buf dummy;
1496	struct buf *lefttreemax, *righttreemin, *y;
1497
1498	if (root == NULL)
1499		return (NULL);
1500	lefttreemax = righttreemin = &dummy;
1501	for (;;) {
1502		if (lblkno < root->b_lblkno) {
1503			if ((y = root->b_left) == NULL)
1504				break;
1505			if (lblkno < y->b_lblkno) {
1506				/* Rotate right. */
1507				root->b_left = y->b_right;
1508				y->b_right = root;
1509				root = y;
1510				if ((y = root->b_left) == NULL)
1511					break;
1512			}
1513			/* Link into the new root's right tree. */
1514			righttreemin->b_left = root;
1515			righttreemin = root;
1516		} else if (lblkno > root->b_lblkno) {
1517			if ((y = root->b_right) == NULL)
1518				break;
1519			if (lblkno > y->b_lblkno) {
1520				/* Rotate left. */
1521				root->b_right = y->b_left;
1522				y->b_left = root;
1523				root = y;
1524				if ((y = root->b_right) == NULL)
1525					break;
1526			}
1527			/* Link into the new root's left tree. */
1528			lefttreemax->b_right = root;
1529			lefttreemax = root;
1530		} else {
1531			break;
1532		}
1533		root = y;
1534	}
1535	/* Assemble the new root. */
1536	lefttreemax->b_right = root->b_left;
1537	righttreemin->b_left = root->b_right;
1538	root->b_left = dummy.b_right;
1539	root->b_right = dummy.b_left;
1540	return (root);
1541}
1542
1543static void
1544buf_vlist_remove(struct buf *bp)
1545{
1546	struct buf *root;
1547	struct bufv *bv;
1548
1549	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1550	ASSERT_BO_LOCKED(bp->b_bufobj);
1551	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
1552	    (BX_VNDIRTY|BX_VNCLEAN),
1553	    ("buf_vlist_remove: Buf %p is on two lists", bp));
1554	if (bp->b_xflags & BX_VNDIRTY)
1555		bv = &bp->b_bufobj->bo_dirty;
1556	else
1557		bv = &bp->b_bufobj->bo_clean;
1558	if (bp != bv->bv_root) {
1559		root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
1560		KASSERT(root == bp, ("splay lookup failed in remove"));
1561	}
1562	if (bp->b_left == NULL) {
1563		root = bp->b_right;
1564	} else {
1565		root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
1566		root->b_right = bp->b_right;
1567	}
1568	bv->bv_root = root;
1569	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
1570	bv->bv_cnt--;
1571	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1572}
1573
1574/*
1575 * Add the buffer to the sorted clean or dirty block list using a
1576 * splay tree algorithm.
1577 *
1578 * NOTE: xflags is passed as a constant, optimizing this inline function!
1579 */
1580static void
1581buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
1582{
1583	struct buf *root;
1584	struct bufv *bv;
1585
1586	ASSERT_BO_LOCKED(bo);
1587	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1588	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
1589	bp->b_xflags |= xflags;
1590	if (xflags & BX_VNDIRTY)
1591		bv = &bo->bo_dirty;
1592	else
1593		bv = &bo->bo_clean;
1594
1595	root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
1596	if (root == NULL) {
1597		bp->b_left = NULL;
1598		bp->b_right = NULL;
1599		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
1600	} else if (bp->b_lblkno < root->b_lblkno) {
1601		bp->b_left = root->b_left;
1602		bp->b_right = root;
1603		root->b_left = NULL;
1604		TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
1605	} else {
1606		bp->b_right = root->b_right;
1607		bp->b_left = root;
1608		root->b_right = NULL;
1609		TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs);
1610	}
1611	bv->bv_cnt++;
1612	bv->bv_root = bp;
1613}
1614
1615/*
1616 * Lookup a buffer using the splay tree.  Note that we specifically avoid
1617 * shadow buffers used in background bitmap writes.
1618 *
1619 * This code isn't quite as efficient as it could be because we are maintaining
1620 * two sorted lists and do not know which list the block resides in.
1621 *
1622 * During a "make buildworld" the desired buffer is found at one of
1623 * the roots more than 60% of the time.  Thus, checking both roots
1624 * before performing either splay eliminates unnecessary splays on the
1625 * first tree splayed.
1626 */
1627struct buf *
1628gbincore(struct bufobj *bo, daddr_t lblkno)
1629{
1630	struct buf *bp;
1631
1632	ASSERT_BO_LOCKED(bo);
1633	if ((bp = bo->bo_clean.bv_root) != NULL && bp->b_lblkno == lblkno)
1634		return (bp);
1635	if ((bp = bo->bo_dirty.bv_root) != NULL && bp->b_lblkno == lblkno)
1636		return (bp);
1637	if ((bp = bo->bo_clean.bv_root) != NULL) {
1638		bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp);
1639		if (bp->b_lblkno == lblkno)
1640			return (bp);
1641	}
1642	if ((bp = bo->bo_dirty.bv_root) != NULL) {
1643		bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp);
1644		if (bp->b_lblkno == lblkno)
1645			return (bp);
1646	}
1647	return (NULL);
1648}
1649
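/*
 * Illustrative sketch (not from the original file, compiled out): a lookup of
 * a cached buffer must hold the bufobj lock across gbincore(), as getblk()
 * does.  "bo" and "lblkno" are assumed to be in scope.
 */
#if 0
	struct buf *bp;

	BO_LOCK(bo);
	bp = gbincore(bo, lblkno);	/* NULL if the block is not cached */
	BO_UNLOCK(bo);
#endif
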
1650/*
1651 * Associate a buffer with a vnode.
1652 */
1653void
1654bgetvp(struct vnode *vp, struct buf *bp)
1655{
1656	struct bufobj *bo;
1657
1658	bo = &vp->v_bufobj;
1659	ASSERT_BO_LOCKED(bo);
1660	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
1661
1662	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
1663	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
1664	    ("bgetvp: bp already attached! %p", bp));
1665
1666	vhold(vp);
1667	bp->b_vp = vp;
1668	bp->b_bufobj = bo;
1669	/*
1670	 * Insert onto list for new vnode.
1671	 */
1672	buf_vlist_add(bp, bo, BX_VNCLEAN);
1673}
1674
1675/*
1676 * Disassociate a buffer from a vnode.
1677 */
1678void
1679brelvp(struct buf *bp)
1680{
1681	struct bufobj *bo;
1682	struct vnode *vp;
1683
1684	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1685	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1686
1687	/*
1688	 * Delete from old vnode list, if on one.
1689	 */
1690	vp = bp->b_vp;		/* XXX */
1691	bo = bp->b_bufobj;
1692	BO_LOCK(bo);
1693	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1694		buf_vlist_remove(bp);
1695	else
1696		panic("brelvp: Buffer %p not on queue.", bp);
1697	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1698		bo->bo_flag &= ~BO_ONWORKLST;
1699		mtx_lock(&sync_mtx);
1700		LIST_REMOVE(bo, bo_synclist);
1701		syncer_worklist_len--;
1702		mtx_unlock(&sync_mtx);
1703	}
1704	bp->b_vp = NULL;
1705	bp->b_bufobj = NULL;
1706	BO_UNLOCK(bo);
1707	vdrop(vp);
1708}
1709
1710/*
1711 * Add an item to the syncer work queue.
1712 */
1713static void
1714vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
1715{
1716	int slot;
1717
1718	ASSERT_BO_LOCKED(bo);
1719
1720	mtx_lock(&sync_mtx);
1721	if (bo->bo_flag & BO_ONWORKLST)
1722		LIST_REMOVE(bo, bo_synclist);
1723	else {
1724		bo->bo_flag |= BO_ONWORKLST;
1725		syncer_worklist_len++;
1726	}
1727
1728	if (delay > syncer_maxdelay - 2)
1729		delay = syncer_maxdelay - 2;
1730	slot = (syncer_delayno + delay) & syncer_mask;
1731
1732	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
1733	mtx_unlock(&sync_mtx);
1734}
1735
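/*
 * Illustrative sketch (not from the original file, compiled out): the slot
 * computation above with assumed values.  With 32 queues (syncer_mask == 31),
 * a bufobj queued while syncer_delayno == 28 with delay == 15 lands in slot
 * (28 + 15) & 31 == 11, i.e. the index simply wraps around the ring.
 */
#if 0
	int slot;

	slot = (28 + 15) & 31;		/* == 11 */
#endif
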
1736static int
1737sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
1738{
1739	int error, len;
1740
1741	mtx_lock(&sync_mtx);
1742	len = syncer_worklist_len - sync_vnode_count;
1743	mtx_unlock(&sync_mtx);
1744	error = SYSCTL_OUT(req, &len, sizeof(len));
1745	return (error);
1746}
1747
1748SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
1749    sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
1750
1751static struct proc *updateproc;
1752static void sched_sync(void);
1753static struct kproc_desc up_kp = {
1754	"syncer",
1755	sched_sync,
1756	&updateproc
1757};
1758SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
1759
1760static int
1761sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
1762{
1763	struct vnode *vp;
1764	struct mount *mp;
1765
1766	*bo = LIST_FIRST(slp);
1767	if (*bo == NULL)
1768		return (0);
1769	vp = (*bo)->__bo_vnode;	/* XXX */
1770	if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
1771		return (1);
1772	/*
1773	 * We use vhold in case the vnode does not
1774	 * successfully sync.  vhold prevents the vnode from
1775	 * going away when we unlock the sync_mtx so that
1776	 * we can acquire the vnode interlock.
1777	 */
1778	vholdl(vp);
1779	mtx_unlock(&sync_mtx);
1780	VI_UNLOCK(vp);
1781	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1782		vdrop(vp);
1783		mtx_lock(&sync_mtx);
1784		return (*bo == LIST_FIRST(slp));
1785	}
1786	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1787	(void) VOP_FSYNC(vp, MNT_LAZY, td);
1788	VOP_UNLOCK(vp, 0);
1789	vn_finished_write(mp);
1790	BO_LOCK(*bo);
1791	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
1792		/*
1793		 * Put us back on the worklist.  The worklist
1794		 * routine will remove us from our current
1795		 * position and then add us back in at a later
1796		 * position.
1797		 */
1798		vn_syncer_add_to_worklist(*bo, syncdelay);
1799	}
1800	BO_UNLOCK(*bo);
1801	vdrop(vp);
1802	mtx_lock(&sync_mtx);
1803	return (0);
1804}
1805
1806/*
1807 * System filesystem synchronizer daemon.
1808 */
1809static void
1810sched_sync(void)
1811{
1812	struct synclist *next, *slp;
1813	struct bufobj *bo;
1814	long starttime;
1815	struct thread *td = curthread;
1816	int last_work_seen;
1817	int net_worklist_len;
1818	int syncer_final_iter;
1819	int first_printf;
1820	int error;
1821
1822	last_work_seen = 0;
1823	syncer_final_iter = 0;
1824	first_printf = 1;
1825	syncer_state = SYNCER_RUNNING;
1826	starttime = time_uptime;
1827	td->td_pflags |= TDP_NORUNNINGBUF;
1828
1829	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
1830	    SHUTDOWN_PRI_LAST);
1831
1832	mtx_lock(&sync_mtx);
1833	for (;;) {
1834		if (syncer_state == SYNCER_FINAL_DELAY &&
1835		    syncer_final_iter == 0) {
1836			mtx_unlock(&sync_mtx);
1837			kproc_suspend_check(td->td_proc);
1838			mtx_lock(&sync_mtx);
1839		}
1840		net_worklist_len = syncer_worklist_len - sync_vnode_count;
1841		if (syncer_state != SYNCER_RUNNING &&
1842		    starttime != time_uptime) {
1843			if (first_printf) {
1844				printf("\nSyncing disks, vnodes remaining...");
1845				first_printf = 0;
1846			}
1847			printf("%d ", net_worklist_len);
1848		}
1849		starttime = time_uptime;
1850
1851		/*
1852		 * Push files whose dirty time has expired.  Be careful
1853		 * of interrupt race on slp queue.
1854		 *
1855		 * Skip over empty worklist slots when shutting down.
1856		 */
1857		do {
1858			slp = &syncer_workitem_pending[syncer_delayno];
1859			syncer_delayno += 1;
1860			if (syncer_delayno == syncer_maxdelay)
1861				syncer_delayno = 0;
1862			next = &syncer_workitem_pending[syncer_delayno];
1863			/*
1864			 * If the worklist has wrapped since it was
1865			 * emptied of all but syncer vnodes,
1866			 * switch to the FINAL_DELAY state and run
1867			 * for one more second.
1868			 */
1869			if (syncer_state == SYNCER_SHUTTING_DOWN &&
1870			    net_worklist_len == 0 &&
1871			    last_work_seen == syncer_delayno) {
1872				syncer_state = SYNCER_FINAL_DELAY;
1873				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
1874			}
1875		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
1876		    syncer_worklist_len > 0);
1877
1878		/*
1879		 * Keep track of the last time there was anything
1880		 * on the worklist other than syncer vnodes.
1881		 * Return to the SHUTTING_DOWN state if any
1882		 * new work appears.
1883		 */
1884		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
1885			last_work_seen = syncer_delayno;
1886		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
1887			syncer_state = SYNCER_SHUTTING_DOWN;
1888		while (!LIST_EMPTY(slp)) {
1889			error = sync_vnode(slp, &bo, td);
1890			if (error == 1) {
1891				LIST_REMOVE(bo, bo_synclist);
1892				LIST_INSERT_HEAD(next, bo, bo_synclist);
1893				continue;
1894			}
1895
1896			if (first_printf == 0)
1897				wdog_kern_pat(WD_LASTVAL);
1898
1899		}
1900		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
1901			syncer_final_iter--;
1902		/*
1903		 * The variable rushjob allows the kernel to speed up the
1904		 * processing of the filesystem syncer process. A rushjob
1905		 * value of N tells the filesystem syncer to process the next
1906		 * N seconds worth of work on its queue ASAP. Currently rushjob
1907		 * is used by the soft update code to speed up the filesystem
1908		 * syncer process when the incore state is getting so far
1909		 * ahead of the disk that the kernel memory pool is being
1910		 * threatened with exhaustion.
1911		 */
1912		if (rushjob > 0) {
1913			rushjob -= 1;
1914			continue;
1915		}
1916		/*
1917		 * Just sleep for a short period of time between
1918		 * iterations when shutting down to allow some I/O
1919		 * to happen.
1920		 *
1921		 * If it has taken us less than a second to process the
1922		 * current work, then wait. Otherwise start right over
1923		 * again. We can still lose time if any single round
1924		 * takes more than two seconds, but it does not really
1925		 * matter as we are just trying to generally pace the
1926		 * filesystem activity.
1927		 */
1928		if (syncer_state != SYNCER_RUNNING ||
1929		    time_uptime == starttime) {
1930			thread_lock(td);
1931			sched_prio(td, PPAUSE);
1932			thread_unlock(td);
1933		}
1934		if (syncer_state != SYNCER_RUNNING)
1935			cv_timedwait(&sync_wakeup, &sync_mtx,
1936			    hz / SYNCER_SHUTDOWN_SPEEDUP);
1937		else if (time_uptime == starttime)
1938			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
1939	}
1940}
1941
1942/*
1943 * Request the syncer daemon to speed up its work.
1944 * We never push it to speed up more than half of its
1945 * normal turn time; otherwise it could take over the CPU.
1946 */
1947int
1948speedup_syncer(void)
1949{
1950	int ret = 0;
1951
1952	mtx_lock(&sync_mtx);
1953	if (rushjob < syncdelay / 2) {
1954		rushjob += 1;
1955		stat_rush_requests += 1;
1956		ret = 1;
1957	}
1958	mtx_unlock(&sync_mtx);
1959	cv_broadcast(&sync_wakeup);
1960	return (ret);
1961}
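
/*
 * Illustrative sketch (hypothetical caller): a subsystem falling behind on
 * dirty data might nudge the syncer and track whether the request was
 * accepted, along the lines of:
 *
 *	if (speedup_syncer())
 *		accepted_rushes++;
 *
 * accepted_rushes is a made-up counter; only the speedup_syncer() call and
 * its 0/1 return convention come from the code above.
 */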
1962
1963/*
1964 * Tell the syncer to speed up its work and run through its work
1965 * list several times, then tell it to shut down.
1966 */
1967static void
1968syncer_shutdown(void *arg, int howto)
1969{
1970
1971	if (howto & RB_NOSYNC)
1972		return;
1973	mtx_lock(&sync_mtx);
1974	syncer_state = SYNCER_SHUTTING_DOWN;
1975	rushjob = 0;
1976	mtx_unlock(&sync_mtx);
1977	cv_broadcast(&sync_wakeup);
1978	kproc_shutdown(arg, howto);
1979}
1980
1981/*
1982 * Reassign a buffer from one vnode to another.
1983 * Used to assign file specific control information
1984 * (indirect blocks) to the vnode to which they belong.
1985 */
1986void
1987reassignbuf(struct buf *bp)
1988{
1989	struct vnode *vp;
1990	struct bufobj *bo;
1991	int delay;
1992#ifdef INVARIANTS
1993	struct bufv *bv;
1994#endif
1995
1996	vp = bp->b_vp;
1997	bo = bp->b_bufobj;
1998	++reassignbufcalls;
1999
2000	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
2001	    bp, bp->b_vp, bp->b_flags);
2002	/*
2003	 * B_PAGING flagged buffers cannot be reassigned because their vp
2004	 * is not fully linked in.
2005	 */
2006	if (bp->b_flags & B_PAGING)
2007		panic("cannot reassign paging buffer");
2008
2009	/*
2010	 * Delete from old vnode list, if on one.
2011	 */
2012	BO_LOCK(bo);
2013	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
2014		buf_vlist_remove(bp);
2015	else
2016		panic("reassignbuf: Buffer %p not on queue.", bp);
2017	/*
2018	 * If dirty, put on list of dirty buffers; otherwise insert onto list
2019	 * of clean buffers.
2020	 */
2021	if (bp->b_flags & B_DELWRI) {
2022		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
2023			switch (vp->v_type) {
2024			case VDIR:
2025				delay = dirdelay;
2026				break;
2027			case VCHR:
2028				delay = metadelay;
2029				break;
2030			default:
2031				delay = filedelay;
2032			}
2033			vn_syncer_add_to_worklist(bo, delay);
2034		}
2035		buf_vlist_add(bp, bo, BX_VNDIRTY);
2036	} else {
2037		buf_vlist_add(bp, bo, BX_VNCLEAN);
2038
2039		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
2040			mtx_lock(&sync_mtx);
2041			LIST_REMOVE(bo, bo_synclist);
2042			syncer_worklist_len--;
2043			mtx_unlock(&sync_mtx);
2044			bo->bo_flag &= ~BO_ONWORKLST;
2045		}
2046	}
2047#ifdef INVARIANTS
2048	bv = &bo->bo_clean;
2049	bp = TAILQ_FIRST(&bv->bv_hd);
2050	KASSERT(bp == NULL || bp->b_bufobj == bo,
2051	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2052	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2053	KASSERT(bp == NULL || bp->b_bufobj == bo,
2054	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2055	bv = &bo->bo_dirty;
2056	bp = TAILQ_FIRST(&bv->bv_hd);
2057	KASSERT(bp == NULL || bp->b_bufobj == bo,
2058	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2059	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2060	KASSERT(bp == NULL || bp->b_bufobj == bo,
2061	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2062#endif
2063	BO_UNLOCK(bo);
2064}
2065
2066/*
2067 * Increment the use and hold counts on the vnode, taking care to reference
2068 * the driver's usecount if this is a chardev.  The vholdl() will remove
2069 * the vnode from the free list if it is presently free.  Requires the
2070 * vnode interlock and returns with it held.
2071 */
2072static void
2073v_incr_usecount(struct vnode *vp)
2074{
2075
2076	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2077	vp->v_usecount++;
2078	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2079		dev_lock();
2080		vp->v_rdev->si_usecount++;
2081		dev_unlock();
2082	}
2083	vholdl(vp);
2084}
2085
2086/*
2087 * Turn a holdcnt into a use+holdcnt such that only one call to
2088 * v_decr_usecount is needed.
2089 */
2090static void
2091v_upgrade_usecount(struct vnode *vp)
2092{
2093
2094	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2095	vp->v_usecount++;
2096	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2097		dev_lock();
2098		vp->v_rdev->si_usecount++;
2099		dev_unlock();
2100	}
2101}
2102
2103/*
2104 * Decrement the vnode use and hold count along with the driver's usecount
2105 * if this is a chardev.  The vdropl() below releases the vnode interlock
2106 * as it may free the vnode.
2107 */
2108static void
2109v_decr_usecount(struct vnode *vp)
2110{
2111
2112	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2113	VNASSERT(vp->v_usecount > 0, vp,
2114	    ("v_decr_usecount: negative usecount"));
2115	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2116	vp->v_usecount--;
2117	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2118		dev_lock();
2119		vp->v_rdev->si_usecount--;
2120		dev_unlock();
2121	}
2122	vdropl(vp);
2123}
2124
2125/*
2126 * Decrement only the use count and driver use count.  This is intended to
2127 * be paired with a follow-on vdropl() to release the remaining hold count.
2128 * In this way we may vgone() a vnode with a 0 usecount without risk of
2129 * having it end up on a free list because the hold count is kept above 0.
2130 */
2131static void
2132v_decr_useonly(struct vnode *vp)
2133{
2134
2135	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2136	VNASSERT(vp->v_usecount > 0, vp,
2137	    ("v_decr_useonly: negative usecount"));
2138	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2139	vp->v_usecount--;
2140	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2141		dev_lock();
2142		vp->v_rdev->si_usecount--;
2143		dev_unlock();
2144	}
2145}
2146
2147/*
2148 * Grab a particular vnode from the free list, increment its
2149 * reference count and lock it.  VI_DOOMED is set if the vnode
2150 * is being destroyed.  Only callers who specify LK_RETRY will
2151 * see doomed vnodes.  If inactive processing was delayed in
2152 * vput try to do it here.
2153 */
2154int
2155vget(struct vnode *vp, int flags, struct thread *td)
2156{
2157	int error;
2158
2159	error = 0;
2160	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
2161	    ("vget: invalid lock operation"));
2162	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
2163
2164	if ((flags & LK_INTERLOCK) == 0)
2165		VI_LOCK(vp);
2166	vholdl(vp);
2167	if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) {
2168		vdrop(vp);
2169		CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
2170		    vp);
2171		return (error);
2172	}
2173	if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
2174		panic("vget: vn_lock failed to return ENOENT\n");
2175	VI_LOCK(vp);
2176	/* Upgrade our holdcnt to a usecount. */
2177	v_upgrade_usecount(vp);
2178	/*
2179	 * We don't guarantee that any particular close will
2180	 * trigger inactive processing, so just make a best effort
2181	 * here at preventing a reference to a removed file.  If
2182	 * we don't succeed, no harm is done.
2183	 */
2184	if (vp->v_iflag & VI_OWEINACT) {
2185		if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
2186		    (flags & LK_NOWAIT) == 0)
2187			vinactive(vp, td);
2188		vp->v_iflag &= ~VI_OWEINACT;
2189	}
2190	VI_UNLOCK(vp);
2191	return (0);
2192}
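
/*
 * Illustrative sketch: a caller that already holds the vnode interlock
 * passes LK_INTERLOCK so that vget() consumes it, for example:
 *
 *	VI_LOCK(vp);
 *	error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY, curthread);
 *	if (error == 0) {
 *		... use the referenced, exclusively locked vnode ...
 *		vput(vp);
 *	}
 *
 * Without LK_RETRY the lock attempt fails on a doomed vnode instead of
 * returning it.
 */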
2193
2194/*
2195 * Increase the reference count of a vnode.
2196 */
2197void
2198vref(struct vnode *vp)
2199{
2200
2201	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2202	VI_LOCK(vp);
2203	v_incr_usecount(vp);
2204	VI_UNLOCK(vp);
2205}
2206
2207/*
2208 * Return reference count of a vnode.
2209 *
2210 * The results of this call are only guaranteed when some mechanism other
2211 * than the VI lock is used to stop other processes from gaining references
2212 * to the vnode.  This may be the case if the caller holds the only reference.
2213 * This is also useful when stale data is acceptable as race conditions may
2214 * be accounted for by some other means.
2215 */
2216int
2217vrefcnt(struct vnode *vp)
2218{
2219	int usecnt;
2220
2221	VI_LOCK(vp);
2222	usecnt = vp->v_usecount;
2223	VI_UNLOCK(vp);
2224
2225	return (usecnt);
2226}
2227
2228#define	VPUTX_VRELE	1
2229#define	VPUTX_VPUT	2
2230#define	VPUTX_VUNREF	3
2231
2232static void
2233vputx(struct vnode *vp, int func)
2234{
2235	int error;
2236
2237	KASSERT(vp != NULL, ("vputx: null vp"));
2238	if (func == VPUTX_VUNREF)
2239		ASSERT_VOP_LOCKED(vp, "vunref");
2240	else if (func == VPUTX_VPUT)
2241		ASSERT_VOP_LOCKED(vp, "vput");
2242	else
2243		KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
2244	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2245	VI_LOCK(vp);
2246
2247	/* Skip this v_writecount check if we're going to panic below. */
2248	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
2249	    ("vputx: missed vn_close"));
2250	error = 0;
2251
2252	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2253	    vp->v_usecount == 1)) {
2254		if (func == VPUTX_VPUT)
2255			VOP_UNLOCK(vp, 0);
2256		v_decr_usecount(vp);
2257		return;
2258	}
2259
2260	if (vp->v_usecount != 1) {
2261		vprint("vputx: negative ref count", vp);
2262		panic("vputx: negative ref cnt");
2263	}
2264	CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp);
2265	/*
2266	 * We want to hold the vnode until the inactive finishes to
2267	 * prevent vgone() races.  We drop the use count here and the
2268	 * hold count below when we're done.
2269	 */
2270	v_decr_useonly(vp);
2271	/*
2272	 * We must call VOP_INACTIVE with the node locked. Mark
2273	 * as VI_DOINGINACT to avoid recursion.
2274	 */
2275	vp->v_iflag |= VI_OWEINACT;
2276	switch (func) {
2277	case VPUTX_VRELE:
2278		error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
2279		VI_LOCK(vp);
2280		break;
2281	case VPUTX_VPUT:
2282		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2283			error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
2284			    LK_NOWAIT);
2285			VI_LOCK(vp);
2286		}
2287		break;
2288	case VPUTX_VUNREF:
2289		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
2290			error = EBUSY;
2291		break;
2292	}
2293	if (vp->v_usecount > 0)
2294		vp->v_iflag &= ~VI_OWEINACT;
2295	if (error == 0) {
2296		if (vp->v_iflag & VI_OWEINACT)
2297			vinactive(vp, curthread);
2298		if (func != VPUTX_VUNREF)
2299			VOP_UNLOCK(vp, 0);
2300	}
2301	vdropl(vp);
2302}
2303
2304/*
2305 * Vnode put/release.
2306 * If count drops to zero, call inactive routine and return to freelist.
2307 */
2308void
2309vrele(struct vnode *vp)
2310{
2311
2312	vputx(vp, VPUTX_VRELE);
2313}
2314
2315/*
2316 * Release an already locked vnode.  This gives the same effect as
2317 * unlock+vrele(), but takes less time and avoids releasing and
2318 * re-acquiring the lock (as vrele() acquires the lock internally.)
2319 */
2320void
2321vput(struct vnode *vp)
2322{
2323
2324	vputx(vp, VPUTX_VPUT);
2325}
2326
2327/*
2328 * Release an exclusively locked vnode. Do not unlock the vnode lock.
2329 */
2330void
2331vunref(struct vnode *vp)
2332{
2333
2334	vputx(vp, VPUTX_VUNREF);
2335}
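
/*
 * Illustrative summary of the three vputx() wrappers above:
 *
 *	vrele(vp)	vnode unlocked on entry; any lock needed is taken
 *			internally
 *	vput(vp)	vnode locked on entry; unlocked on return
 *	vunref(vp)	vnode exclusively locked on entry; the lock is kept
 */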
2336
2337/*
2338 * Somebody doesn't want the vnode recycled.
2339 */
2340void
2341vhold(struct vnode *vp)
2342{
2343
2344	VI_LOCK(vp);
2345	vholdl(vp);
2346	VI_UNLOCK(vp);
2347}
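
/*
 * Illustrative sketch: vhold()/vdrop() are typically used to keep a vnode
 * from being recycled while its interlock must be released, as
 * sync_vnode() does above:
 *
 *	vholdl(vp);
 *	VI_UNLOCK(vp);
 *	... sleep, or acquire other locks ...
 *	vdrop(vp);
 */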
2348
2349/*
2350 * Increase the hold count and activate if this is the first reference.
2351 */
2352void
2353vholdl(struct vnode *vp)
2354{
2355	struct mount *mp;
2356
2357	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2358	vp->v_holdcnt++;
2359	if (!VSHOULDBUSY(vp))
2360		return;
2361	ASSERT_VI_LOCKED(vp, "vholdl");
2362	VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free"));
2363	VNASSERT(vp->v_op != NULL, vp, ("vholdl: vnode already reclaimed."));
2364	/*
2365	 * Remove a vnode from the free list, mark it as in use,
2366	 * and put it on the active list.
2367	 */
2368	mtx_lock(&vnode_free_list_mtx);
2369	TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
2370	freevnodes--;
2371	vp->v_iflag &= ~(VI_FREE|VI_AGE);
2372	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
2373	    ("Activating already active vnode"));
2374	vp->v_iflag |= VI_ACTIVE;
2375	mp = vp->v_mount;
2376	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
2377	mp->mnt_activevnodelistsize++;
2378	mtx_unlock(&vnode_free_list_mtx);
2379}
2380
2381/*
2382 * Note that there is one less who cares about this vnode.
2383 * vdrop() is the opposite of vhold().
2384 */
2385void
2386vdrop(struct vnode *vp)
2387{
2388
2389	VI_LOCK(vp);
2390	vdropl(vp);
2391}
2392
2393/*
2394 * Drop the hold count of the vnode.  If this is the last reference to
2395 * the vnode we place it on the free list unless it has been vgone'd
2396 * (marked VI_DOOMED) in which case we will free it.
2397 */
2398void
2399vdropl(struct vnode *vp)
2400{
2401	struct bufobj *bo;
2402	struct mount *mp;
2403	int active;
2404
2405	ASSERT_VI_LOCKED(vp, "vdropl");
2406	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2407	if (vp->v_holdcnt <= 0)
2408		panic("vdrop: holdcnt %d", vp->v_holdcnt);
2409	vp->v_holdcnt--;
2410	if (vp->v_holdcnt > 0) {
2411		VI_UNLOCK(vp);
2412		return;
2413	}
2414	if ((vp->v_iflag & VI_DOOMED) == 0) {
2415		/*
2416		 * Mark a vnode as free: remove it from its active list
2417		 * and put it up for recycling on the freelist.
2418		 */
2419		VNASSERT(vp->v_op != NULL, vp,
2420		    ("vdropl: vnode already reclaimed."));
2421		VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2422		    ("vnode already free"));
2423		VNASSERT(VSHOULDFREE(vp), vp,
2424		    ("vdropl: freeing when we shouldn't"));
2425		active = vp->v_iflag & VI_ACTIVE;
2426		vp->v_iflag &= ~VI_ACTIVE;
2427		mp = vp->v_mount;
2428		mtx_lock(&vnode_free_list_mtx);
2429		if (active) {
2430			TAILQ_REMOVE(&mp->mnt_activevnodelist, vp,
2431			    v_actfreelist);
2432			mp->mnt_activevnodelistsize--;
2433		}
2434		if (vp->v_iflag & VI_AGE) {
2435			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_actfreelist);
2436		} else {
2437			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
2438		}
2439		freevnodes++;
2440		vp->v_iflag &= ~VI_AGE;
2441		vp->v_iflag |= VI_FREE;
2442		mtx_unlock(&vnode_free_list_mtx);
2443		VI_UNLOCK(vp);
2444		return;
2445	}
2446	/*
2447	 * The vnode has been marked for destruction, so free it.
2448	 */
2449	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
2450	mtx_lock(&vnode_free_list_mtx);
2451	numvnodes--;
2452	mtx_unlock(&vnode_free_list_mtx);
2453	bo = &vp->v_bufobj;
2454	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2455	    ("cleaned vnode still on the free list."));
2456	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
2457	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
2458	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
2459	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
2460	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
2461	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
2462	VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL"));
2463	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
2464	VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL"));
2465	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
2466	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
2467	VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
2468	VI_UNLOCK(vp);
2469#ifdef MAC
2470	mac_vnode_destroy(vp);
2471#endif
2472	if (vp->v_pollinfo != NULL)
2473		destroy_vpollinfo(vp->v_pollinfo);
2474#ifdef INVARIANTS
2475	/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
2476	vp->v_op = NULL;
2477#endif
2478	rangelock_destroy(&vp->v_rl);
2479	lockdestroy(vp->v_vnlock);
2480	mtx_destroy(&vp->v_interlock);
2481	mtx_destroy(BO_MTX(bo));
2482	uma_zfree(vnode_zone, vp);
2483}
2484
2485/*
2486 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
2487 * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
2488 * OWEINACT tracks whether a vnode missed a call to inactive due to a
2489 * failed lock upgrade.
2490 */
2491void
2492vinactive(struct vnode *vp, struct thread *td)
2493{
2494	struct vm_object *obj;
2495
2496	ASSERT_VOP_ELOCKED(vp, "vinactive");
2497	ASSERT_VI_LOCKED(vp, "vinactive");
2498	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
2499	    ("vinactive: recursed on VI_DOINGINACT"));
2500	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2501	vp->v_iflag |= VI_DOINGINACT;
2502	vp->v_iflag &= ~VI_OWEINACT;
2503	VI_UNLOCK(vp);
2504	/*
2505	 * Before moving off the active list, we must be sure that any
2506	 * modified pages are on the vnode's dirty list since these will
2507	 * no longer be checked once the vnode is on the inactive list.
2508	 * Because the vnode vm object keeps a hold reference on the vnode
2509	 * if there is at least one resident non-cached page, the vnode
2510	 * cannot leave the active list without the page cleanup done.
2511	 */
2512	obj = vp->v_object;
2513	if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) {
2514		VM_OBJECT_WLOCK(obj);
2515		vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC);
2516		VM_OBJECT_WUNLOCK(obj);
2517	}
2518	VOP_INACTIVE(vp, td);
2519	VI_LOCK(vp);
2520	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
2521	    ("vinactive: lost VI_DOINGINACT"));
2522	vp->v_iflag &= ~VI_DOINGINACT;
2523}
2524
2525/*
2526 * Remove any vnodes in the vnode table belonging to mount point mp.
2527 *
2528 * If FORCECLOSE is not specified, there should not be any active ones,
2529 * return error if any are found (nb: this is a user error, not a
2530 * system error). If FORCECLOSE is specified, detach any active vnodes
2531 * that are found.
2532 *
2533 * If WRITECLOSE is set, only flush out regular file vnodes open for
2534 * writing.
2535 *
2536 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2537 *
2538 * `rootrefs' specifies the base reference count for the root vnode
2539 * of this filesystem. The root vnode is considered busy if its
2540 * v_usecount exceeds this value. On a successful return, vflush()
2541 * will call vrele() on the root vnode exactly rootrefs times.
2542 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2543 * be zero.
2544 */
2545#ifdef DIAGNOSTIC
2546static int busyprt = 0;		/* print out busy vnodes */
2547SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
2548#endif
2549
2550int
2551vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
2552{
2553	struct vnode *vp, *mvp, *rootvp = NULL;
2554	struct vattr vattr;
2555	int busy = 0, error;
2556
2557	CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
2558	    rootrefs, flags);
2559	if (rootrefs > 0) {
2560		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2561		    ("vflush: bad args"));
2562		/*
2563		 * Get the filesystem root vnode. We can vput() it
2564		 * immediately, since with rootrefs > 0, it won't go away.
2565		 */
2566		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
2567			CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
2568			    __func__, error);
2569			return (error);
2570		}
2571		vput(rootvp);
2572	}
2573loop:
2574	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2575		vholdl(vp);
2576		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
2577		if (error) {
2578			vdrop(vp);
2579			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2580			goto loop;
2581		}
2582		/*
2583		 * Skip over vnodes marked VV_SYSTEM.
2584		 */
2585		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2586			VOP_UNLOCK(vp, 0);
2587			vdrop(vp);
2588			continue;
2589		}
2590		/*
2591		 * If WRITECLOSE is set, flush out unlinked but still open
2592		 * files (even if open only for reading) and regular file
2593		 * vnodes open for writing.
2594		 */
2595		if (flags & WRITECLOSE) {
2596			if (vp->v_object != NULL) {
2597				VM_OBJECT_WLOCK(vp->v_object);
2598				vm_object_page_clean(vp->v_object, 0, 0, 0);
2599				VM_OBJECT_WUNLOCK(vp->v_object);
2600			}
2601			error = VOP_FSYNC(vp, MNT_WAIT, td);
2602			if (error != 0) {
2603				VOP_UNLOCK(vp, 0);
2604				vdrop(vp);
2605				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2606				return (error);
2607			}
2608			error = VOP_GETATTR(vp, &vattr, td->td_ucred);
2609			VI_LOCK(vp);
2610
2611			if ((vp->v_type == VNON ||
2612			    (error == 0 && vattr.va_nlink > 0)) &&
2613			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
2614				VOP_UNLOCK(vp, 0);
2615				vdropl(vp);
2616				continue;
2617			}
2618		} else
2619			VI_LOCK(vp);
2620		/*
2621		 * With v_usecount == 0, all we need to do is clear out the
2622		 * vnode data structures and we are done.
2623		 *
2624		 * If FORCECLOSE is set, forcibly close the vnode.
2625		 */
2626		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
2627			VNASSERT(vp->v_usecount == 0 ||
2628			    (vp->v_type != VCHR && vp->v_type != VBLK), vp,
2629			    ("device VNODE %p is FORCECLOSED", vp));
2630			vgonel(vp);
2631		} else {
2632			busy++;
2633#ifdef DIAGNOSTIC
2634			if (busyprt)
2635				vprint("vflush: busy vnode", vp);
2636#endif
2637		}
2638		VOP_UNLOCK(vp, 0);
2639		vdropl(vp);
2640	}
2641	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2642		/*
2643		 * If just the root vnode is busy, and if its refcount
2644		 * is equal to `rootrefs', then go ahead and kill it.
2645		 */
2646		VI_LOCK(rootvp);
2647		KASSERT(busy > 0, ("vflush: not busy"));
2648		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
2649		    ("vflush: usecount %d < rootrefs %d",
2650		     rootvp->v_usecount, rootrefs));
2651		if (busy == 1 && rootvp->v_usecount == rootrefs) {
2652			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
2653			vgone(rootvp);
2654			VOP_UNLOCK(rootvp, 0);
2655			busy = 0;
2656		} else
2657			VI_UNLOCK(rootvp);
2658	}
2659	if (busy) {
2660		CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
2661		    busy);
2662		return (EBUSY);
2663	}
2664	for (; rootrefs > 0; rootrefs--)
2665		vrele(rootvp);
2666	return (0);
2667}
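
/*
 * Illustrative sketch (hypothetical caller): an unmount routine that still
 * holds one reference on the root vnode might call:
 *
 *	error = vflush(mp, 1, (mntflags & MNT_FORCE) ? FORCECLOSE : 0, td);
 *
 * mntflags is a made-up caller-supplied flag word; a rootrefs of 1 asks
 * vflush() to vrele() the root vnode once on success.
 */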
2668
2669/*
2670 * Recycle an unused vnode to the front of the free list.
2671 */
2672int
2673vrecycle(struct vnode *vp)
2674{
2675	int recycled;
2676
2677	ASSERT_VOP_ELOCKED(vp, "vrecycle");
2678	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2679	recycled = 0;
2680	VI_LOCK(vp);
2681	if (vp->v_usecount == 0) {
2682		recycled = 1;
2683		vgonel(vp);
2684	}
2685	VI_UNLOCK(vp);
2686	return (recycled);
2687}
2688
2689/*
2690 * Eliminate all activity associated with a vnode
2691 * in preparation for reuse.
2692 */
2693void
2694vgone(struct vnode *vp)
2695{
2696	VI_LOCK(vp);
2697	vgonel(vp);
2698	VI_UNLOCK(vp);
2699}
2700
2701static void
2702vgonel_reclaim_lowervp_vfs(struct mount *mp __unused,
2703    struct vnode *lowervp __unused)
2704{
2705}
2706
2707/*
2708 * Notify upper mounts about reclaimed vnode.
2709 */
2710static void
2711vgonel_reclaim_lowervp(struct vnode *vp)
2712{
2713	static struct vfsops vgonel_vfsops = {
2714		.vfs_reclaim_lowervp = vgonel_reclaim_lowervp_vfs
2715	};
2716	struct mount *mp, *ump, *mmp;
2717
2718	mp = vp->v_mount;
2719	if (mp == NULL)
2720		return;
2721
2722	MNT_ILOCK(mp);
2723	if (TAILQ_EMPTY(&mp->mnt_uppers))
2724		goto unlock;
2725	MNT_IUNLOCK(mp);
2726	mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
2727	mmp->mnt_op = &vgonel_vfsops;
2728	mmp->mnt_kern_flag |= MNTK_MARKER;
2729	MNT_ILOCK(mp);
2730	mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
2731	for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
2732		if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
2733			ump = TAILQ_NEXT(ump, mnt_upper_link);
2734			continue;
2735		}
2736		TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
2737		MNT_IUNLOCK(mp);
2738		VFS_RECLAIM_LOWERVP(ump, vp);
2739		MNT_ILOCK(mp);
2740		ump = TAILQ_NEXT(mmp, mnt_upper_link);
2741		TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
2742	}
2743	free(mmp, M_TEMP);
2744	mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
2745	if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
2746		mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
2747		wakeup(&mp->mnt_uppers);
2748	}
2749unlock:
2750	MNT_IUNLOCK(mp);
2751}
2752
2753/*
2754 * vgone, with the vp interlock held.
2755 */
2756void
2757vgonel(struct vnode *vp)
2758{
2759	struct thread *td;
2760	int oweinact;
2761	int active;
2762	struct mount *mp;
2763
2764	ASSERT_VOP_ELOCKED(vp, "vgonel");
2765	ASSERT_VI_LOCKED(vp, "vgonel");
2766	VNASSERT(vp->v_holdcnt, vp,
2767	    ("vgonel: vp %p has no reference.", vp));
2768	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2769	td = curthread;
2770
2771	/*
2772	 * Don't vgonel if we're already doomed.
2773	 */
2774	if (vp->v_iflag & VI_DOOMED)
2775		return;
2776	vp->v_iflag |= VI_DOOMED;
2777
2778	/*
2779	 * Check to see if the vnode is in use.  If so, we have to call
2780	 * VOP_CLOSE() and VOP_INACTIVE().
2781	 */
2782	active = vp->v_usecount;
2783	oweinact = (vp->v_iflag & VI_OWEINACT);
2784	VI_UNLOCK(vp);
2785	vgonel_reclaim_lowervp(vp);
2786
2787	/*
2788	 * Clean out any buffers associated with the vnode.
2789	 * If the flush fails, just toss the buffers.
2790	 */
2791	mp = NULL;
2792	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
2793		(void) vn_start_secondary_write(vp, &mp, V_WAIT);
2794	if (vinvalbuf(vp, V_SAVE, 0, 0) != 0)
2795		vinvalbuf(vp, 0, 0, 0);
2796
2797	/*
2798	 * If purging an active vnode, it must be closed and
2799	 * deactivated before being reclaimed.
2800	 */
2801	if (active)
2802		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2803	if (oweinact || active) {
2804		VI_LOCK(vp);
2805		if ((vp->v_iflag & VI_DOINGINACT) == 0)
2806			vinactive(vp, td);
2807		VI_UNLOCK(vp);
2808	}
2809	if (vp->v_type == VSOCK)
2810		vfs_unp_reclaim(vp);
2811	/*
2812	 * Reclaim the vnode.
2813	 */
2814	if (VOP_RECLAIM(vp, td))
2815		panic("vgone: cannot reclaim");
2816	if (mp != NULL)
2817		vn_finished_secondary_write(mp);
2818	VNASSERT(vp->v_object == NULL, vp,
2819	    ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
2820	/*
2821	 * Clear the advisory locks and wake up waiting threads.
2822	 */
2823	(void)VOP_ADVLOCKPURGE(vp);
2824	/*
2825	 * Delete from old mount point vnode list.
2826	 */
2827	delmntque(vp);
2828	cache_purge(vp);
2829	/*
2830	 * Done with the purge; reset to the standard lock and invalidate
2831	 * the vnode.
2832	 */
2833	VI_LOCK(vp);
2834	vp->v_vnlock = &vp->v_lock;
2835	vp->v_op = &dead_vnodeops;
2836	vp->v_tag = "none";
2837	vp->v_type = VBAD;
2838}
2839
2840/*
2841 * Calculate the total number of references to a special device.
2842 */
2843int
2844vcount(struct vnode *vp)
2845{
2846	int count;
2847
2848	dev_lock();
2849	count = vp->v_rdev->si_usecount;
2850	dev_unlock();
2851	return (count);
2852}
2853
2854/*
2855 * Same as above, but using the struct cdev * as the argument.
2856 */
2857int
2858count_dev(struct cdev *dev)
2859{
2860	int count;
2861
2862	dev_lock();
2863	count = dev->si_usecount;
2864	dev_unlock();
2865	return(count);
2866}
2867
2868/*
2869 * Print out a description of a vnode.
2870 */
2871static char *typename[] =
2872{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
2873 "VMARKER"};
2874
2875void
2876vn_printf(struct vnode *vp, const char *fmt, ...)
2877{
2878	va_list ap;
2879	char buf[256], buf2[16];
2880	u_long flags;
2881
2882	va_start(ap, fmt);
2883	vprintf(fmt, ap);
2884	va_end(ap);
2885	printf("%p: ", (void *)vp);
2886	printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
2887	printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
2888	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
2889	buf[0] = '\0';
2890	buf[1] = '\0';
2891	if (vp->v_vflag & VV_ROOT)
2892		strlcat(buf, "|VV_ROOT", sizeof(buf));
2893	if (vp->v_vflag & VV_ISTTY)
2894		strlcat(buf, "|VV_ISTTY", sizeof(buf));
2895	if (vp->v_vflag & VV_NOSYNC)
2896		strlcat(buf, "|VV_NOSYNC", sizeof(buf));
2897	if (vp->v_vflag & VV_ETERNALDEV)
2898		strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
2899	if (vp->v_vflag & VV_CACHEDLABEL)
2900		strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
2901	if (vp->v_vflag & VV_TEXT)
2902		strlcat(buf, "|VV_TEXT", sizeof(buf));
2903	if (vp->v_vflag & VV_COPYONWRITE)
2904		strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
2905	if (vp->v_vflag & VV_SYSTEM)
2906		strlcat(buf, "|VV_SYSTEM", sizeof(buf));
2907	if (vp->v_vflag & VV_PROCDEP)
2908		strlcat(buf, "|VV_PROCDEP", sizeof(buf));
2909	if (vp->v_vflag & VV_NOKNOTE)
2910		strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
2911	if (vp->v_vflag & VV_DELETED)
2912		strlcat(buf, "|VV_DELETED", sizeof(buf));
2913	if (vp->v_vflag & VV_MD)
2914		strlcat(buf, "|VV_MD", sizeof(buf));
2915	if (vp->v_vflag & VV_FORCEINSMQ)
2916		strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
2917	flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
2918	    VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
2919	    VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ);
2920	if (flags != 0) {
2921		snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
2922		strlcat(buf, buf2, sizeof(buf));
2923	}
2924	if (vp->v_iflag & VI_MOUNT)
2925		strlcat(buf, "|VI_MOUNT", sizeof(buf));
2926	if (vp->v_iflag & VI_AGE)
2927		strlcat(buf, "|VI_AGE", sizeof(buf));
2928	if (vp->v_iflag & VI_DOOMED)
2929		strlcat(buf, "|VI_DOOMED", sizeof(buf));
2930	if (vp->v_iflag & VI_FREE)
2931		strlcat(buf, "|VI_FREE", sizeof(buf));
2932	if (vp->v_iflag & VI_ACTIVE)
2933		strlcat(buf, "|VI_ACTIVE", sizeof(buf));
2934	if (vp->v_iflag & VI_DOINGINACT)
2935		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
2936	if (vp->v_iflag & VI_OWEINACT)
2937		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
2938	flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
2939	    VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT);
2940	if (flags != 0) {
2941		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
2942		strlcat(buf, buf2, sizeof(buf));
2943	}
2944	printf("    flags (%s)\n", buf + 1);
2945	if (mtx_owned(VI_MTX(vp)))
2946		printf(" VI_LOCKed");
2947	if (vp->v_object != NULL)
2948		printf("    v_object %p ref %d pages %d\n",
2949		    vp->v_object, vp->v_object->ref_count,
2950		    vp->v_object->resident_page_count);
2951	printf("    ");
2952	lockmgr_printinfo(vp->v_vnlock);
2953	if (vp->v_data != NULL)
2954		VOP_PRINT(vp);
2955}
2956
2957#ifdef DDB
2958/*
2959 * List all of the locked vnodes in the system.
2960 * Called when debugging the kernel.
2961 */
2962DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
2963{
2964	struct mount *mp, *nmp;
2965	struct vnode *vp;
2966
2967	/*
2968	 * Note: because this is DDB, we can't obey the locking semantics
2969	 * for these structures, which means we could catch an inconsistent
2970	 * state and dereference a nasty pointer.  Not much to be done
2971	 * about that.
2972	 */
2973	db_printf("Locked vnodes\n");
2974	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2975		nmp = TAILQ_NEXT(mp, mnt_list);
2976		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2977			if (vp->v_type != VMARKER &&
2978			    VOP_ISLOCKED(vp))
2979				vprint("", vp);
2980		}
2981		nmp = TAILQ_NEXT(mp, mnt_list);
2982	}
2983}
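
/*
 * Illustrative note: the command above is entered at the DDB prompt as
 * "show lockedvnods"; the second macro argument only names the handler
 * function.
 */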
2984
2985/*
2986 * Show details about the given vnode.
2987 */
2988DB_SHOW_COMMAND(vnode, db_show_vnode)
2989{
2990	struct vnode *vp;
2991
2992	if (!have_addr)
2993		return;
2994	vp = (struct vnode *)addr;
2995	vn_printf(vp, "vnode ");
2996}
2997
2998/*
2999 * Show details about the given mount point.
3000 */
3001DB_SHOW_COMMAND(mount, db_show_mount)
3002{
3003	struct mount *mp;
3004	struct vfsopt *opt;
3005	struct statfs *sp;
3006	struct vnode *vp;
3007	char buf[512];
3008	uint64_t mflags;
3009	u_int flags;
3010
3011	if (!have_addr) {
3012		/* No address given, print short info about all mount points. */
3013		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3014			db_printf("%p %s on %s (%s)\n", mp,
3015			    mp->mnt_stat.f_mntfromname,
3016			    mp->mnt_stat.f_mntonname,
3017			    mp->mnt_stat.f_fstypename);
3018			if (db_pager_quit)
3019				break;
3020		}
3021		db_printf("\nMore info: show mount <addr>\n");
3022		return;
3023	}
3024
3025	mp = (struct mount *)addr;
3026	db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
3027	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
3028
3029	buf[0] = '\0';
3030	mflags = mp->mnt_flag;
3031#define	MNT_FLAG(flag)	do {						\
3032	if (mflags & (flag)) {						\
3033		if (buf[0] != '\0')					\
3034			strlcat(buf, ", ", sizeof(buf));		\
3035		strlcat(buf, (#flag) + 4, sizeof(buf));			\
3036		mflags &= ~(flag);					\
3037	}								\
3038} while (0)
3039	MNT_FLAG(MNT_RDONLY);
3040	MNT_FLAG(MNT_SYNCHRONOUS);
3041	MNT_FLAG(MNT_NOEXEC);
3042	MNT_FLAG(MNT_NOSUID);
3043	MNT_FLAG(MNT_NFS4ACLS);
3044	MNT_FLAG(MNT_UNION);
3045	MNT_FLAG(MNT_ASYNC);
3046	MNT_FLAG(MNT_SUIDDIR);
3047	MNT_FLAG(MNT_SOFTDEP);
3048	MNT_FLAG(MNT_NOSYMFOLLOW);
3049	MNT_FLAG(MNT_GJOURNAL);
3050	MNT_FLAG(MNT_MULTILABEL);
3051	MNT_FLAG(MNT_ACLS);
3052	MNT_FLAG(MNT_NOATIME);
3053	MNT_FLAG(MNT_NOCLUSTERR);
3054	MNT_FLAG(MNT_NOCLUSTERW);
3055	MNT_FLAG(MNT_SUJ);
3056	MNT_FLAG(MNT_EXRDONLY);
3057	MNT_FLAG(MNT_EXPORTED);
3058	MNT_FLAG(MNT_DEFEXPORTED);
3059	MNT_FLAG(MNT_EXPORTANON);
3060	MNT_FLAG(MNT_EXKERB);
3061	MNT_FLAG(MNT_EXPUBLIC);
3062	MNT_FLAG(MNT_LOCAL);
3063	MNT_FLAG(MNT_QUOTA);
3064	MNT_FLAG(MNT_ROOTFS);
3065	MNT_FLAG(MNT_USER);
3066	MNT_FLAG(MNT_IGNORE);
3067	MNT_FLAG(MNT_UPDATE);
3068	MNT_FLAG(MNT_DELEXPORT);
3069	MNT_FLAG(MNT_RELOAD);
3070	MNT_FLAG(MNT_FORCE);
3071	MNT_FLAG(MNT_SNAPSHOT);
3072	MNT_FLAG(MNT_BYFSID);
3073#undef MNT_FLAG
3074	if (mflags != 0) {
3075		if (buf[0] != '\0')
3076			strlcat(buf, ", ", sizeof(buf));
3077		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3078		    "0x%016jx", mflags);
3079	}
3080	db_printf("    mnt_flag = %s\n", buf);
3081
3082	buf[0] = '\0';
3083	flags = mp->mnt_kern_flag;
3084#define	MNT_KERN_FLAG(flag)	do {					\
3085	if (flags & (flag)) {						\
3086		if (buf[0] != '\0')					\
3087			strlcat(buf, ", ", sizeof(buf));		\
3088		strlcat(buf, (#flag) + 5, sizeof(buf));			\
3089		flags &= ~(flag);					\
3090	}								\
3091} while (0)
3092	MNT_KERN_FLAG(MNTK_UNMOUNTF);
3093	MNT_KERN_FLAG(MNTK_ASYNC);
3094	MNT_KERN_FLAG(MNTK_SOFTDEP);
3095	MNT_KERN_FLAG(MNTK_NOINSMNTQ);
3096	MNT_KERN_FLAG(MNTK_DRAINING);
3097	MNT_KERN_FLAG(MNTK_REFEXPIRE);
3098	MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
3099	MNT_KERN_FLAG(MNTK_SHARED_WRITES);
3100	MNT_KERN_FLAG(MNTK_NO_IOPF);
3101	MNT_KERN_FLAG(MNTK_VGONE_UPPER);
3102	MNT_KERN_FLAG(MNTK_VGONE_WAITER);
3103	MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
3104	MNT_KERN_FLAG(MNTK_MARKER);
3105	MNT_KERN_FLAG(MNTK_NOASYNC);
3106	MNT_KERN_FLAG(MNTK_UNMOUNT);
3107	MNT_KERN_FLAG(MNTK_MWAIT);
3108	MNT_KERN_FLAG(MNTK_SUSPEND);
3109	MNT_KERN_FLAG(MNTK_SUSPEND2);
3110	MNT_KERN_FLAG(MNTK_SUSPENDED);
3111	MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
3112	MNT_KERN_FLAG(MNTK_NOKNOTE);
3113#undef MNT_KERN_FLAG
3114	if (flags != 0) {
3115		if (buf[0] != '\0')
3116			strlcat(buf, ", ", sizeof(buf));
3117		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3118		    "0x%08x", flags);
3119	}
3120	db_printf("    mnt_kern_flag = %s\n", buf);
3121
3122	db_printf("    mnt_opt = ");
3123	opt = TAILQ_FIRST(mp->mnt_opt);
3124	if (opt != NULL) {
3125		db_printf("%s", opt->name);
3126		opt = TAILQ_NEXT(opt, link);
3127		while (opt != NULL) {
3128			db_printf(", %s", opt->name);
3129			opt = TAILQ_NEXT(opt, link);
3130		}
3131	}
3132	db_printf("\n");
3133
3134	sp = &mp->mnt_stat;
3135	db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
3136	    "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
3137	    "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
3138	    "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
3139	    (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
3140	    (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
3141	    (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
3142	    (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
3143	    (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
3144	    (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
3145	    (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
3146	    (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
3147
3148	db_printf("    mnt_cred = { uid=%u ruid=%u",
3149	    (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
3150	if (jailed(mp->mnt_cred))
3151		db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
3152	db_printf(" }\n");
3153	db_printf("    mnt_ref = %d\n", mp->mnt_ref);
3154	db_printf("    mnt_gen = %d\n", mp->mnt_gen);
3155	db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
3156	db_printf("    mnt_activevnodelistsize = %d\n",
3157	    mp->mnt_activevnodelistsize);
3158	db_printf("    mnt_writeopcount = %d\n", mp->mnt_writeopcount);
3159	db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
3160	db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
3161	db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
3162	db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
3163	db_printf("    mnt_secondary_accwrites = %d\n",
3164	    mp->mnt_secondary_accwrites);
3165	db_printf("    mnt_gjprovider = %s\n",
3166	    mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
3167
3168	db_printf("\n\nList of active vnodes\n");
3169	TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
3170		if (vp->v_type != VMARKER) {
3171			vn_printf(vp, "vnode ");
3172			if (db_pager_quit)
3173				break;
3174		}
3175	}
3176	db_printf("\n\nList of inactive vnodes\n");
3177	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3178		if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) {
3179			vn_printf(vp, "vnode ");
3180			if (db_pager_quit)
3181				break;
3182		}
3183	}
3184}
3185#endif	/* DDB */
3186
3187/*
3188 * Fill in a struct xvfsconf based on a struct vfsconf.
3189 */
3190static int
3191vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
3192{
3193	struct xvfsconf xvfsp;
3194
3195	bzero(&xvfsp, sizeof(xvfsp));
3196	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
3197	xvfsp.vfc_typenum = vfsp->vfc_typenum;
3198	xvfsp.vfc_refcount = vfsp->vfc_refcount;
3199	xvfsp.vfc_flags = vfsp->vfc_flags;
3200	/*
3201	 * These are unused in userland, we keep them
3202	 * These are unused in userland; we keep them
3203	 * to avoid breaking binary compatibility.
3204	xvfsp.vfc_vfsops = NULL;
3205	xvfsp.vfc_next = NULL;
3206	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3207}
3208
3209#ifdef COMPAT_FREEBSD32
3210struct xvfsconf32 {
3211	uint32_t	vfc_vfsops;
3212	char		vfc_name[MFSNAMELEN];
3213	int32_t		vfc_typenum;
3214	int32_t		vfc_refcount;
3215	int32_t		vfc_flags;
3216	uint32_t	vfc_next;
3217};
3218
3219static int
3220vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
3221{
3222	struct xvfsconf32 xvfsp;
3223
3224	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
3225	xvfsp.vfc_typenum = vfsp->vfc_typenum;
3226	xvfsp.vfc_refcount = vfsp->vfc_refcount;
3227	xvfsp.vfc_flags = vfsp->vfc_flags;
3228	xvfsp.vfc_vfsops = 0;
3229	xvfsp.vfc_next = 0;
3230	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3231}
3232#endif
3233
3234/*
3235 * Top level filesystem related information gathering.
3236 */
3237static int
3238sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
3239{
3240	struct vfsconf *vfsp;
3241	int error;
3242
3243	error = 0;
3244	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3245#ifdef COMPAT_FREEBSD32
3246		if (req->flags & SCTL_MASK32)
3247			error = vfsconf2x32(req, vfsp);
3248		else
3249#endif
3250			error = vfsconf2x(req, vfsp);
3251		if (error)
3252			break;
3253	}
3254	return (error);
3255}
3256
3257SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD,
3258    NULL, 0, sysctl_vfs_conflist,
3259    "S,xvfsconf", "List of all configured filesystems");
3260
3261#ifndef BURN_BRIDGES
3262static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
3263
3264static int
3265vfs_sysctl(SYSCTL_HANDLER_ARGS)
3266{
3267	int *name = (int *)arg1 - 1;	/* XXX */
3268	u_int namelen = arg2 + 1;	/* XXX */
3269	struct vfsconf *vfsp;
3270
3271	log(LOG_WARNING, "userland calling deprecated sysctl, "
3272	    "please rebuild world\n");
3273
3274#if 1 || defined(COMPAT_PRELITE2)
3275	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
3276	if (namelen == 1)
3277		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
3278#endif
3279
3280	switch (name[1]) {
3281	case VFS_MAXTYPENUM:
3282		if (namelen != 2)
3283			return (ENOTDIR);
3284		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
3285	case VFS_CONF:
3286		if (namelen != 3)
3287			return (ENOTDIR);	/* overloaded */
3288		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
3289			if (vfsp->vfc_typenum == name[2])
3290				break;
3291		if (vfsp == NULL)
3292			return (EOPNOTSUPP);
3293#ifdef COMPAT_FREEBSD32
3294		if (req->flags & SCTL_MASK32)
3295			return (vfsconf2x32(req, vfsp));
3296		else
3297#endif
3298			return (vfsconf2x(req, vfsp));
3299	}
3300	return (EOPNOTSUPP);
3301}
3302
3303static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP,
3304    vfs_sysctl, "Generic filesystem");
3305
3306#if 1 || defined(COMPAT_PRELITE2)
3307
3308static int
3309sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
3310{
3311	int error;
3312	struct vfsconf *vfsp;
3313	struct ovfsconf ovfs;
3314
3315	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3316		bzero(&ovfs, sizeof(ovfs));
3317		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
3318		strcpy(ovfs.vfc_name, vfsp->vfc_name);
3319		ovfs.vfc_index = vfsp->vfc_typenum;
3320		ovfs.vfc_refcount = vfsp->vfc_refcount;
3321		ovfs.vfc_flags = vfsp->vfc_flags;
3322		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
3323		if (error)
3324			return error;
3325	}
3326	return 0;
3327}
3328
3329#endif /* 1 || COMPAT_PRELITE2 */
3330#endif /* !BURN_BRIDGES */
3331
3332#define KINFO_VNODESLOP		10
3333#ifdef notyet
3334/*
3335 * Dump vnode list (via sysctl).
3336 */
3337/* ARGSUSED */
3338static int
3339sysctl_vnode(SYSCTL_HANDLER_ARGS)
3340{
3341	struct xvnode *xvn;
3342	struct mount *mp;
3343	struct vnode *vp;
3344	int error, len, n;
3345
3346	/*
3347	 * Stale numvnodes access is not fatal here.
3348	 */
3349	req->lock = 0;
3350	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
3351	if (!req->oldptr)
3352		/* Make an estimate */
3353		return (SYSCTL_OUT(req, 0, len));
3354
3355	error = sysctl_wire_old_buffer(req, 0);
3356	if (error != 0)
3357		return (error);
3358	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
3359	n = 0;
3360	mtx_lock(&mountlist_mtx);
3361	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3362		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
3363			continue;
3364		MNT_ILOCK(mp);
3365		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3366			if (n == len)
3367				break;
3368			vref(vp);
3369			xvn[n].xv_size = sizeof *xvn;
3370			xvn[n].xv_vnode = vp;
3371			xvn[n].xv_id = 0;	/* XXX compat */
3372#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
3373			XV_COPY(usecount);
3374			XV_COPY(writecount);
3375			XV_COPY(holdcnt);
3376			XV_COPY(mount);
3377			XV_COPY(numoutput);
3378			XV_COPY(type);
3379#undef XV_COPY
3380			xvn[n].xv_flag = vp->v_vflag;
3381
3382			switch (vp->v_type) {
3383			case VREG:
3384			case VDIR:
3385			case VLNK:
3386				break;
3387			case VBLK:
3388			case VCHR:
3389				if (vp->v_rdev == NULL) {
3390					vrele(vp);
3391					continue;
3392				}
3393				xvn[n].xv_dev = dev2udev(vp->v_rdev);
3394				break;
3395			case VSOCK:
3396				xvn[n].xv_socket = vp->v_socket;
3397				break;
3398			case VFIFO:
3399				xvn[n].xv_fifo = vp->v_fifoinfo;
3400				break;
3401			case VNON:
3402			case VBAD:
3403			default:
3404				/* shouldn't happen? */
3405				vrele(vp);
3406				continue;
3407			}
3408			vrele(vp);
3409			++n;
3410		}
3411		MNT_IUNLOCK(mp);
3412		mtx_lock(&mountlist_mtx);
3413		vfs_unbusy(mp);
3414		if (n == len)
3415			break;
3416	}
3417	mtx_unlock(&mountlist_mtx);
3418
3419	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
3420	free(xvn, M_TEMP);
3421	return (error);
3422}
3423
3424SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
3425    0, 0, sysctl_vnode, "S,xvnode", "");
3426#endif
3427
3428/*
3429 * Unmount all filesystems. The list is traversed in reverse order
3430 * of mounting to avoid dependencies.
3431 */
3432void
3433vfs_unmountall(void)
3434{
3435	struct mount *mp;
3436	struct thread *td;
3437	int error;
3438
3439	CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
3440	td = curthread;
3441
3442	/*
3443	 * Since this only runs when rebooting, it is not interlocked.
3444	 */
3445	while(!TAILQ_EMPTY(&mountlist)) {
3446		mp = TAILQ_LAST(&mountlist, mntlist);
3447		error = dounmount(mp, MNT_FORCE, td);
3448		if (error) {
3449			TAILQ_REMOVE(&mountlist, mp, mnt_list);
3450			/*
3451			 * XXX: Due to the way in which we mount the root
3452			 * file system off of devfs, devfs will generate a
3453			 * "busy" warning when we try to unmount it before
3454			 * the root.  Don't print a warning as a result in
3455			 * order to avoid false positive errors that may
3456			 * cause needless upset.
3457			 */
3458			if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) {
3459				printf("unmount of %s failed (",
3460				    mp->mnt_stat.f_mntonname);
3461				if (error == EBUSY)
3462					printf("BUSY)\n");
3463				else
3464					printf("%d)\n", error);
3465			}
3466		} else {
3467			/* The unmount has removed mp from the mountlist */
3468		}
3469	}
3470}
3471
3472/*
3473 * Perform msync on all vnodes under a mount point.
3474 * The mount point must be locked.
3475 */
3476void
3477vfs_msync(struct mount *mp, int flags)
3478{
3479	struct vnode *vp, *mvp;
3480	struct vm_object *obj;
3481
3482	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
3483	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
3484		obj = vp->v_object;
3485		if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 &&
3486		    (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
3487			if (!vget(vp,
3488			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3489			    curthread)) {
3490				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
3491					vput(vp);
3492					continue;
3493				}
3494
3495				obj = vp->v_object;
3496				if (obj != NULL) {
3497					VM_OBJECT_WLOCK(obj);
3498					vm_object_page_clean(obj, 0, 0,
3499					    flags == MNT_WAIT ?
3500					    OBJPC_SYNC : OBJPC_NOSYNC);
3501					VM_OBJECT_WUNLOCK(obj);
3502				}
3503				vput(vp);
3504			}
3505		} else
3506			VI_UNLOCK(vp);
3507	}
3508}
3509
3510static void
3511destroy_vpollinfo(struct vpollinfo *vi)
3512{
3513	seldrain(&vi->vpi_selinfo);
3514	knlist_destroy(&vi->vpi_selinfo.si_note);
3515	mtx_destroy(&vi->vpi_lock);
3516	uma_zfree(vnodepoll_zone, vi);
3517}
3518
3519/*
3520 * Initialize per-vnode helper structure to hold poll-related state.
3521 */
3522void
3523v_addpollinfo(struct vnode *vp)
3524{
3525	struct vpollinfo *vi;
3526
3527	if (vp->v_pollinfo != NULL)
3528		return;
3529	vi = uma_zalloc(vnodepoll_zone, M_WAITOK);
3530	mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
3531	knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
3532	    vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
3533	VI_LOCK(vp);
3534	if (vp->v_pollinfo != NULL) {
3535		VI_UNLOCK(vp);
3536		destroy_vpollinfo(vi);
3537		return;
3538	}
3539	vp->v_pollinfo = vi;
3540	VI_UNLOCK(vp);
3541}
3542
3543/*
3544 * Record a process's interest in events which might happen to
3545 * a vnode.  Because poll uses the historic select-style interface
3546 * internally, this routine serves as both the ``check for any
3547 * pending events'' and the ``record my interest in future events''
3548 * functions.  (These are done together, while the lock is held,
3549 * to avoid race conditions.)
3550 */
3551int
3552vn_pollrecord(struct vnode *vp, struct thread *td, int events)
3553{
3554
3555	v_addpollinfo(vp);
3556	mtx_lock(&vp->v_pollinfo->vpi_lock);
3557	if (vp->v_pollinfo->vpi_revents & events) {
3558		/*
3559		 * This leaves events we are not interested
3560		 * in available for the other process which
3561		 * presumably had requested them
3562		 * (otherwise they would never have been
3563		 * recorded).
3564		 */
3565		events &= vp->v_pollinfo->vpi_revents;
3566		vp->v_pollinfo->vpi_revents &= ~events;
3567
3568		mtx_unlock(&vp->v_pollinfo->vpi_lock);
3569		return (events);
3570	}
3571	vp->v_pollinfo->vpi_events |= events;
3572	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3573	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3574	return (0);
3575}
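
/*
 * Illustrative sketch (hypothetical filesystem): a VOP_POLL implementation
 * can defer to the helper above, e.g.:
 *
 *	static int
 *	foo_poll(struct vop_poll_args *ap)
 *	{
 *
 *		return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
 *	}
 *
 * foo_poll is a made-up name used only for illustration.
 */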
3576
3577/*
3578 * Routine to create and manage a filesystem syncer vnode.
3579 */
3580#define sync_close ((int (*)(struct  vop_close_args *))nullop)
3581static int	sync_fsync(struct  vop_fsync_args *);
3582static int	sync_inactive(struct  vop_inactive_args *);
3583static int	sync_reclaim(struct  vop_reclaim_args *);
3584
3585static struct vop_vector sync_vnodeops = {
3586	.vop_bypass =	VOP_EOPNOTSUPP,
3587	.vop_close =	sync_close,		/* close */
3588	.vop_fsync =	sync_fsync,		/* fsync */
3589	.vop_inactive =	sync_inactive,	/* inactive */
3590	.vop_reclaim =	sync_reclaim,	/* reclaim */
3591	.vop_lock1 =	vop_stdlock,	/* lock */
3592	.vop_unlock =	vop_stdunlock,	/* unlock */
3593	.vop_islocked =	vop_stdislocked,	/* islocked */
3594};
3595
3596/*
3597 * Create a new filesystem syncer vnode for the specified mount point.
3598 */
3599void
3600vfs_allocate_syncvnode(struct mount *mp)
3601{
3602	struct vnode *vp;
3603	struct bufobj *bo;
3604	static long start, incr, next;
3605	int error;
3606
3607	/* Allocate a new vnode */
3608	error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
3609	if (error != 0)
3610		panic("vfs_allocate_syncvnode: getnewvnode() failed");
3611	vp->v_type = VNON;
3612	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3613	vp->v_vflag |= VV_FORCEINSMQ;
3614	error = insmntque(vp, mp);
3615	if (error != 0)
3616		panic("vfs_allocate_syncvnode: insmntque() failed");
3617	vp->v_vflag &= ~VV_FORCEINSMQ;
3618	VOP_UNLOCK(vp, 0);
3619	/*
3620	 * Place the vnode onto the syncer worklist. We attempt to
3621	 * scatter them about on the list so that they will go off
3622	 * at evenly distributed times even if all the filesystems
3623	 * are mounted at once.
3624	 */
3625	next += incr;
3626	if (next == 0 || next > syncer_maxdelay) {
3627		start /= 2;
3628		incr /= 2;
3629		if (start == 0) {
3630			start = syncer_maxdelay / 2;
3631			incr = syncer_maxdelay;
3632		}
3633		next = start;
3634	}
3635	bo = &vp->v_bufobj;
3636	BO_LOCK(bo);
3637	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
3638	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
3639	mtx_lock(&sync_mtx);
3640	sync_vnode_count++;
3641	if (mp->mnt_syncer == NULL) {
3642		mp->mnt_syncer = vp;
3643		vp = NULL;
3644	}
3645	mtx_unlock(&sync_mtx);
3646	BO_UNLOCK(bo);
3647	if (vp != NULL) {
3648		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3649		vgone(vp);
3650		vput(vp);
3651	}
3652}
3653
3654void
3655vfs_deallocate_syncvnode(struct mount *mp)
3656{
3657	struct vnode *vp;
3658
3659	mtx_lock(&sync_mtx);
3660	vp = mp->mnt_syncer;
3661	if (vp != NULL)
3662		mp->mnt_syncer = NULL;
3663	mtx_unlock(&sync_mtx);
3664	if (vp != NULL)
3665		vrele(vp);
3666}
3667
3668/*
3669 * Do a lazy sync of the filesystem.
3670 */
3671static int
3672sync_fsync(struct vop_fsync_args *ap)
3673{
3674	struct vnode *syncvp = ap->a_vp;
3675	struct mount *mp = syncvp->v_mount;
3676	int error, save;
3677	struct bufobj *bo;
3678
3679	/*
3680	 * We only need to do something if this is a lazy evaluation.
3681	 */
3682	if (ap->a_waitfor != MNT_LAZY)
3683		return (0);
3684
3685	/*
3686	 * Move ourselves to the back of the sync list.
3687	 */
3688	bo = &syncvp->v_bufobj;
3689	BO_LOCK(bo);
3690	vn_syncer_add_to_worklist(bo, syncdelay);
3691	BO_UNLOCK(bo);
3692
3693	/*
3694	 * Walk the list of vnodes pushing all that are dirty and
3695	 * not already on the sync list.
3696	 */
3697	mtx_lock(&mountlist_mtx);
3698	if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
3699		mtx_unlock(&mountlist_mtx);
3700		return (0);
3701	}
3702	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3703		vfs_unbusy(mp);
3704		return (0);
3705	}
3706	save = curthread_pflags_set(TDP_SYNCIO);
3707	vfs_msync(mp, MNT_NOWAIT);
3708	error = VFS_SYNC(mp, MNT_LAZY);
3709	curthread_pflags_restore(save);
3710	vn_finished_write(mp);
3711	vfs_unbusy(mp);
3712	return (error);
3713}
3714
3715/*
3716 * The syncer vnode is no longer referenced.
3717 */
3718static int
3719sync_inactive(struct vop_inactive_args *ap)
3720{
3721
3722	vgone(ap->a_vp);
3723	return (0);
3724}
3725
3726/*
3727 * The syncer vnode is no longer needed and is being decommissioned.
3728 *
3729 * Modifications to the worklist must be protected by sync_mtx.
3730 */
3731static int
3732sync_reclaim(struct vop_reclaim_args *ap)
3733{
3734	struct vnode *vp = ap->a_vp;
3735	struct bufobj *bo;
3736
3737	bo = &vp->v_bufobj;
3738	BO_LOCK(bo);
3739	mtx_lock(&sync_mtx);
3740	if (vp->v_mount->mnt_syncer == vp)
3741		vp->v_mount->mnt_syncer = NULL;
3742	if (bo->bo_flag & BO_ONWORKLST) {
3743		LIST_REMOVE(bo, bo_synclist);
3744		syncer_worklist_len--;
3745		sync_vnode_count--;
3746		bo->bo_flag &= ~BO_ONWORKLST;
3747	}
3748	mtx_unlock(&sync_mtx);
3749	BO_UNLOCK(bo);
3750
3751	return (0);
3752}
3753
3754/*
3755 * Check if vnode represents a disk device
3756 */
3757int
3758vn_isdisk(struct vnode *vp, int *errp)
3759{
3760	int error;
3761
3762	error = 0;
3763	dev_lock();
3764	if (vp->v_type != VCHR)
3765		error = ENOTBLK;
3766	else if (vp->v_rdev == NULL)
3767		error = ENXIO;
3768	else if (vp->v_rdev->si_devsw == NULL)
3769		error = ENXIO;
3770	else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
3771		error = ENOTBLK;
3772	dev_unlock();
3773	if (errp != NULL)
3774		*errp = error;
3775	return (error == 0);
3776}
3777
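/*
 * Editorial sketch (not part of the original file): typical use of
 * vn_isdisk() in a mount path that requires a disk-backed device vnode.
 * examplefs_check_devvp() is a hypothetical helper; the call pattern is
 * the common one.
 */
static int
examplefs_check_devvp(struct vnode *devvp)
{
	int error;

	if (!vn_isdisk(devvp, &error))
		return (error);		/* ENOTBLK or ENXIO, as set above */
	return (0);
}
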
3778/*
3779 * Common filesystem object access control check routine.  Accepts a
3780 * vnode's type, "mode", uid and gid, requested access mode, credentials,
3781 * and optional call-by-reference privused argument allowing vaccess()
3782 * to indicate to the caller whether privilege was used to satisfy the
3783 * request (obsoleted).  Returns 0 on success, or an errno on failure.
3784 */
3785int
3786vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
3787    accmode_t accmode, struct ucred *cred, int *privused)
3788{
3789	accmode_t dac_granted;
3790	accmode_t priv_granted;
3791
3792	KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
3793	    ("invalid bit in accmode"));
3794	KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
3795	    ("VAPPEND without VWRITE"));
3796
3797	/*
3798	 * Look for a normal, non-privileged way to access the file/directory
3799	 * as requested.  If it exists, go with that.
3800	 */
3801
3802	if (privused != NULL)
3803		*privused = 0;
3804
3805	dac_granted = 0;
3806
3807	/* Check the owner. */
3808	if (cred->cr_uid == file_uid) {
3809		dac_granted |= VADMIN;
3810		if (file_mode & S_IXUSR)
3811			dac_granted |= VEXEC;
3812		if (file_mode & S_IRUSR)
3813			dac_granted |= VREAD;
3814		if (file_mode & S_IWUSR)
3815			dac_granted |= (VWRITE | VAPPEND);
3816
3817		if ((accmode & dac_granted) == accmode)
3818			return (0);
3819
3820		goto privcheck;
3821	}
3822
3823	/* Otherwise, check the groups (first match) */
3824	if (groupmember(file_gid, cred)) {
3825		if (file_mode & S_IXGRP)
3826			dac_granted |= VEXEC;
3827		if (file_mode & S_IRGRP)
3828			dac_granted |= VREAD;
3829		if (file_mode & S_IWGRP)
3830			dac_granted |= (VWRITE | VAPPEND);
3831
3832		if ((accmode & dac_granted) == accmode)
3833			return (0);
3834
3835		goto privcheck;
3836	}
3837
3838	/* Otherwise, check everyone else. */
3839	if (file_mode & S_IXOTH)
3840		dac_granted |= VEXEC;
3841	if (file_mode & S_IROTH)
3842		dac_granted |= VREAD;
3843	if (file_mode & S_IWOTH)
3844		dac_granted |= (VWRITE | VAPPEND);
3845	if ((accmode & dac_granted) == accmode)
3846		return (0);
3847
3848privcheck:
3849	/*
3850	 * Build a privilege mask to determine if the set of privileges
3851	 * satisfies the requirements when combined with the granted mask
3852	 * from above.  For each privilege, if the privilege is required,
3853	 * bitwise or the request type onto the priv_granted mask.
3854	 */
3855	priv_granted = 0;
3856
3857	if (type == VDIR) {
3858		/*
3859		 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
3860		 * requests, instead of PRIV_VFS_EXEC.
3861		 */
3862		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3863		    !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
3864			priv_granted |= VEXEC;
3865	} else {
3866		/*
3867		 * Ensure that at least one execute bit is on. Otherwise,
3868		 * a privileged user will always succeed, and we don't want
3869		 * this to happen unless the file really is executable.
3870		 */
3871		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3872		    (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
3873		    !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
3874			priv_granted |= VEXEC;
3875	}
3876
3877	if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
3878	    !priv_check_cred(cred, PRIV_VFS_READ, 0))
3879		priv_granted |= VREAD;
3880
3881	if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3882	    !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
3883		priv_granted |= (VWRITE | VAPPEND);
3884
3885	if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3886	    !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
3887		priv_granted |= VADMIN;
3888
3889	if ((accmode & (priv_granted | dac_granted)) == accmode) {
3890		/* XXX audit: privilege used */
3891		if (privused != NULL)
3892			*privused = 1;
3893		return (0);
3894	}
3895
3896	return ((accmode & VADMIN) ? EPERM : EACCES);
3897}
3898
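/*
 * Editorial sketch (not part of the original file): the usual shape of a
 * filesystem's VOP_ACCESS() built on vaccess().  The examplefs_node
 * structure and EXAMPLEFS_VTON() accessor are hypothetical stand-ins for a
 * filesystem's in-memory inode.
 */
struct examplefs_node {
	mode_t	en_mode;
	uid_t	en_uid;
	gid_t	en_gid;
};
#define	EXAMPLEFS_VTON(vp)	((struct examplefs_node *)(vp)->v_data)

static int
examplefs_access(struct vop_access_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct examplefs_node *np = EXAMPLEFS_VTON(vp);

	/* Let the generic routine apply the owner/group/other mode bits. */
	return (vaccess(vp->v_type, np->en_mode, np->en_uid, np->en_gid,
	    ap->a_accmode, ap->a_cred, NULL));
}
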
3899/*
3900 * Credential check based on process requesting service, and per-attribute
3901 * permissions.
3902 */
3903int
3904extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
3905    struct thread *td, accmode_t accmode)
3906{
3907
3908	/*
3909	 * Kernel-invoked operations always succeed.
3910	 */
3911	if (cred == NOCRED)
3912		return (0);
3913
3914	/*
3915	 * Do not allow privileged processes in jail to directly manipulate
3916	 * system attributes.
3917	 */
3918	switch (attrnamespace) {
3919	case EXTATTR_NAMESPACE_SYSTEM:
3920		/* Potentially should be: return (EPERM); */
3921		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
3922	case EXTATTR_NAMESPACE_USER:
3923		return (VOP_ACCESS(vp, accmode, cred, td));
3924	default:
3925		return (EPERM);
3926	}
3927}
3928
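/*
 * Editorial sketch (not part of the original file): an extended attribute
 * VOP typically gates access through extattr_check_cred() before doing any
 * work.  examplefs_getextattr() is hypothetical; VREAD matches the
 * read-side convention, while set/delete operations would pass VWRITE.
 */
static int
examplefs_getextattr(struct vop_getextattr_args *ap)
{
	int error;

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error != 0)
		return (error);
	/* ... copy the attribute out via ap->a_uio ... */
	return (0);
}
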
3929#ifdef DEBUG_VFS_LOCKS
3930/*
3931 * This only exists to suppress warnings from unlocked specfs accesses.  It is
3932 * no longer ok to have an unlocked VFS.
3933 */
3934#define	IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL ||		\
3935	(vp)->v_type == VCHR ||	(vp)->v_type == VBAD)
3936
3937int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
3938SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
3939    "Drop into debugger on lock violation");
3940
3941int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
3942SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
3943    0, "Check for interlock across VOPs");
3944
3945int vfs_badlock_print = 1;	/* Print lock violations. */
3946SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
3947    0, "Print lock violations");
3948
3949#ifdef KDB
3950int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
3951SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
3952    &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
3953#endif
3954
3955static void
3956vfs_badlock(const char *msg, const char *str, struct vnode *vp)
3957{
3958
3959#ifdef KDB
3960	if (vfs_badlock_backtrace)
3961		kdb_backtrace();
3962#endif
3963	if (vfs_badlock_print)
3964		printf("%s: %p %s\n", str, (void *)vp, msg);
3965	if (vfs_badlock_ddb)
3966		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
3967}
3968
3969void
3970assert_vi_locked(struct vnode *vp, const char *str)
3971{
3972
3973	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
3974		vfs_badlock("interlock is not locked but should be", str, vp);
3975}
3976
3977void
3978assert_vi_unlocked(struct vnode *vp, const char *str)
3979{
3980
3981	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
3982		vfs_badlock("interlock is locked but should not be", str, vp);
3983}
3984
3985void
3986assert_vop_locked(struct vnode *vp, const char *str)
3987{
3988	int locked;
3989
3990	if (!IGNORE_LOCK(vp)) {
3991		locked = VOP_ISLOCKED(vp);
3992		if (locked == 0 || locked == LK_EXCLOTHER)
3993			vfs_badlock("is not locked but should be", str, vp);
3994	}
3995}
3996
3997void
3998assert_vop_unlocked(struct vnode *vp, const char *str)
3999{
4000
4001	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
4002		vfs_badlock("is locked but should not be", str, vp);
4003}
4004
4005void
4006assert_vop_elocked(struct vnode *vp, const char *str)
4007{
4008
4009	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
4010		vfs_badlock("is not exclusive locked but should be", str, vp);
4011}
4012
4013#if 0
4014void
4015assert_vop_elocked_other(struct vnode *vp, const char *str)
4016{
4017
4018	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER)
4019		vfs_badlock("is not exclusive locked by another thread",
4020		    str, vp);
4021}
4022
4023void
4024assert_vop_slocked(struct vnode *vp, const char *str)
4025{
4026
4027	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED)
4028		vfs_badlock("is not locked shared but should be", str, vp);
4029}
4030#endif /* 0 */
4031#endif /* DEBUG_VFS_LOCKS */
4032
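/*
 * Editorial sketch (not part of the original file): the assert_* routines
 * above back the ASSERT_VOP_*() and ASSERT_VI_*() macros from sys/vnode.h,
 * which compile away when DEBUG_VFS_LOCKS is not configured.  A vnode
 * operation can use them to document and enforce its locking contract;
 * examplefs_update() is hypothetical.
 */
static void
examplefs_update(struct vnode *vp)
{

	/* Fields covered by the vnode lock require the exclusive lock. */
	ASSERT_VOP_ELOCKED(vp, "examplefs_update");
	ASSERT_VI_UNLOCKED(vp, "examplefs_update");
	/* ... modify the node ... */
}
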
4033void
4034vop_rename_fail(struct vop_rename_args *ap)
4035{
4036
4037	if (ap->a_tvp != NULL)
4038		vput(ap->a_tvp);
4039	if (ap->a_tdvp == ap->a_tvp)
4040		vrele(ap->a_tdvp);
4041	else
4042		vput(ap->a_tdvp);
4043	vrele(ap->a_fdvp);
4044	vrele(ap->a_fvp);
4045}
4046
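/*
 * Editorial sketch (not part of the original file): VOP_RENAME() hands its
 * implementation referenced (and, for the target side, locked) vnodes that
 * must be released on every path; vop_rename_fail() covers the early-error
 * case.  examplefs_rename() is hypothetical, and the cross-mount check is
 * just a common example of such an early failure.
 */
static int
examplefs_rename(struct vop_rename_args *ap)
{

	if (ap->a_fvp->v_mount != ap->a_tdvp->v_mount ||
	    (ap->a_tvp != NULL && ap->a_fvp->v_mount != ap->a_tvp->v_mount)) {
		vop_rename_fail(ap);	/* drops all four vnodes */
		return (EXDEV);
	}
	/* ... perform the rename and release the vnodes itself ... */
	return (0);
}
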
4047void
4048vop_rename_pre(void *ap)
4049{
4050	struct vop_rename_args *a = ap;
4051
4052#ifdef DEBUG_VFS_LOCKS
4053	if (a->a_tvp)
4054		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
4055	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
4056	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
4057	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
4058
4059	/* Check the source (from). */
4060	if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
4061	    (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
4062		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
4063	if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
4064		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
4065
4066	/* Check the target. */
4067	if (a->a_tvp)
4068		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
4069	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
4070#endif
4071	if (a->a_tdvp != a->a_fdvp)
4072		vhold(a->a_fdvp);
4073	if (a->a_tvp != a->a_fvp)
4074		vhold(a->a_fvp);
4075	vhold(a->a_tdvp);
4076	if (a->a_tvp)
4077		vhold(a->a_tvp);
4078}
4079
4080void
4081vop_strategy_pre(void *ap)
4082{
4083#ifdef DEBUG_VFS_LOCKS
4084	struct vop_strategy_args *a;
4085	struct buf *bp;
4086
4087	a = ap;
4088	bp = a->a_bp;
4089
4090	/*
4091	 * Cluster ops lock their component buffers but not the IO container.
4092	 */
4093	if ((bp->b_flags & B_CLUSTER) != 0)
4094		return;
4095
4096	if (panicstr == NULL && !BUF_ISLOCKED(bp)) {
4097		if (vfs_badlock_print)
4098			printf(
4099			    "VOP_STRATEGY: bp is not locked but should be\n");
4100		if (vfs_badlock_ddb)
4101			kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
4102	}
4103#endif
4104}
4105
4106void
4107vop_lock_pre(void *ap)
4108{
4109#ifdef DEBUG_VFS_LOCKS
4110	struct vop_lock1_args *a = ap;
4111
4112	if ((a->a_flags & LK_INTERLOCK) == 0)
4113		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
4114	else
4115		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
4116#endif
4117}
4118
4119void
4120vop_lock_post(void *ap, int rc)
4121{
4122#ifdef DEBUG_VFS_LOCKS
4123	struct vop_lock1_args *a = ap;
4124
4125	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
4126	if (rc == 0)
4127		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
4128#endif
4129}
4130
4131void
4132vop_unlock_pre(void *ap)
4133{
4134#ifdef DEBUG_VFS_LOCKS
4135	struct vop_unlock_args *a = ap;
4136
4137	if (a->a_flags & LK_INTERLOCK)
4138		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
4139	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
4140#endif
4141}
4142
4143void
4144vop_unlock_post(void *ap, int rc)
4145{
4146#ifdef DEBUG_VFS_LOCKS
4147	struct vop_unlock_args *a = ap;
4148
4149	if (a->a_flags & LK_INTERLOCK)
4150		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
4151#endif
4152}
4153
4154void
4155vop_create_post(void *ap, int rc)
4156{
4157	struct vop_create_args *a = ap;
4158
4159	if (!rc)
4160		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4161}
4162
4163void
4164vop_deleteextattr_post(void *ap, int rc)
4165{
4166	struct vop_deleteextattr_args *a = ap;
4167
4168	if (!rc)
4169		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4170}
4171
4172void
4173vop_link_post(void *ap, int rc)
4174{
4175	struct vop_link_args *a = ap;
4176
4177	if (!rc) {
4178		VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
4179		VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
4180	}
4181}
4182
4183void
4184vop_mkdir_post(void *ap, int rc)
4185{
4186	struct vop_mkdir_args *a = ap;
4187
4188	if (!rc)
4189		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4190}
4191
4192void
4193vop_mknod_post(void *ap, int rc)
4194{
4195	struct vop_mknod_args *a = ap;
4196
4197	if (!rc)
4198		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4199}
4200
4201void
4202vop_remove_post(void *ap, int rc)
4203{
4204	struct vop_remove_args *a = ap;
4205
4206	if (!rc) {
4207		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4208		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4209	}
4210}
4211
4212void
4213vop_rename_post(void *ap, int rc)
4214{
4215	struct vop_rename_args *a = ap;
4216
4217	if (!rc) {
4218		VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
4219		VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
4220		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
4221		if (a->a_tvp)
4222			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
4223	}
4224	if (a->a_tdvp != a->a_fdvp)
4225		vdrop(a->a_fdvp);
4226	if (a->a_tvp != a->a_fvp)
4227		vdrop(a->a_fvp);
4228	vdrop(a->a_tdvp);
4229	if (a->a_tvp)
4230		vdrop(a->a_tvp);
4231}
4232
4233void
4234vop_rmdir_post(void *ap, int rc)
4235{
4236	struct vop_rmdir_args *a = ap;
4237
4238	if (!rc) {
4239		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4240		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4241	}
4242}
4243
4244void
4245vop_setattr_post(void *ap, int rc)
4246{
4247	struct vop_setattr_args *a = ap;
4248
4249	if (!rc)
4250		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4251}
4252
4253void
4254vop_setextattr_post(void *ap, int rc)
4255{
4256	struct vop_setextattr_args *a = ap;
4257
4258	if (!rc)
4259		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4260}
4261
4262void
4263vop_symlink_post(void *ap, int rc)
4264{
4265	struct vop_symlink_args *a = ap;
4266
4267	if (!rc)
4268		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4269}
4270
4271static struct knlist fs_knlist;
4272
4273static void
4274vfs_event_init(void *arg)
4275{
4276	knlist_init_mtx(&fs_knlist, NULL);
4277}
4278/* XXX - correct order? */
4279SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
4280
4281void
4282vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
4283{
4284
4285	KNOTE_UNLOCKED(&fs_knlist, event);
4286}
4287
4288static int	filt_fsattach(struct knote *kn);
4289static void	filt_fsdetach(struct knote *kn);
4290static int	filt_fsevent(struct knote *kn, long hint);
4291
4292struct filterops fs_filtops = {
4293	.f_isfd = 0,
4294	.f_attach = filt_fsattach,
4295	.f_detach = filt_fsdetach,
4296	.f_event = filt_fsevent
4297};
4298
4299static int
4300filt_fsattach(struct knote *kn)
4301{
4302
4303	kn->kn_flags |= EV_CLEAR;
4304	knlist_add(&fs_knlist, kn, 0);
4305	return (0);
4306}
4307
4308static void
4309filt_fsdetach(struct knote *kn)
4310{
4311
4312	knlist_remove(&fs_knlist, kn, 0);
4313}
4314
4315static int
4316filt_fsevent(struct knote *kn, long hint)
4317{
4318
4319	kn->kn_fflags |= hint;
4320	return (kn->kn_fflags != 0);
4321}
4322
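/*
 * Editorial sketch (not part of the original file): the consumer side of
 * fs_filtops is a userland kqueue(2) registration for EVFILT_FS.  The
 * returned fflags carry the VQ_* bits posted through vfs_event_signal().
 * Minimal standalone program; event handling beyond one wait is elided.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	struct kevent ev;
	int kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	/* EVFILT_FS is not descriptor-based; the ident is unused. */
	EV_SET(&ev, 0, EVFILT_FS, EV_ADD | EV_CLEAR, 0, 0, NULL);
	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent register");
	if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)
		err(1, "kevent wait");
	printf("fs event fflags 0x%x\n", (unsigned)ev.fflags);
	return (0);
}
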
4323static int
4324sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
4325{
4326	struct vfsidctl vc;
4327	int error;
4328	struct mount *mp;
4329
4330	error = SYSCTL_IN(req, &vc, sizeof(vc));
4331	if (error)
4332		return (error);
4333	if (vc.vc_vers != VFS_CTL_VERS1)
4334		return (EINVAL);
4335	mp = vfs_getvfs(&vc.vc_fsid);
4336	if (mp == NULL)
4337		return (ENOENT);
4338	/* ensure that a specific sysctl goes to the right filesystem. */
4339	if (strcmp(vc.vc_fstypename, "*") != 0 &&
4340	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
4341		vfs_rel(mp);
4342		return (EINVAL);
4343	}
4344	VCTLTOREQ(&vc, req);
4345	error = VFS_SYSCTL(mp, vc.vc_op, req);
4346	vfs_rel(mp);
4347	return (error);
4348}
4349
4350SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
4351    NULL, 0, sysctl_vfs_ctl, "",
4352    "Sysctl by fsid");
4353
4354/*
4355 * Function to initialize a va_filerev field sensibly.
4356 * XXX: Wouldn't a random number make a lot more sense ??
4357 */
4358u_quad_t
4359init_va_filerev(void)
4360{
4361	struct bintime bt;
4362
4363	getbinuptime(&bt);
4364	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
4365}
4366
4367static int	filt_vfsread(struct knote *kn, long hint);
4368static int	filt_vfswrite(struct knote *kn, long hint);
4369static int	filt_vfsvnode(struct knote *kn, long hint);
4370static void	filt_vfsdetach(struct knote *kn);
4371static struct filterops vfsread_filtops = {
4372	.f_isfd = 1,
4373	.f_detach = filt_vfsdetach,
4374	.f_event = filt_vfsread
4375};
4376static struct filterops vfswrite_filtops = {
4377	.f_isfd = 1,
4378	.f_detach = filt_vfsdetach,
4379	.f_event = filt_vfswrite
4380};
4381static struct filterops vfsvnode_filtops = {
4382	.f_isfd = 1,
4383	.f_detach = filt_vfsdetach,
4384	.f_event = filt_vfsvnode
4385};
4386
4387static void
4388vfs_knllock(void *arg)
4389{
4390	struct vnode *vp = arg;
4391
4392	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4393}
4394
4395static void
4396vfs_knlunlock(void *arg)
4397{
4398	struct vnode *vp = arg;
4399
4400	VOP_UNLOCK(vp, 0);
4401}
4402
4403static void
4404vfs_knl_assert_locked(void *arg)
4405{
4406#ifdef DEBUG_VFS_LOCKS
4407	struct vnode *vp = arg;
4408
4409	ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
4410#endif
4411}
4412
4413static void
4414vfs_knl_assert_unlocked(void *arg)
4415{
4416#ifdef DEBUG_VFS_LOCKS
4417	struct vnode *vp = arg;
4418
4419	ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
4420#endif
4421}
4422
4423int
4424vfs_kqfilter(struct vop_kqfilter_args *ap)
4425{
4426	struct vnode *vp = ap->a_vp;
4427	struct knote *kn = ap->a_kn;
4428	struct knlist *knl;
4429
4430	switch (kn->kn_filter) {
4431	case EVFILT_READ:
4432		kn->kn_fop = &vfsread_filtops;
4433		break;
4434	case EVFILT_WRITE:
4435		kn->kn_fop = &vfswrite_filtops;
4436		break;
4437	case EVFILT_VNODE:
4438		kn->kn_fop = &vfsvnode_filtops;
4439		break;
4440	default:
4441		return (EINVAL);
4442	}
4443
4444	kn->kn_hook = (caddr_t)vp;
4445
4446	v_addpollinfo(vp);
4447	if (vp->v_pollinfo == NULL)
4448		return (ENOMEM);
4449	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
4450	knlist_add(knl, kn, 0);
4451
4452	return (0);
4453}
4454
4455/*
4456 * Detach knote from vnode
4457 */
4458static void
4459filt_vfsdetach(struct knote *kn)
4460{
4461	struct vnode *vp = (struct vnode *)kn->kn_hook;
4462
4463	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
4464	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
4465}
4466
4467/*ARGSUSED*/
4468static int
4469filt_vfsread(struct knote *kn, long hint)
4470{
4471	struct vnode *vp = (struct vnode *)kn->kn_hook;
4472	struct vattr va;
4473	int res;
4474
4475	/*
4476	 * filesystem is gone, so set the EOF flag and schedule
4477	 * the knote for deletion.
4478	 */
4479	if (hint == NOTE_REVOKE) {
4480		VI_LOCK(vp);
4481		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4482		VI_UNLOCK(vp);
4483		return (1);
4484	}
4485
4486	if (VOP_GETATTR(vp, &va, curthread->td_ucred))
4487		return (0);
4488
4489	VI_LOCK(vp);
4490	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
4491	res = (kn->kn_data != 0);
4492	VI_UNLOCK(vp);
4493	return (res);
4494}
4495
4496/*ARGSUSED*/
4497static int
4498filt_vfswrite(struct knote *kn, long hint)
4499{
4500	struct vnode *vp = (struct vnode *)kn->kn_hook;
4501
4502	VI_LOCK(vp);
4503
4504	/*
4505	 * filesystem is gone, so set the EOF flag and schedule
4506	 * the knote for deletion.
4507	 */
4508	if (hint == NOTE_REVOKE)
4509		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4510
4511	kn->kn_data = 0;
4512	VI_UNLOCK(vp);
4513	return (1);
4514}
4515
4516static int
4517filt_vfsvnode(struct knote *kn, long hint)
4518{
4519	struct vnode *vp = (struct vnode *)kn->kn_hook;
4520	int res;
4521
4522	VI_LOCK(vp);
4523	if (kn->kn_sfflags & hint)
4524		kn->kn_fflags |= hint;
4525	if (hint == NOTE_REVOKE) {
4526		kn->kn_flags |= EV_EOF;
4527		VI_UNLOCK(vp);
4528		return (1);
4529	}
4530	res = (kn->kn_fflags != 0);
4531	VI_UNLOCK(vp);
4532	return (res);
4533}
4534
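/*
 * Editorial sketch (not part of the original file): the userland view of
 * vfs_kqfilter() and the filters above is EVFILT_VNODE.  The NOTE_* bits
 * requested here are the same ones posted by the vop_*_post() hooks
 * earlier in this file.  Minimal standalone program.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>

int
main(int argc, char **argv)
{
	struct kevent ev;
	int fd, kq;

	if (argc != 2)
		errx(1, "usage: watch <file>");
	if ((fd = open(argv[1], O_RDONLY)) == -1)
		err(1, "open");
	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	EV_SET(&ev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
	    NOTE_WRITE | NOTE_ATTRIB | NOTE_RENAME | NOTE_DELETE, 0, NULL);
	if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent register");
	if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)
		err(1, "kevent wait");
	printf("vnode event fflags 0x%x\n", (unsigned)ev.fflags);
	return (0);
}
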
4535int
4536vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
4537{
4538	int error;
4539
4540	if (dp->d_reclen > ap->a_uio->uio_resid)
4541		return (ENAMETOOLONG);
4542	error = uiomove(dp, dp->d_reclen, ap->a_uio);
4543	if (error) {
4544		if (ap->a_ncookies != NULL) {
4545			if (ap->a_cookies != NULL)
4546				free(ap->a_cookies, M_TEMP);
4547			ap->a_cookies = NULL;
4548			*ap->a_ncookies = 0;
4549		}
4550		return (error);
4551	}
4552	if (ap->a_ncookies == NULL)
4553		return (0);
4554
4555	KASSERT(ap->a_cookies,
4556	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
4557
4558	*ap->a_cookies = realloc(*ap->a_cookies,
4559	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
4560	(*ap->a_cookies)[*ap->a_ncookies] = off;
4561	return (0);
4562}
4563
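/*
 * Editorial sketch (not part of the original file): how a VOP_READDIR()
 * implementation can emit one directory entry through vfs_read_dirent(),
 * which copies it to the caller and records the seek cookie when NFS-style
 * cookies were requested.  examplefs_emit_dirent() and its arguments are
 * hypothetical; struct dirent and GENERIC_DIRSIZ() come from sys/dirent.h.
 */
static int
examplefs_emit_dirent(struct vop_readdir_args *ap, ino_t fileno,
    uint8_t dtype, const char *name, off_t next_off)
{
	struct dirent entry;

	bzero(&entry, sizeof(entry));
	entry.d_fileno = fileno;
	entry.d_type = dtype;			/* DT_REG, DT_DIR, ... */
	entry.d_namlen = strlen(name);
	strlcpy(entry.d_name, name, sizeof(entry.d_name));
	entry.d_reclen = GENERIC_DIRSIZ(&entry);

	/* ENAMETOOLONG here means the uio is full; stop without error. */
	return (vfs_read_dirent(ap, &entry, next_off));
}
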
4564/*
4565 * Mark the file's access time for update if the filesystem
4566 * supports VOP_MARKATIME.  This functionality is used by execve and
4567 * mmap, so we want to avoid the I/O implied by directly setting
4568 * va_atime for the sake of efficiency.
4569 */
4570void
4571vfs_mark_atime(struct vnode *vp, struct ucred *cred)
4572{
4573	struct mount *mp;
4574
4575	mp = vp->v_mount;
4576	ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
4577	if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
4578		(void)VOP_MARKATIME(vp);
4579}
4580
4581/*
4582 * The purpose of this routine is to remove granularity from accmode_t,
4583 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
4584 * VADMIN and VAPPEND.
4585 *
4586 * If it returns 0, the caller is supposed to continue with the usual
4587 * access checks using 'accmode' as modified by this routine.  If it
4588 * returns a nonzero value, the caller is supposed to return that value
4589 * as errno.
4590 *
4591 * Note that after this routine runs, accmode may be zero.
4592 */
4593int
4594vfs_unixify_accmode(accmode_t *accmode)
4595{
4596	/*
4597	 * There is no way to specify an explicit "deny" rule using
4598	 * file mode or POSIX.1e ACLs.
4599	 */
4600	if (*accmode & VEXPLICIT_DENY) {
4601		*accmode = 0;
4602		return (0);
4603	}
4604
4605	/*
4606	 * None of these can be translated into usual access bits.
4607	 * Also, the common case for NFSv4 ACLs is to not contain
4608	 * either of these bits. Caller should check for VWRITE
4609	 * on the containing directory instead.
4610	 */
4611	if (*accmode & (VDELETE_CHILD | VDELETE))
4612		return (EPERM);
4613
4614	if (*accmode & VADMIN_PERMS) {
4615		*accmode &= ~VADMIN_PERMS;
4616		*accmode |= VADMIN;
4617	}
4618
4619	/*
4620	 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
4621	 * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
4622	 */
4623	*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
4624
4625	return (0);
4626}
4627
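/*
 * Editorial sketch (not part of the original file): vfs_unixify_accmode()
 * is meant to sit in front of a conventional access check when a
 * filesystem without NFSv4 ACL support handles VOP_ACCESSX().  The shape
 * below mirrors the generic fallback; examplefs_accessx() itself is
 * hypothetical.
 */
static int
examplefs_accessx(struct vop_accessx_args *ap)
{
	accmode_t accmode = ap->a_accmode;
	int error;

	error = vfs_unixify_accmode(&accmode);
	if (error != 0)
		return (error);
	if (accmode == 0)
		return (0);	/* nothing left to check */
	return (VOP_ACCESS(ap->a_vp, accmode, ap->a_cred, ap->a_td));
}
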
4628/*
4629 * These are helper functions for filesystems to traverse all
4630 * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
4631 *
4632 * This interface replaces MNT_VNODE_FOREACH.
4633 */
4634
4635MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
4636
4637struct vnode *
4638__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
4639{
4640	struct vnode *vp;
4641
4642	if (should_yield())
4643		kern_yield(PRI_USER);
4644	MNT_ILOCK(mp);
4645	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4646	vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
4647	while (vp != NULL && (vp->v_type == VMARKER ||
4648	    (vp->v_iflag & VI_DOOMED) != 0))
4649		vp = TAILQ_NEXT(vp, v_nmntvnodes);
4650
4651	/* Check if we are done */
4652	if (vp == NULL) {
4653		__mnt_vnode_markerfree_all(mvp, mp);
4654		/* MNT_IUNLOCK(mp); -- done in above function */
4655		mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
4656		return (NULL);
4657	}
4658	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4659	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4660	VI_LOCK(vp);
4661	MNT_IUNLOCK(mp);
4662	return (vp);
4663}
4664
4665struct vnode *
4666__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
4667{
4668	struct vnode *vp;
4669
4670	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
4671	MNT_ILOCK(mp);
4672	MNT_REF(mp);
4673	(*mvp)->v_type = VMARKER;
4674
4675	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
4676	while (vp != NULL && (vp->v_type == VMARKER ||
4677	    (vp->v_iflag & VI_DOOMED) != 0))
4678		vp = TAILQ_NEXT(vp, v_nmntvnodes);
4679
4680	/* Check if we are done */
4681	if (vp == NULL) {
4682		MNT_REL(mp);
4683		MNT_IUNLOCK(mp);
4684		free(*mvp, M_VNODE_MARKER);
4685		*mvp = NULL;
4686		return (NULL);
4687	}
4688	(*mvp)->v_mount = mp;
4689	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4690	VI_LOCK(vp);
4691	MNT_IUNLOCK(mp);
4692	return (vp);
4693}
4694
4695
4696void
4697__mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
4698{
4699
4700	if (*mvp == NULL) {
4701		MNT_IUNLOCK(mp);
4702		return;
4703	}
4704
4705	mtx_assert(MNT_MTX(mp), MA_OWNED);
4706
4707	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4708	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4709	MNT_REL(mp);
4710	MNT_IUNLOCK(mp);
4711	free(*mvp, M_VNODE_MARKER);
4712	*mvp = NULL;
4713}
4714
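/*
 * Editorial sketch (not part of the original file): the intended way to
 * drive the iterator above is the MNT_VNODE_FOREACH_ALL() macro from
 * sys/mount.h, which returns each vnode with its interlock held and
 * cleans up the marker when the loop finishes.  examplefs_visit_all() is
 * hypothetical.
 */
static void
examplefs_visit_all(struct mount *mp)
{
	struct vnode *vp, *mvp;

	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
		if (vp->v_type == VNON) {
			VI_UNLOCK(vp);
			continue;
		}
		/*
		 * Real code would typically vget() the vnode here with
		 * LK_INTERLOCK (consuming the interlock) and do its work;
		 * this sketch just drops the interlock again.
		 */
		VI_UNLOCK(vp);
	}
}
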
4715/*
4716 * These are helper functions for filesystems to traverse their
4717 * active vnodes.  See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
4718 */
4719static void
4720mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
4721{
4722
4723	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4724
4725	MNT_ILOCK(mp);
4726	MNT_REL(mp);
4727	MNT_IUNLOCK(mp);
4728	free(*mvp, M_VNODE_MARKER);
4729	*mvp = NULL;
4730}
4731
4732static struct vnode *
4733mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
4734{
4735	struct vnode *vp, *nvp;
4736
4737	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
4738	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4739restart:
4740	vp = TAILQ_NEXT(*mvp, v_actfreelist);
4741	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
4742	while (vp != NULL) {
4743		if (vp->v_type == VMARKER) {
4744			vp = TAILQ_NEXT(vp, v_actfreelist);
4745			continue;
4746		}
4747		if (!VI_TRYLOCK(vp)) {
4748			if (mp_ncpus == 1 || should_yield()) {
4749				TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
4750				mtx_unlock(&vnode_free_list_mtx);
4751				kern_yield(PRI_USER);
4752				mtx_lock(&vnode_free_list_mtx);
4753				goto restart;
4754			}
4755			continue;
4756		}
4757		KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
4758		KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
4759		    ("alien vnode on the active list %p %p", vp, mp));
4760		if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0)
4761			break;
4762		nvp = TAILQ_NEXT(vp, v_actfreelist);
4763		VI_UNLOCK(vp);
4764		vp = nvp;
4765	}
4766
4767	/* Check if we are done */
4768	if (vp == NULL) {
4769		mtx_unlock(&vnode_free_list_mtx);
4770		mnt_vnode_markerfree_active(mvp, mp);
4771		return (NULL);
4772	}
4773	TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
4774	mtx_unlock(&vnode_free_list_mtx);
4775	ASSERT_VI_LOCKED(vp, "active iter");
4776	KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
4777	return (vp);
4778}
4779
4780struct vnode *
4781__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
4782{
4783
4784	if (should_yield())
4785		kern_yield(PRI_USER);
4786	mtx_lock(&vnode_free_list_mtx);
4787	return (mnt_vnode_next_active(mvp, mp));
4788}
4789
4790struct vnode *
4791__mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
4792{
4793	struct vnode *vp;
4794
4795	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
4796	MNT_ILOCK(mp);
4797	MNT_REF(mp);
4798	MNT_IUNLOCK(mp);
4799	(*mvp)->v_type = VMARKER;
4800	(*mvp)->v_mount = mp;
4801
4802	mtx_lock(&vnode_free_list_mtx);
4803	vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
4804	if (vp == NULL) {
4805		mtx_unlock(&vnode_free_list_mtx);
4806		mnt_vnode_markerfree_active(mvp, mp);
4807		return (NULL);
4808	}
4809	TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
4810	return (mnt_vnode_next_active(mvp, mp));
4811}
4812
4813void
4814__mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
4815{
4816
4817	if (*mvp == NULL)
4818		return;
4819
4820	mtx_lock(&vnode_free_list_mtx);
4821	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
4822	mtx_unlock(&vnode_free_list_mtx);
4823	mnt_vnode_markerfree_active(mvp, mp);
4824}
4825
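/*
 * Editorial sketch (not part of the original file): the active-vnode
 * iterator is normally driven through MNT_VNODE_FOREACH_ACTIVE() from
 * sys/mount.h, e.g. by sync-style code that only cares about vnodes that
 * may have dirty pages or buffers.  examplefs_visit_active() is
 * hypothetical.
 */
static void
examplefs_visit_active(struct mount *mp)
{
	struct vnode *vp, *mvp;

	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
		/* Returned with the interlock held; doomed vnodes skipped. */
		/* ... inspect the vnode, e.g. its v_bufobj state ... */
		VI_UNLOCK(vp);
	}
}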