vfs_subr.c revision 267362
1/*-
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
35 */
36
37/*
38 * External virtual filesystem routines
39 */
40
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD: head/sys/kern/vfs_subr.c 267362 2014-06-11 12:56:49Z mav $");
43
44#include "opt_compat.h"
45#include "opt_ddb.h"
46#include "opt_watchdog.h"
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/bio.h>
51#include <sys/buf.h>
52#include <sys/condvar.h>
53#include <sys/conf.h>
54#include <sys/dirent.h>
55#include <sys/event.h>
56#include <sys/eventhandler.h>
57#include <sys/extattr.h>
58#include <sys/file.h>
59#include <sys/fcntl.h>
60#include <sys/jail.h>
61#include <sys/kdb.h>
62#include <sys/kernel.h>
63#include <sys/kthread.h>
64#include <sys/lockf.h>
65#include <sys/malloc.h>
66#include <sys/mount.h>
67#include <sys/namei.h>
68#include <sys/pctrie.h>
69#include <sys/priv.h>
70#include <sys/reboot.h>
71#include <sys/rwlock.h>
72#include <sys/sched.h>
73#include <sys/sleepqueue.h>
74#include <sys/smp.h>
75#include <sys/stat.h>
76#include <sys/sysctl.h>
77#include <sys/syslog.h>
78#include <sys/vmmeter.h>
79#include <sys/vnode.h>
80#include <sys/watchdog.h>
81
82#include <machine/stdarg.h>
83
84#include <security/mac/mac_framework.h>
85
86#include <vm/vm.h>
87#include <vm/vm_object.h>
88#include <vm/vm_extern.h>
89#include <vm/pmap.h>
90#include <vm/vm_map.h>
91#include <vm/vm_page.h>
92#include <vm/vm_kern.h>
93#include <vm/uma.h>
94
95#ifdef DDB
96#include <ddb/ddb.h>
97#endif
98
99static void	delmntque(struct vnode *vp);
100static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
101		    int slpflag, int slptimeo);
102static void	syncer_shutdown(void *arg, int howto);
103static int	vtryrecycle(struct vnode *vp);
104static void	v_incr_usecount(struct vnode *);
105static void	v_decr_usecount(struct vnode *);
106static void	v_decr_useonly(struct vnode *);
107static void	v_upgrade_usecount(struct vnode *);
108static void	vnlru_free(int);
109static void	vgonel(struct vnode *);
110static void	vfs_knllock(void *arg);
111static void	vfs_knlunlock(void *arg);
112static void	vfs_knl_assert_locked(void *arg);
113static void	vfs_knl_assert_unlocked(void *arg);
114static void	destroy_vpollinfo(struct vpollinfo *vi);
115
116/*
117 * Number of vnodes in existence.  Increased whenever getnewvnode()
118 * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode.
119 */
120static unsigned long	numvnodes;
121
122SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
123    "Number of vnodes in existence");
124
125/*
126 * Conversion tables for conversion from vnode types to inode formats
127 * and back.
128 */
129enum vtype iftovt_tab[16] = {
130	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
131	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
132};
133int vttoif_tab[10] = {
134	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
135	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
136};
137
138/*
139 * List of vnodes that are ready for recycling.
140 */
141static TAILQ_HEAD(freelst, vnode) vnode_free_list;
142
143/*
144 * Free vnode target.  Free vnodes may simply be files which have been stat'd
145 * but not read.  This is somewhat common, and a small cache of such files
146 * should be kept to avoid recreation costs.
147 */
148static u_long wantfreevnodes;
149SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
150/* Number of vnodes in the free list. */
151static u_long freevnodes;
152SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0,
153    "Number of vnodes in the free list");
154
155static int vlru_allow_cache_src;
156SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
157    &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");
158
159/*
160 * Various variables used for debugging the new implementation of
161 * reassignbuf().
162 * XXX these are probably of (very) limited utility now.
163 */
164static int reassignbufcalls;
165SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
166    "Number of calls to reassignbuf");
167
168/*
169 * Cache for the mount type id assigned to NFS.  This is used for
170 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
171 */
172int	nfs_mount_type = -1;
173
174/* To keep more than one thread at a time from running vfs_getnewfsid */
175static struct mtx mntid_mtx;
176
177/*
178 * Lock for any access to the following:
179 *	vnode_free_list
180 *	numvnodes
181 *	freevnodes
182 */
183static struct mtx vnode_free_list_mtx;
184
185/* Publicly exported FS */
186struct nfs_public nfs_pub;
187
188static uma_zone_t buf_trie_zone;
189
190/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
191static uma_zone_t vnode_zone;
192static uma_zone_t vnodepoll_zone;
193
194/*
195 * The workitem queue.
196 *
197 * It is useful to delay writes of file data and filesystem metadata
198 * for tens of seconds so that quickly created and deleted files need
199 * not waste disk bandwidth being created and removed. To realize this,
200 * we append vnodes to a "workitem" queue. When running with a soft
201 * updates implementation, most pending metadata dependencies should
202 * not wait for more than a few seconds. Thus, filesystem metadata is
203 * delayed only about half the time that file data is delayed.  Similarly,
204 * directory updates are more critical, so they are only delayed about a
205 * third of the time that file data is delayed. Thus, there are
206 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
207 * one each second (driven off the filesystem syncer process). The
208 * syncer_delayno variable indicates the next queue that is to be processed.
209 * Items that need to be processed soon are placed in this queue:
210 *
211 *	syncer_workitem_pending[syncer_delayno]
212 *
213 * A delay of fifteen seconds is done by placing the request fifteen
214 * entries later in the queue:
215 *
216 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
217 *
218 */
219static int syncer_delayno;
220static long syncer_mask;
221LIST_HEAD(synclist, bufobj);
222static struct synclist *syncer_workitem_pending;
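
/*
 * Editorial illustration (not part of the original source): how a delay in
 * seconds maps to a slot in syncer_workitem_pending[].  The expression
 * mirrors the one used by vn_syncer_add_to_worklist() below; syncer_mask is
 * the hash mask (hash size minus one) set up by hashinit() in vntblinit().
 */
#if 0	/* illustrative sketch only */
static int
syncer_slot_for_delay(int delay)
{

	/*
	 * A delay of, say, 15 seconds places the item 15 entries past the
	 * queue currently being processed; the mask wraps the index around
	 * the power-of-two array of queues.
	 */
	return ((syncer_delayno + delay) & syncer_mask);
}
#endif
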
223/*
224 * The sync_mtx protects:
225 *	bo->bo_synclist
226 *	sync_vnode_count
227 *	syncer_delayno
228 *	syncer_state
229 *	syncer_workitem_pending
230 *	syncer_worklist_len
231 *	rushjob
232 */
233static struct mtx sync_mtx;
234static struct cv sync_wakeup;
235
236#define SYNCER_MAXDELAY		32
237static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
238static int syncdelay = 30;		/* max time to delay syncing data */
239static int filedelay = 30;		/* time to delay syncing files */
240SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
241    "Time to delay syncing files (in seconds)");
242static int dirdelay = 29;		/* time to delay syncing directories */
243SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
244    "Time to delay syncing directories (in seconds)");
245static int metadelay = 28;		/* time to delay syncing metadata */
246SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
247    "Time to delay syncing metadata (in seconds)");
248static int rushjob;		/* number of slots to run ASAP */
249static int stat_rush_requests;	/* number of times I/O speeded up */
250SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
251    "Number of times I/O speeded up (rush requests)");
252
253/*
254 * When shutting down the syncer, run it at four times normal speed.
255 */
256#define SYNCER_SHUTDOWN_SPEEDUP		4
257static int sync_vnode_count;
258static int syncer_worklist_len;
259static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
260    syncer_state;
261
262/*
263 * Number of vnodes we want to exist at any one time.  This is mostly used
264 * to size hash tables in vnode-related code.  It is normally not used in
265 * getnewvnode(), as wantfreevnodes is normally nonzero.
266 *
267 * XXX desiredvnodes is historical cruft and should not exist.
268 */
269int desiredvnodes;
270SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
271    &desiredvnodes, 0, "Maximum number of vnodes");
272SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
273    &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
274static int vnlru_nowhere;
275SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
276    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
277
278/*
279 * Macros to control when a vnode is freed and recycled.  All require
280 * the vnode interlock.
281 */
282#define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
283#define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
284#define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)
285
286/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
287static int vnsz2log;
288
289/*
290 * Support for the bufobj clean & dirty pctrie.
291 */
292static void *
293buf_trie_alloc(struct pctrie *ptree)
294{
295
296	return uma_zalloc(buf_trie_zone, M_NOWAIT);
297}
298
299static void
300buf_trie_free(struct pctrie *ptree, void *node)
301{
302
303	uma_zfree(buf_trie_zone, node);
304}
305PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
306
307/*
308 * Initialize the vnode management data structures.
309 *
310 * Reevaluate the following cap on the number of vnodes after the physical
311 * memory size exceeds 512GB.  In the limit, as the physical memory size
312 * grows, the ratio of physical pages to vnodes approaches sixteen to one.
313 */
314#ifndef	MAXVNODES_MAX
315#define	MAXVNODES_MAX	(512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
316#endif
317static void
318vntblinit(void *dummy __unused)
319{
320	u_int i;
321	int physvnodes, virtvnodes;
322
323	/*
324	 * Desiredvnodes is a function of the physical memory size and the
325	 * kernel's heap size.  Generally speaking, it scales with the
326	 * physical memory size.  The ratio of desiredvnodes to physical pages
327	 * is one to four until desiredvnodes exceeds 98,304.  Thereafter, the
328	 * marginal ratio of desiredvnodes to physical pages is one to
329	 * sixteen.  However, desiredvnodes is limited by the kernel's heap
330	 * size.  The memory required by desiredvnodes vnodes and vm objects
331	 * may not exceed one seventh of the kernel's heap size.
332	 */
333	physvnodes = maxproc + vm_cnt.v_page_count / 16 + 3 * min(98304 * 4,
334	    vm_cnt.v_page_count) / 16;
335	virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
336	    sizeof(struct vnode)));
337	desiredvnodes = min(physvnodes, virtvnodes);
338	if (desiredvnodes > MAXVNODES_MAX) {
339		if (bootverbose)
340			printf("Reducing kern.maxvnodes %d -> %d\n",
341			    desiredvnodes, MAXVNODES_MAX);
342		desiredvnodes = MAXVNODES_MAX;
343	}
344	wantfreevnodes = desiredvnodes / 4;
345	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
346	TAILQ_INIT(&vnode_free_list);
347	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
348	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
349	    NULL, NULL, UMA_ALIGN_PTR, 0);
350	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
351	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
352	/*
353	 * Preallocate enough nodes to support one per buf so that
354	 * we cannot fail an insert.  reassignbuf() callers cannot
355	 * tolerate insertion failure.
356	 */
357	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
358	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
359	    UMA_ZONE_NOFREE | UMA_ZONE_VM);
360	uma_prealloc(buf_trie_zone, nbuf);
361	/*
362	 * Initialize the filesystem syncer.
363	 */
364	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
365	    &syncer_mask);
366	syncer_maxdelay = syncer_mask + 1;
367	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
368	cv_init(&sync_wakeup, "syncer");
369	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
370		vnsz2log++;
371	vnsz2log--;
372}
373SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
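
/*
 * Editorial worked example (added for clarity, not in the original source):
 * the loop at the end of vntblinit() leaves vnsz2log equal to
 * floor(log2(sizeof(struct vnode))).  If, for instance, sizeof(struct vnode)
 * were 512 bytes, the loop would run ten times and the final decrement would
 * leave vnsz2log == 9; getnewvnode() then sets
 * vp->v_hash = (uintptr_t)vp >> 9, discarding the low-order address bits
 * that do not distinguish one vnode allocation from the next.
 */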
374
375
376/*
377 * Mark a mount point as busy. Used to synchronize access and to delay
378 * unmounting.  Note that mountlist_mtx is not released on failure.
379 *
380 * vfs_busy() is a custom lock; it can block the caller.
381 * vfs_busy() only sleeps if an unmount is in progress on the mount point.
382 * For a mountpoint mp, the vfs_busy-enforced lock is ordered before the
383 * lock of any vnode belonging to mp.
384 *
385 * Lookup uses vfs_busy() to traverse mount points.
386 * root fs			var fs
387 * / vnode lock		A	/ vnode lock (/var)		D
388 * /var vnode lock	B	/log vnode lock(/var/log)	E
389 * vfs_busy lock	C	vfs_busy lock			F
390 *
391 * Within each file system, the lock order is C->A->B and F->D->E.
392 *
393 * When traversing across mounts, the system follows that lock order:
394 *
395 *        C->A->B
396 *              |
397 *              +->F->D->E
398 *
399 * The lookup() process for namei("/var") illustrates the order:
400 *  VOP_LOOKUP() obtains B while A is held
401 *  vfs_busy() obtains a shared lock on F while A and B are held
402 *  vput() releases lock on B
403 *  vput() releases lock on A
404 *  VFS_ROOT() obtains lock on D while shared lock on F is held
405 *  vfs_unbusy() releases shared lock on F
406 *  vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
407 *    An attempt to lock A (instead of vp_crossmp) while D is held would
408 *    violate the global order, causing deadlocks.
409 *
410 * dounmount() locks B while F is drained.
411 */
412int
413vfs_busy(struct mount *mp, int flags)
414{
415
416	MPASS((flags & ~MBF_MASK) == 0);
417	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
418
419	MNT_ILOCK(mp);
420	MNT_REF(mp);
421	/*
422	 * If the mount point is currently being unmounted, sleep until the
423	 * mount point's fate is decided.  If the thread doing the unmounting fails,
424	 * it will clear the MNTK_UNMOUNT flag before waking us up, indicating
425	 * that this mount point has survived the unmount attempt and vfs_busy
426	 * should retry.  Otherwise the unmounting thread will set the MNTK_REFEXPIRE
427	 * flag in addition to MNTK_UNMOUNT, indicating that the mount point is
428	 * about to be really destroyed.  vfs_busy needs to release its
429	 * reference on the mount point in this case and return with ENOENT,
430	 * telling the caller that the mount point it tried to busy is no longer
431	 * valid.
432	 */
433	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
434		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
435			MNT_REL(mp);
436			MNT_IUNLOCK(mp);
437			CTR1(KTR_VFS, "%s: failed busying before sleeping",
438			    __func__);
439			return (ENOENT);
440		}
441		if (flags & MBF_MNTLSTLOCK)
442			mtx_unlock(&mountlist_mtx);
443		mp->mnt_kern_flag |= MNTK_MWAIT;
444		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
445		if (flags & MBF_MNTLSTLOCK)
446			mtx_lock(&mountlist_mtx);
447		MNT_ILOCK(mp);
448	}
449	if (flags & MBF_MNTLSTLOCK)
450		mtx_unlock(&mountlist_mtx);
451	mp->mnt_lockref++;
452	MNT_IUNLOCK(mp);
453	return (0);
454}
455
456/*
457 * Free a busy filesystem.
458 */
459void
460vfs_unbusy(struct mount *mp)
461{
462
463	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
464	MNT_ILOCK(mp);
465	MNT_REL(mp);
466	KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
467	mp->mnt_lockref--;
468	if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
469		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
470		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
471		mp->mnt_kern_flag &= ~MNTK_DRAINING;
472		wakeup(&mp->mnt_lockref);
473	}
474	MNT_IUNLOCK(mp);
475}
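
/*
 * Editorial sketch (not part of the original source): a typical pattern for
 * walking the mount list with vfs_busy()/vfs_unbusy(), modelled on
 * vnlru_proc() below.  With MBF_MNTLSTLOCK, vfs_busy() drops mountlist_mtx
 * on success; with MBF_NOWAIT it fails immediately for mounts that are being
 * unmounted, leaving the mutex held.  process_mount() is a hypothetical
 * per-mount callback.
 */
#if 0	/* illustrative sketch only; process_mount() is hypothetical */
static void
foreach_mount_sketch(void)
{
	struct mount *mp, *nmp;

	mtx_lock(&mountlist_mtx);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
			/* Unmount in progress; mountlist_mtx is still held. */
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		/* mountlist_mtx was dropped; the busy count keeps mp alive. */
		process_mount(mp);
		mtx_lock(&mountlist_mtx);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	mtx_unlock(&mountlist_mtx);
}
#endif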
476
477/*
478 * Lookup a mount point by filesystem identifier.
479 */
480struct mount *
481vfs_getvfs(fsid_t *fsid)
482{
483	struct mount *mp;
484
485	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
486	mtx_lock(&mountlist_mtx);
487	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
488		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
489		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
490			vfs_ref(mp);
491			mtx_unlock(&mountlist_mtx);
492			return (mp);
493		}
494	}
495	mtx_unlock(&mountlist_mtx);
496	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
497	return ((struct mount *) 0);
498}
499
500/*
501 * Lookup a mount point by filesystem identifier, busying it before
502 * returning.
503 */
504struct mount *
505vfs_busyfs(fsid_t *fsid)
506{
507	struct mount *mp;
508	int error;
509
510	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
511	mtx_lock(&mountlist_mtx);
512	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
513		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
514		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
515			error = vfs_busy(mp, MBF_MNTLSTLOCK);
516			if (error) {
517				mtx_unlock(&mountlist_mtx);
518				return (NULL);
519			}
520			return (mp);
521		}
522	}
523	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
524	mtx_unlock(&mountlist_mtx);
525	return ((struct mount *) 0);
526}
527
528/*
529 * Check if a user can access privileged mount options.
530 */
531int
532vfs_suser(struct mount *mp, struct thread *td)
533{
534	int error;
535
536	/*
537	 * If the thread is jailed, but this is not a jail-friendly file
538	 * system, deny immediately.
539	 */
540	if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
541		return (EPERM);
542
543	/*
544	 * If the file system was mounted outside the jail of the calling
545	 * thread, deny immediately.
546	 */
547	if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
548		return (EPERM);
549
550	/*
551	 * If file system supports delegated administration, we don't check
552	 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
553	 * by the file system itself.
554	 * If this is not the user that did original mount, we check for
555	 * the PRIV_VFS_MOUNT_OWNER privilege.
556	 */
557	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
558	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
559		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
560			return (error);
561	}
562	return (0);
563}
564
565/*
566 * Get a new unique fsid.  Try to make its val[0] unique, since this value
567 * will be used to create fake device numbers for stat().  Also try (but
568 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
569 * support 16-bit device numbers.  We end up with unique val[0]'s for the
570 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
571 *
572 * Keep in mind that several mounts may be running in parallel.  Starting
573 * the search one past where the previous search terminated is both a
574 * micro-optimization and a defense against returning the same fsid to
575 * different mounts.
576 */
577void
578vfs_getnewfsid(struct mount *mp)
579{
580	static uint16_t mntid_base;
581	struct mount *nmp;
582	fsid_t tfsid;
583	int mtype;
584
585	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
586	mtx_lock(&mntid_mtx);
587	mtype = mp->mnt_vfc->vfc_typenum;
588	tfsid.val[1] = mtype;
589	mtype = (mtype & 0xFF) << 24;
590	for (;;) {
591		tfsid.val[0] = makedev(255,
592		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
593		mntid_base++;
594		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
595			break;
596		vfs_rel(nmp);
597	}
598	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
599	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
600	mtx_unlock(&mntid_mtx);
601}
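
/*
 * Editorial worked example (added for clarity, not in the original source):
 * for a filesystem type with vfc_typenum 5, the n-th fsid handed out has
 * val[1] == 5 and val[0] == makedev(255, minor), where the minor argument
 * packs the low byte of the type into bits 24-31, the high byte of the
 * 16-bit mntid counter into bits 16-23 and its low byte into bits 0-7,
 * leaving bits 8-15 zero.  This is what keeps val[0] unique mod 2^16 for
 * the first 2^8 calls, as described above.
 */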
602
603/*
604 * Knob to control the precision of file timestamps:
605 *
606 *   0 = seconds only; nanoseconds zeroed.
607 *   1 = seconds and nanoseconds, accurate within 1/HZ.
608 *   2 = seconds and nanoseconds, truncated to microseconds.
609 * >=3 = seconds and nanoseconds, maximum precision.
610 */
611enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
612
613static int timestamp_precision = TSP_SEC;
614SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
615    &timestamp_precision, 0, "File timestamp precision (0: seconds, "
616    "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to ms, "
617    "3+: sec + ns (max. precision))");
618
619/*
620 * Get a current timestamp.
621 */
622void
623vfs_timestamp(struct timespec *tsp)
624{
625	struct timeval tv;
626
627	switch (timestamp_precision) {
628	case TSP_SEC:
629		tsp->tv_sec = time_second;
630		tsp->tv_nsec = 0;
631		break;
632	case TSP_HZ:
633		getnanotime(tsp);
634		break;
635	case TSP_USEC:
636		microtime(&tv);
637		TIMEVAL_TO_TIMESPEC(&tv, tsp);
638		break;
639	case TSP_NSEC:
640	default:
641		nanotime(tsp);
642		break;
643	}
644}
645
646/*
647 * Set vnode attributes to VNOVAL
648 */
649void
650vattr_null(struct vattr *vap)
651{
652
653	vap->va_type = VNON;
654	vap->va_size = VNOVAL;
655	vap->va_bytes = VNOVAL;
656	vap->va_mode = VNOVAL;
657	vap->va_nlink = VNOVAL;
658	vap->va_uid = VNOVAL;
659	vap->va_gid = VNOVAL;
660	vap->va_fsid = VNOVAL;
661	vap->va_fileid = VNOVAL;
662	vap->va_blocksize = VNOVAL;
663	vap->va_rdev = VNOVAL;
664	vap->va_atime.tv_sec = VNOVAL;
665	vap->va_atime.tv_nsec = VNOVAL;
666	vap->va_mtime.tv_sec = VNOVAL;
667	vap->va_mtime.tv_nsec = VNOVAL;
668	vap->va_ctime.tv_sec = VNOVAL;
669	vap->va_ctime.tv_nsec = VNOVAL;
670	vap->va_birthtime.tv_sec = VNOVAL;
671	vap->va_birthtime.tv_nsec = VNOVAL;
672	vap->va_flags = VNOVAL;
673	vap->va_gen = VNOVAL;
674	vap->va_vaflags = 0;
675}
676
677/*
678 * This routine is called when we have too many vnodes.  It attempts
679 * to reclaim a portion of a mount point's vnodes, including ones that still
680 * have VM backing store (VM backing store is typically the cause
681 * of a vnode blowout so we want to do this).  Therefore, this operation
682 * is not considered cheap.
683 *
684 * A number of conditions may prevent a vnode from being reclaimed:
685 * the buffer cache may have references on the vnode, a directory
686 * vnode may still have references due to the namei cache representing
687 * underlying files, or the vnode may be in active use.  It is not
688 * desirable to reuse such vnodes.  These conditions may cause the
689 * number of vnodes to reach some minimum value regardless of what
690 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
691 */
692static int
693vlrureclaim(struct mount *mp)
694{
695	struct vnode *vp;
696	int done;
697	int trigger;
698	int usevnodes;
699	int count;
700
701	/*
702	 * Calculate the trigger point, don't allow user
703	 * screwups to blow us up.   This prevents us from
704	 * recycling vnodes with lots of resident pages.  We
705	 * aren't trying to free memory, we are trying to
706	 * free vnodes.
707	 */
708	usevnodes = desiredvnodes;
709	if (usevnodes <= 0)
710		usevnodes = 1;
711	trigger = vm_cnt.v_page_count * 2 / usevnodes;
712	done = 0;
713	vn_start_write(NULL, &mp, V_WAIT);
714	MNT_ILOCK(mp);
715	count = mp->mnt_nvnodelistsize / 10 + 1;
716	while (count != 0) {
717		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
718		while (vp != NULL && vp->v_type == VMARKER)
719			vp = TAILQ_NEXT(vp, v_nmntvnodes);
720		if (vp == NULL)
721			break;
722		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
723		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
724		--count;
725		if (!VI_TRYLOCK(vp))
726			goto next_iter;
727		/*
728		 * If it's been deconstructed already, it's still
729		 * referenced, or it exceeds the trigger, skip it.
730		 */
731		if (vp->v_usecount ||
732		    (!vlru_allow_cache_src &&
733			!LIST_EMPTY(&(vp)->v_cache_src)) ||
734		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
735		    vp->v_object->resident_page_count > trigger)) {
736			VI_UNLOCK(vp);
737			goto next_iter;
738		}
739		MNT_IUNLOCK(mp);
740		vholdl(vp);
741		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
742			vdrop(vp);
743			goto next_iter_mntunlocked;
744		}
745		VI_LOCK(vp);
746		/*
747		 * v_usecount may have been bumped after VOP_LOCK() dropped
748		 * the vnode interlock and before it was locked again.
749		 *
750		 * It is not necessary to recheck VI_DOOMED because it can
751		 * only be set by another thread that holds both the vnode
752		 * lock and vnode interlock.  If another thread has the
753		 * vnode lock before we get to VOP_LOCK() and obtains the
754		 * vnode interlock after VOP_LOCK() drops the vnode
755		 * interlock, the other thread will be unable to drop the
756		 * vnode lock before our VOP_LOCK() call fails.
757		 */
758		if (vp->v_usecount ||
759		    (!vlru_allow_cache_src &&
760			!LIST_EMPTY(&(vp)->v_cache_src)) ||
761		    (vp->v_object != NULL &&
762		    vp->v_object->resident_page_count > trigger)) {
763			VOP_UNLOCK(vp, LK_INTERLOCK);
764			vdrop(vp);
765			goto next_iter_mntunlocked;
766		}
767		KASSERT((vp->v_iflag & VI_DOOMED) == 0,
768		    ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
769		vgonel(vp);
770		VOP_UNLOCK(vp, 0);
771		vdropl(vp);
772		done++;
773next_iter_mntunlocked:
774		if (!should_yield())
775			goto relock_mnt;
776		goto yield;
777next_iter:
778		if (!should_yield())
779			continue;
780		MNT_IUNLOCK(mp);
781yield:
782		kern_yield(PRI_USER);
783relock_mnt:
784		MNT_ILOCK(mp);
785	}
786	MNT_IUNLOCK(mp);
787	vn_finished_write(mp);
788	return done;
789}
790
791/*
792 * Attempt to keep the free list at wantfreevnodes length.
793 */
794static void
795vnlru_free(int count)
796{
797	struct vnode *vp;
798
799	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
800	for (; count > 0; count--) {
801		vp = TAILQ_FIRST(&vnode_free_list);
802		/*
803		 * The list can be modified while the free_list_mtx
804		 * has been dropped and vp could be NULL here.
805		 */
806		if (!vp)
807			break;
808		VNASSERT(vp->v_op != NULL, vp,
809		    ("vnlru_free: vnode already reclaimed."));
810		KASSERT((vp->v_iflag & VI_FREE) != 0,
811		    ("Removing vnode not on freelist"));
812		KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
813		    ("Mangling active vnode"));
814		TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
815		/*
816		 * Don't recycle if we can't get the interlock.
817		 */
818		if (!VI_TRYLOCK(vp)) {
819			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
820			continue;
821		}
822		VNASSERT(VCANRECYCLE(vp), vp,
823		    ("vp inconsistent on freelist"));
824		freevnodes--;
825		vp->v_iflag &= ~VI_FREE;
826		vholdl(vp);
827		mtx_unlock(&vnode_free_list_mtx);
828		VI_UNLOCK(vp);
829		vtryrecycle(vp);
830		/*
831		 * If the recycle succeeded, this vdrop will actually free
832		 * the vnode.  If not, it will simply place it back on
833		 * the free list.
834		 */
835		vdrop(vp);
836		mtx_lock(&vnode_free_list_mtx);
837	}
838}
839/*
840 * Attempt to recycle vnodes in a context that is always safe to block.
841 * Calling vlrureclaim() from the bowels of filesystem code has some
842 * interesting deadlock problems.
843 */
844static struct proc *vnlruproc;
845static int vnlruproc_sig;
846
847static void
848vnlru_proc(void)
849{
850	struct mount *mp, *nmp;
851	int done;
852	struct proc *p = vnlruproc;
853
854	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
855	    SHUTDOWN_PRI_FIRST);
856
857	for (;;) {
858		kproc_suspend_check(p);
859		mtx_lock(&vnode_free_list_mtx);
860		if (freevnodes > wantfreevnodes)
861			vnlru_free(freevnodes - wantfreevnodes);
862		if (numvnodes <= desiredvnodes * 9 / 10) {
863			vnlruproc_sig = 0;
864			wakeup(&vnlruproc_sig);
865			msleep(vnlruproc, &vnode_free_list_mtx,
866			    PVFS|PDROP, "vlruwt", hz);
867			continue;
868		}
869		mtx_unlock(&vnode_free_list_mtx);
870		done = 0;
871		mtx_lock(&mountlist_mtx);
872		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
873			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
874				nmp = TAILQ_NEXT(mp, mnt_list);
875				continue;
876			}
877			done += vlrureclaim(mp);
878			mtx_lock(&mountlist_mtx);
879			nmp = TAILQ_NEXT(mp, mnt_list);
880			vfs_unbusy(mp);
881		}
882		mtx_unlock(&mountlist_mtx);
883		if (done == 0) {
884#if 0
885			/* These messages are temporary debugging aids */
886			if (vnlru_nowhere < 5)
887				printf("vnlru process getting nowhere..\n");
888			else if (vnlru_nowhere == 5)
889				printf("vnlru process messages stopped.\n");
890#endif
891			vnlru_nowhere++;
892			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
893		} else
894			kern_yield(PRI_USER);
895	}
896}
897
898static struct kproc_desc vnlru_kp = {
899	"vnlru",
900	vnlru_proc,
901	&vnlruproc
902};
903SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
904    &vnlru_kp);
905
906/*
907 * Routines having to do with the management of the vnode table.
908 */
909
910/*
911 * Try to recycle a freed vnode.  We abort if anyone picks up a reference
912 * before we actually vgone().  This function must be called with the vnode
913 * held to prevent the vnode from being returned to the free list midway
914 * through vgone().
915 */
916static int
917vtryrecycle(struct vnode *vp)
918{
919	struct mount *vnmp;
920
921	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
922	VNASSERT(vp->v_holdcnt, vp,
923	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
924	/*
925	 * This vnode may be found and locked via some other list; if so, we
926	 * can't recycle it yet.
927	 */
928	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
929		CTR2(KTR_VFS,
930		    "%s: impossible to recycle, vp %p lock is already held",
931		    __func__, vp);
932		return (EWOULDBLOCK);
933	}
934	/*
935	 * Don't recycle if its filesystem is being suspended.
936	 */
937	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
938		VOP_UNLOCK(vp, 0);
939		CTR2(KTR_VFS,
940		    "%s: impossible to recycle, cannot start the write for %p",
941		    __func__, vp);
942		return (EBUSY);
943	}
944	/*
945	 * If we got this far, we need to acquire the interlock and see if
946	 * anyone picked up this vnode from another list.  If not, we will
947	 * mark it with DOOMED via vgonel() so that anyone who does find it
948	 * will skip over it.
949	 */
950	VI_LOCK(vp);
951	if (vp->v_usecount) {
952		VOP_UNLOCK(vp, LK_INTERLOCK);
953		vn_finished_write(vnmp);
954		CTR2(KTR_VFS,
955		    "%s: impossible to recycle, %p is already referenced",
956		    __func__, vp);
957		return (EBUSY);
958	}
959	if ((vp->v_iflag & VI_DOOMED) == 0)
960		vgonel(vp);
961	VOP_UNLOCK(vp, LK_INTERLOCK);
962	vn_finished_write(vnmp);
963	return (0);
964}
965
966/*
967 * Wait for available vnodes.
968 */
969static int
970getnewvnode_wait(int suspended)
971{
972
973	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
974	if (numvnodes > desiredvnodes) {
975		if (suspended) {
976			/*
977			 * The file system is being suspended; we cannot risk a
978			 * deadlock here, so allocate a new vnode anyway.
979			 */
980			if (freevnodes > wantfreevnodes)
981				vnlru_free(freevnodes - wantfreevnodes);
982			return (0);
983		}
984		if (vnlruproc_sig == 0) {
985			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
986			wakeup(vnlruproc);
987		}
988		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
989		    "vlruwk", hz);
990	}
991	return (numvnodes > desiredvnodes ? ENFILE : 0);
992}
993
994void
995getnewvnode_reserve(u_int count)
996{
997	struct thread *td;
998
999	td = curthread;
1000	/* First try to be quick and racy. */
1001	if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
1002		td->td_vp_reserv += count;
1003		return;
1004	} else
1005		atomic_subtract_long(&numvnodes, count);
1006
1007	mtx_lock(&vnode_free_list_mtx);
1008	while (count > 0) {
1009		if (getnewvnode_wait(0) == 0) {
1010			count--;
1011			td->td_vp_reserv++;
1012			atomic_add_long(&numvnodes, 1);
1013		}
1014	}
1015	mtx_unlock(&vnode_free_list_mtx);
1016}
1017
1018void
1019getnewvnode_drop_reserve(void)
1020{
1021	struct thread *td;
1022
1023	td = curthread;
1024	atomic_subtract_long(&numvnodes, td->td_vp_reserv);
1025	td->td_vp_reserv = 0;
1026}
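
/*
 * Editorial sketch (not part of the original source): how a caller might
 * bracket an operation that creates several vnodes with the reservation
 * interface above, so that the getnewvnode() calls inside never sleep in
 * getnewvnode_wait().  create_three_vnodes() is a hypothetical stand-in for
 * filesystem-specific code.
 */
#if 0	/* illustrative sketch only; create_three_vnodes() is hypothetical */
static void
reserve_sketch(struct mount *mp)
{

	getnewvnode_reserve(3);		/* charge numvnodes up front */
	create_three_vnodes(mp);	/* getnewvnode() draws on the reserve */
	getnewvnode_drop_reserve();	/* return whatever was not used */
}
#endif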
1027
1028/*
1029 * Return the next vnode from the free list.
1030 */
1031int
1032getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
1033    struct vnode **vpp)
1034{
1035	struct vnode *vp;
1036	struct bufobj *bo;
1037	struct thread *td;
1038	int error;
1039
1040	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
1041	vp = NULL;
1042	td = curthread;
1043	if (td->td_vp_reserv > 0) {
1044		td->td_vp_reserv -= 1;
1045		goto alloc;
1046	}
1047	mtx_lock(&vnode_free_list_mtx);
1048	/*
1049	 * Lend our context to reclaim vnodes if they've exceeded the max.
1050	 */
1051	if (freevnodes > wantfreevnodes)
1052		vnlru_free(1);
1053	error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
1054	    MNTK_SUSPEND));
1055#if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
1056	if (error != 0) {
1057		mtx_unlock(&vnode_free_list_mtx);
1058		return (error);
1059	}
1060#endif
1061	atomic_add_long(&numvnodes, 1);
1062	mtx_unlock(&vnode_free_list_mtx);
1063alloc:
1064	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
1065	/*
1066	 * Setup locks.
1067	 */
1068	vp->v_vnlock = &vp->v_lock;
1069	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
1070	/*
1071	 * By default, don't allow shared locks unless filesystems
1072	 * opt-in.
1073	 */
1074	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE);
1075	/*
1076	 * Initialize bufobj.
1077	 */
1078	bo = &vp->v_bufobj;
1079	bo->__bo_vnode = vp;
1080	rw_init(BO_LOCKPTR(bo), "bufobj interlock");
1081	bo->bo_ops = &buf_ops_bio;
1082	bo->bo_private = vp;
1083	TAILQ_INIT(&bo->bo_clean.bv_hd);
1084	TAILQ_INIT(&bo->bo_dirty.bv_hd);
1085	/*
1086	 * Initialize namecache.
1087	 */
1088	LIST_INIT(&vp->v_cache_src);
1089	TAILQ_INIT(&vp->v_cache_dst);
1090	/*
1091	 * Finalize various vnode identity bits.
1092	 */
1093	vp->v_type = VNON;
1094	vp->v_tag = tag;
1095	vp->v_op = vops;
1096	v_incr_usecount(vp);
1097	vp->v_data = NULL;
1098#ifdef MAC
1099	mac_vnode_init(vp);
1100	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
1101		mac_vnode_associate_singlelabel(mp, vp);
1102	else if (mp == NULL && vops != &dead_vnodeops)
1103		printf("NULL mp in getnewvnode()\n");
1104#endif
1105	if (mp != NULL) {
1106		bo->bo_bsize = mp->mnt_stat.f_iosize;
1107		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
1108			vp->v_vflag |= VV_NOKNOTE;
1109	}
1110	rangelock_init(&vp->v_rl);
1111
1112	/*
1113	 * For the filesystems which do not use vfs_hash_insert(),
1114	 * still initialize v_hash so that vfs_hash_index() remains useful.
1115	 * E.g., nullfs uses vfs_hash_index() on the lower vnode for
1116	 * its own hashing.
1117	 */
1118	vp->v_hash = (uintptr_t)vp >> vnsz2log;
1119
1120	*vpp = vp;
1121	return (0);
1122}
1123
1124/*
1125 * Delete from old mount point vnode list, if on one.
1126 */
1127static void
1128delmntque(struct vnode *vp)
1129{
1130	struct mount *mp;
1131	int active;
1132
1133	mp = vp->v_mount;
1134	if (mp == NULL)
1135		return;
1136	MNT_ILOCK(mp);
1137	VI_LOCK(vp);
1138	KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
1139	    ("Active vnode list size %d > Vnode list size %d",
1140	     mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
1141	active = vp->v_iflag & VI_ACTIVE;
1142	vp->v_iflag &= ~VI_ACTIVE;
1143	if (active) {
1144		mtx_lock(&vnode_free_list_mtx);
1145		TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
1146		mp->mnt_activevnodelistsize--;
1147		mtx_unlock(&vnode_free_list_mtx);
1148	}
1149	vp->v_mount = NULL;
1150	VI_UNLOCK(vp);
1151	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
1152		("bad mount point vnode list size"));
1153	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1154	mp->mnt_nvnodelistsize--;
1155	MNT_REL(mp);
1156	MNT_IUNLOCK(mp);
1157}
1158
1159static void
1160insmntque_stddtr(struct vnode *vp, void *dtr_arg)
1161{
1162
1163	vp->v_data = NULL;
1164	vp->v_op = &dead_vnodeops;
1165	vgone(vp);
1166	vput(vp);
1167}
1168
1169/*
1170 * Insert into list of vnodes for the new mount point, if available.
1171 */
1172int
1173insmntque1(struct vnode *vp, struct mount *mp,
1174	void (*dtr)(struct vnode *, void *), void *dtr_arg)
1175{
1176
1177	KASSERT(vp->v_mount == NULL,
1178		("insmntque: vnode already on per mount vnode list"));
1179	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
1180	ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
1181
1182	/*
1183	 * We acquire the vnode interlock early to ensure that the
1184	 * vnode cannot be recycled by another process releasing a
1185	 * holdcnt on it before we get it on both the vnode list
1186	 * and the active vnode list. The mount mutex protects only
1187	 * manipulation of the vnode list and the vnode freelist
1188	 * mutex protects only manipulation of the active vnode list.
1189	 * Hence the need to hold the vnode interlock throughout.
1190	 */
1191	MNT_ILOCK(mp);
1192	VI_LOCK(vp);
1193	if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
1194	    ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
1195	    mp->mnt_nvnodelistsize == 0)) &&
1196	    (vp->v_vflag & VV_FORCEINSMQ) == 0) {
1197		VI_UNLOCK(vp);
1198		MNT_IUNLOCK(mp);
1199		if (dtr != NULL)
1200			dtr(vp, dtr_arg);
1201		return (EBUSY);
1202	}
1203	vp->v_mount = mp;
1204	MNT_REF(mp);
1205	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1206	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
1207		("neg mount point vnode list size"));
1208	mp->mnt_nvnodelistsize++;
1209	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
1210	    ("Activating already active vnode"));
1211	vp->v_iflag |= VI_ACTIVE;
1212	mtx_lock(&vnode_free_list_mtx);
1213	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
1214	mp->mnt_activevnodelistsize++;
1215	mtx_unlock(&vnode_free_list_mtx);
1216	VI_UNLOCK(vp);
1217	MNT_IUNLOCK(mp);
1218	return (0);
1219}
1220
1221int
1222insmntque(struct vnode *vp, struct mount *mp)
1223{
1224
1225	return (insmntque1(vp, mp, insmntque_stddtr, NULL));
1226}
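
/*
 * Editorial sketch (not part of the original source): the usual shape of a
 * filesystem vnode-creation path built on getnewvnode() and insmntque().
 * The myfs_* names are hypothetical.  Note that the vnode must be
 * exclusively locked before insmntque(), and that on failure insmntque()
 * disposes of the vnode via insmntque_stddtr() (vgone() + vput()).
 */
#if 0	/* illustrative sketch only; myfs_* names are hypothetical */
static int
myfs_new_vnode(struct mount *mp, struct vnode **vpp)
{
	struct vnode *vp;
	int error;

	error = getnewvnode("myfs", mp, &myfs_vnodeops, &vp);
	if (error != 0)
		return (error);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = insmntque(vp, mp);
	if (error != 0)
		return (error);	/* vp already cleaned up and unlocked */
	vp->v_data = myfs_alloc_node(vp);
	*vpp = vp;
	return (0);
}
#endif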
1227
1228/*
1229 * Flush out and invalidate all buffers associated with a bufobj
1230 * Called with the underlying object locked.
1231 */
1232int
1233bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
1234{
1235	int error;
1236
1237	BO_LOCK(bo);
1238	if (flags & V_SAVE) {
1239		error = bufobj_wwait(bo, slpflag, slptimeo);
1240		if (error) {
1241			BO_UNLOCK(bo);
1242			return (error);
1243		}
1244		if (bo->bo_dirty.bv_cnt > 0) {
1245			BO_UNLOCK(bo);
1246			if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
1247				return (error);
1248			/*
1249			 * XXX We could save a lock/unlock if this was only
1250			 * enabled under INVARIANTS
1251			 */
1252			BO_LOCK(bo);
1253			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
1254				panic("vinvalbuf: dirty bufs");
1255		}
1256	}
1257	/*
1258	 * If you alter this loop please notice that interlock is dropped and
1259	 * reacquired in flushbuflist.  Special care is needed to ensure that
1260	 * no race conditions occur from this.
1261	 */
1262	do {
1263		error = flushbuflist(&bo->bo_clean,
1264		    flags, bo, slpflag, slptimeo);
1265		if (error == 0 && !(flags & V_CLEANONLY))
1266			error = flushbuflist(&bo->bo_dirty,
1267			    flags, bo, slpflag, slptimeo);
1268		if (error != 0 && error != EAGAIN) {
1269			BO_UNLOCK(bo);
1270			return (error);
1271		}
1272	} while (error != 0);
1273
1274	/*
1275	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
1276	 * have write I/O in-progress but if there is a VM object then the
1277	 * VM object can also have read-I/O in-progress.
1278	 */
1279	do {
1280		bufobj_wwait(bo, 0, 0);
1281		BO_UNLOCK(bo);
1282		if (bo->bo_object != NULL) {
1283			VM_OBJECT_WLOCK(bo->bo_object);
1284			vm_object_pip_wait(bo->bo_object, "bovlbx");
1285			VM_OBJECT_WUNLOCK(bo->bo_object);
1286		}
1287		BO_LOCK(bo);
1288	} while (bo->bo_numoutput > 0);
1289	BO_UNLOCK(bo);
1290
1291	/*
1292	 * Destroy the copy in the VM cache, too.
1293	 */
1294	if (bo->bo_object != NULL &&
1295	    (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) {
1296		VM_OBJECT_WLOCK(bo->bo_object);
1297		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
1298		    OBJPR_CLEANONLY : 0);
1299		VM_OBJECT_WUNLOCK(bo->bo_object);
1300	}
1301
1302#ifdef INVARIANTS
1303	BO_LOCK(bo);
1304	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 &&
1305	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
1306		panic("vinvalbuf: flush failed");
1307	BO_UNLOCK(bo);
1308#endif
1309	return (0);
1310}
1311
1312/*
1313 * Flush out and invalidate all buffers associated with a vnode.
1314 * Called with the underlying object locked.
1315 */
1316int
1317vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
1318{
1319
1320	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
1321	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
1322	if (vp->v_object != NULL && vp->v_object->handle != vp)
1323		return (0);
1324	return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
1325}
1326
1327/*
1328 * Flush out buffers on the specified list.
1329 *
1330 */
1331static int
1332flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
1333    int slptimeo)
1334{
1335	struct buf *bp, *nbp;
1336	int retval, error;
1337	daddr_t lblkno;
1338	b_xflags_t xflags;
1339
1340	ASSERT_BO_WLOCKED(bo);
1341
1342	retval = 0;
1343	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
1344		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1345		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1346			continue;
1347		}
1348		lblkno = 0;
1349		xflags = 0;
1350		if (nbp != NULL) {
1351			lblkno = nbp->b_lblkno;
1352			xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
1353		}
1354		retval = EAGAIN;
1355		error = BUF_TIMELOCK(bp,
1356		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
1357		    "flushbuf", slpflag, slptimeo);
1358		if (error) {
1359			BO_LOCK(bo);
1360			return (error != ENOLCK ? error : EAGAIN);
1361		}
1362		KASSERT(bp->b_bufobj == bo,
1363		    ("bp %p wrong b_bufobj %p should be %p",
1364		    bp, bp->b_bufobj, bo));
1365		if (bp->b_bufobj != bo) {	/* XXX: necessary ? */
1366			BUF_UNLOCK(bp);
1367			BO_LOCK(bo);
1368			return (EAGAIN);
1369		}
1370		/*
1371		 * XXX Since there are no node locks for NFS, I
1372		 * believe there is a slight chance that a delayed
1373		 * write will occur while sleeping just above, so
1374		 * check for it.
1375		 */
1376		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1377		    (flags & V_SAVE)) {
1378			bremfree(bp);
1379			bp->b_flags |= B_ASYNC;
1380			bwrite(bp);
1381			BO_LOCK(bo);
1382			return (EAGAIN);	/* XXX: why not loop ? */
1383		}
1384		bremfree(bp);
1385		bp->b_flags |= (B_INVAL | B_RELBUF);
1386		bp->b_flags &= ~B_ASYNC;
1387		brelse(bp);
1388		BO_LOCK(bo);
1389		if (nbp != NULL &&
1390		    (nbp->b_bufobj != bo ||
1391		     nbp->b_lblkno != lblkno ||
1392		     (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags))
1393			break;			/* nbp invalid */
1394	}
1395	return (retval);
1396}
1397
1398/*
1399 * Truncate a file's buffer and pages to a specified length.  This
1400 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1401 * sync activity.
1402 */
1403int
1404vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize)
1405{
1406	struct buf *bp, *nbp;
1407	int anyfreed;
1408	int trunclbn;
1409	struct bufobj *bo;
1410
1411	CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__,
1412	    vp, cred, blksize, (uintmax_t)length);
1413
1414	/*
1415	 * Round up to the *next* lbn.
1416	 */
1417	trunclbn = (length + blksize - 1) / blksize;
1418
1419	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1420restart:
1421	bo = &vp->v_bufobj;
1422	BO_LOCK(bo);
1423	anyfreed = 1;
1424	for (;anyfreed;) {
1425		anyfreed = 0;
1426		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
1427			if (bp->b_lblkno < trunclbn)
1428				continue;
1429			if (BUF_LOCK(bp,
1430			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1431			    BO_LOCKPTR(bo)) == ENOLCK)
1432				goto restart;
1433
1434			bremfree(bp);
1435			bp->b_flags |= (B_INVAL | B_RELBUF);
1436			bp->b_flags &= ~B_ASYNC;
1437			brelse(bp);
1438			anyfreed = 1;
1439
1440			BO_LOCK(bo);
1441			if (nbp != NULL &&
1442			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1443			    (nbp->b_vp != vp) ||
1444			    (nbp->b_flags & B_DELWRI))) {
1445				BO_UNLOCK(bo);
1446				goto restart;
1447			}
1448		}
1449
1450		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1451			if (bp->b_lblkno < trunclbn)
1452				continue;
1453			if (BUF_LOCK(bp,
1454			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1455			    BO_LOCKPTR(bo)) == ENOLCK)
1456				goto restart;
1457			bremfree(bp);
1458			bp->b_flags |= (B_INVAL | B_RELBUF);
1459			bp->b_flags &= ~B_ASYNC;
1460			brelse(bp);
1461			anyfreed = 1;
1462
1463			BO_LOCK(bo);
1464			if (nbp != NULL &&
1465			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1466			    (nbp->b_vp != vp) ||
1467			    (nbp->b_flags & B_DELWRI) == 0)) {
1468				BO_UNLOCK(bo);
1469				goto restart;
1470			}
1471		}
1472	}
1473
1474	if (length > 0) {
1475restartsync:
1476		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1477			if (bp->b_lblkno > 0)
1478				continue;
1479			/*
1480			 * Since we hold the vnode lock this should only
1481			 * fail if we're racing with the buf daemon.
1482			 */
1483			if (BUF_LOCK(bp,
1484			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1485			    BO_LOCKPTR(bo)) == ENOLCK) {
1486				goto restart;
1487			}
1488			VNASSERT((bp->b_flags & B_DELWRI), vp,
1489			    ("buf(%p) on dirty queue without DELWRI", bp));
1490
1491			bremfree(bp);
1492			bawrite(bp);
1493			BO_LOCK(bo);
1494			goto restartsync;
1495		}
1496	}
1497
1498	bufobj_wwait(bo, 0, 0);
1499	BO_UNLOCK(bo);
1500	vnode_pager_setsize(vp, length);
1501
1502	return (0);
1503}
1504
1505static void
1506buf_vlist_remove(struct buf *bp)
1507{
1508	struct bufv *bv;
1509
1510	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1511	ASSERT_BO_WLOCKED(bp->b_bufobj);
1512	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
1513	    (BX_VNDIRTY|BX_VNCLEAN),
1514	    ("buf_vlist_remove: Buf %p is on two lists", bp));
1515	if (bp->b_xflags & BX_VNDIRTY)
1516		bv = &bp->b_bufobj->bo_dirty;
1517	else
1518		bv = &bp->b_bufobj->bo_clean;
1519	BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
1520	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
1521	bv->bv_cnt--;
1522	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1523}
1524
1525/*
1526 * Add the buffer to the sorted clean or dirty block list.
1527 *
1528 * NOTE: xflags is passed as a constant, optimizing this inline function!
1529 */
1530static void
1531buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
1532{
1533	struct bufv *bv;
1534	struct buf *n;
1535	int error;
1536
1537	ASSERT_BO_WLOCKED(bo);
1538	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1539	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
1540	bp->b_xflags |= xflags;
1541	if (xflags & BX_VNDIRTY)
1542		bv = &bo->bo_dirty;
1543	else
1544		bv = &bo->bo_clean;
1545
1546	/*
1547	 * Keep the list ordered.  Optimize empty list insertion.  Assume
1548	 * we tend to grow at the tail so lookup_le should usually be cheaper
1549	 * than _ge.
1550	 */
1551	if (bv->bv_cnt == 0 ||
1552	    bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
1553		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
1554	else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
1555		TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
1556	else
1557		TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
1558	error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
1559	if (error)
1560		panic("buf_vlist_add:  Preallocated nodes insufficient.");
1561	bv->bv_cnt++;
1562}
1563
1564/*
1565 * Lookup a buffer using the buffer tries of the given bufobj.  Note that
1566 * we specifically avoid shadow buffers used in background bitmap writes.
1567 *
1568 * This code isn't quite as efficient as it could be because we are
1569 * maintaining two sorted lists (the clean and the dirty trie) and do not
1570 * know which list the block resides in, so both may have to be searched.
1571 */
1576struct buf *
1577gbincore(struct bufobj *bo, daddr_t lblkno)
1578{
1579	struct buf *bp;
1580
1581	ASSERT_BO_LOCKED(bo);
1582	bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
1583	if (bp != NULL)
1584		return (bp);
1585	return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno);
1586}
1587
1588/*
1589 * Associate a buffer with a vnode.
1590 */
1591void
1592bgetvp(struct vnode *vp, struct buf *bp)
1593{
1594	struct bufobj *bo;
1595
1596	bo = &vp->v_bufobj;
1597	ASSERT_BO_WLOCKED(bo);
1598	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
1599
1600	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
1601	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
1602	    ("bgetvp: bp already attached! %p", bp));
1603
1604	vhold(vp);
1605	bp->b_vp = vp;
1606	bp->b_bufobj = bo;
1607	/*
1608	 * Insert onto list for new vnode.
1609	 */
1610	buf_vlist_add(bp, bo, BX_VNCLEAN);
1611}
1612
1613/*
1614 * Disassociate a buffer from a vnode.
1615 */
1616void
1617brelvp(struct buf *bp)
1618{
1619	struct bufobj *bo;
1620	struct vnode *vp;
1621
1622	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1623	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1624
1625	/*
1626	 * Delete from old vnode list, if on one.
1627	 */
1628	vp = bp->b_vp;		/* XXX */
1629	bo = bp->b_bufobj;
1630	BO_LOCK(bo);
1631	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1632		buf_vlist_remove(bp);
1633	else
1634		panic("brelvp: Buffer %p not on queue.", bp);
1635	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1636		bo->bo_flag &= ~BO_ONWORKLST;
1637		mtx_lock(&sync_mtx);
1638		LIST_REMOVE(bo, bo_synclist);
1639		syncer_worklist_len--;
1640		mtx_unlock(&sync_mtx);
1641	}
1642	bp->b_vp = NULL;
1643	bp->b_bufobj = NULL;
1644	BO_UNLOCK(bo);
1645	vdrop(vp);
1646}
1647
1648/*
1649 * Add an item to the syncer work queue.
1650 */
1651static void
1652vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
1653{
1654	int slot;
1655
1656	ASSERT_BO_WLOCKED(bo);
1657
1658	mtx_lock(&sync_mtx);
1659	if (bo->bo_flag & BO_ONWORKLST)
1660		LIST_REMOVE(bo, bo_synclist);
1661	else {
1662		bo->bo_flag |= BO_ONWORKLST;
1663		syncer_worklist_len++;
1664	}
1665
1666	if (delay > syncer_maxdelay - 2)
1667		delay = syncer_maxdelay - 2;
1668	slot = (syncer_delayno + delay) & syncer_mask;
1669
1670	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
1671	mtx_unlock(&sync_mtx);
1672}
1673
1674static int
1675sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
1676{
1677	int error, len;
1678
1679	mtx_lock(&sync_mtx);
1680	len = syncer_worklist_len - sync_vnode_count;
1681	mtx_unlock(&sync_mtx);
1682	error = SYSCTL_OUT(req, &len, sizeof(len));
1683	return (error);
1684}
1685
1686SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
1687    sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
1688
1689static struct proc *updateproc;
1690static void sched_sync(void);
1691static struct kproc_desc up_kp = {
1692	"syncer",
1693	sched_sync,
1694	&updateproc
1695};
1696SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
1697
1698static int
1699sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
1700{
1701	struct vnode *vp;
1702	struct mount *mp;
1703
1704	*bo = LIST_FIRST(slp);
1705	if (*bo == NULL)
1706		return (0);
1707	vp = (*bo)->__bo_vnode;	/* XXX */
1708	if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
1709		return (1);
1710	/*
1711	 * We use vhold in case the vnode does not
1712	 * successfully sync.  vhold prevents the vnode from
1713	 * going away when we unlock the sync_mtx so that
1714	 * we can acquire the vnode interlock.
1715	 */
1716	vholdl(vp);
1717	mtx_unlock(&sync_mtx);
1718	VI_UNLOCK(vp);
1719	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1720		vdrop(vp);
1721		mtx_lock(&sync_mtx);
1722		return (*bo == LIST_FIRST(slp));
1723	}
1724	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1725	(void) VOP_FSYNC(vp, MNT_LAZY, td);
1726	VOP_UNLOCK(vp, 0);
1727	vn_finished_write(mp);
1728	BO_LOCK(*bo);
1729	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
1730		/*
1731		 * Put us back on the worklist.  The worklist
1732		 * routine will remove us from our current
1733		 * position and then add us back in at a later
1734		 * position.
1735		 */
1736		vn_syncer_add_to_worklist(*bo, syncdelay);
1737	}
1738	BO_UNLOCK(*bo);
1739	vdrop(vp);
1740	mtx_lock(&sync_mtx);
1741	return (0);
1742}
1743
1744/*
1745 * System filesystem synchronizer daemon.
1746 */
1747static void
1748sched_sync(void)
1749{
1750	struct synclist *next, *slp;
1751	struct bufobj *bo;
1752	long starttime;
1753	struct thread *td = curthread;
1754	int last_work_seen;
1755	int net_worklist_len;
1756	int syncer_final_iter;
1757	int first_printf;
1758	int error;
1759
1760	last_work_seen = 0;
1761	syncer_final_iter = 0;
1762	first_printf = 1;
1763	syncer_state = SYNCER_RUNNING;
1764	starttime = time_uptime;
1765	td->td_pflags |= TDP_NORUNNINGBUF;
1766
1767	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
1768	    SHUTDOWN_PRI_LAST);
1769
1770	mtx_lock(&sync_mtx);
1771	for (;;) {
1772		if (syncer_state == SYNCER_FINAL_DELAY &&
1773		    syncer_final_iter == 0) {
1774			mtx_unlock(&sync_mtx);
1775			kproc_suspend_check(td->td_proc);
1776			mtx_lock(&sync_mtx);
1777		}
1778		net_worklist_len = syncer_worklist_len - sync_vnode_count;
1779		if (syncer_state != SYNCER_RUNNING &&
1780		    starttime != time_uptime) {
1781			if (first_printf) {
1782				printf("\nSyncing disks, vnodes remaining...");
1783				first_printf = 0;
1784			}
1785			printf("%d ", net_worklist_len);
1786		}
1787		starttime = time_uptime;
1788
1789		/*
1790		 * Push files whose dirty time has expired.  Be careful
1791		 * of interrupt race on slp queue.
1792		 *
1793		 * Skip over empty worklist slots when shutting down.
1794		 */
1795		do {
1796			slp = &syncer_workitem_pending[syncer_delayno];
1797			syncer_delayno += 1;
1798			if (syncer_delayno == syncer_maxdelay)
1799				syncer_delayno = 0;
1800			next = &syncer_workitem_pending[syncer_delayno];
1801			/*
1802			 * If the worklist has wrapped since it
1803			 * was emptied of all but syncer vnodes,
1804			 * switch to the FINAL_DELAY state and run
1805			 * for one more second.
1806			 */
1807			if (syncer_state == SYNCER_SHUTTING_DOWN &&
1808			    net_worklist_len == 0 &&
1809			    last_work_seen == syncer_delayno) {
1810				syncer_state = SYNCER_FINAL_DELAY;
1811				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
1812			}
1813		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
1814		    syncer_worklist_len > 0);
1815
1816		/*
1817		 * Keep track of the last time there was anything
1818		 * on the worklist other than syncer vnodes.
1819		 * Return to the SHUTTING_DOWN state if any
1820		 * new work appears.
1821		 */
1822		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
1823			last_work_seen = syncer_delayno;
1824		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
1825			syncer_state = SYNCER_SHUTTING_DOWN;
1826		while (!LIST_EMPTY(slp)) {
1827			error = sync_vnode(slp, &bo, td);
1828			if (error == 1) {
1829				LIST_REMOVE(bo, bo_synclist);
1830				LIST_INSERT_HEAD(next, bo, bo_synclist);
1831				continue;
1832			}
1833
1834			if (first_printf == 0)
1835				wdog_kern_pat(WD_LASTVAL);
1836
1837		}
1838		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
1839			syncer_final_iter--;
1840		/*
1841		 * The variable rushjob allows the kernel to speed up the
1842		 * processing of the filesystem syncer process. A rushjob
1843		 * value of N tells the filesystem syncer to process the next
1844		 * N seconds worth of work on its queue ASAP. Currently rushjob
1845		 * is used by the soft update code to speed up the filesystem
1846		 * syncer process when the incore state is getting so far
1847		 * ahead of the disk that the kernel memory pool is being
1848		 * threatened with exhaustion.
1849		 */
1850		if (rushjob > 0) {
1851			rushjob -= 1;
1852			continue;
1853		}
1854		/*
1855		 * Just sleep for a short period of time between
1856		 * iterations when shutting down to allow some I/O
1857		 * to happen.
1858		 *
1859		 * If it has taken us less than a second to process the
1860		 * current work, then wait. Otherwise start right over
1861		 * again. We can still lose time if any single round
1862		 * takes more than two seconds, but it does not really
1863		 * matter as we are just trying to generally pace the
1864		 * filesystem activity.
1865		 */
1866		if (syncer_state != SYNCER_RUNNING ||
1867		    time_uptime == starttime) {
1868			thread_lock(td);
1869			sched_prio(td, PPAUSE);
1870			thread_unlock(td);
1871		}
1872		if (syncer_state != SYNCER_RUNNING)
1873			cv_timedwait(&sync_wakeup, &sync_mtx,
1874			    hz / SYNCER_SHUTDOWN_SPEEDUP);
1875		else if (time_uptime == starttime)
1876			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
1877	}
1878}
1879
1880/*
1881 * Request the syncer daemon to speed up its work.
1882 * We never push it to speed up more than half of its
1883 * normal turn time; otherwise it could take over the CPU.
1884 */
1885int
1886speedup_syncer(void)
1887{
1888	int ret = 0;
1889
1890	mtx_lock(&sync_mtx);
1891	if (rushjob < syncdelay / 2) {
1892		rushjob += 1;
1893		stat_rush_requests += 1;
1894		ret = 1;
1895	}
1896	mtx_unlock(&sync_mtx);
1897	cv_broadcast(&sync_wakeup);
1898	return (ret);
1899}
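/*
 * Illustrative sketch, not part of the original source: a subsystem that
 * sees in-core dirty state building up can nudge the syncer through
 * speedup_syncer().  The predicate below is a hypothetical placeholder;
 * in the tree the soft updates code is the caller referred to in the
 * rushjob comment above.
 *
 *	if (dirty_state_above_watermark())
 *		(void)speedup_syncer();
 */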
1900
1901/*
1902 * Tell the syncer to speed up its work and run though its work
1903 * list several times, then tell it to shut down.
1904 */
1905static void
1906syncer_shutdown(void *arg, int howto)
1907{
1908
1909	if (howto & RB_NOSYNC)
1910		return;
1911	mtx_lock(&sync_mtx);
1912	syncer_state = SYNCER_SHUTTING_DOWN;
1913	rushjob = 0;
1914	mtx_unlock(&sync_mtx);
1915	cv_broadcast(&sync_wakeup);
1916	kproc_shutdown(arg, howto);
1917}
1918
1919/*
1920 * Reassign a buffer from one vnode to another.
1921 * Used to assign file specific control information
1922 * (indirect blocks) to the vnode to which they belong.
1923 */
1924void
1925reassignbuf(struct buf *bp)
1926{
1927	struct vnode *vp;
1928	struct bufobj *bo;
1929	int delay;
1930#ifdef INVARIANTS
1931	struct bufv *bv;
1932#endif
1933
1934	vp = bp->b_vp;
1935	bo = bp->b_bufobj;
1936	++reassignbufcalls;
1937
1938	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
1939	    bp, bp->b_vp, bp->b_flags);
1940	/*
1941	 * B_PAGING flagged buffers cannot be reassigned because their vp
1942	 * is not fully linked in.
1943	 */
1944	if (bp->b_flags & B_PAGING)
1945		panic("cannot reassign paging buffer");
1946
1947	/*
1948	 * Delete from old vnode list, if on one.
1949	 */
1950	BO_LOCK(bo);
1951	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1952		buf_vlist_remove(bp);
1953	else
1954		panic("reassignbuf: Buffer %p not on queue.", bp);
1955	/*
1956	 * If dirty, put on list of dirty buffers; otherwise insert onto list
1957	 * of clean buffers.
1958	 */
1959	if (bp->b_flags & B_DELWRI) {
1960		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
1961			switch (vp->v_type) {
1962			case VDIR:
1963				delay = dirdelay;
1964				break;
1965			case VCHR:
1966				delay = metadelay;
1967				break;
1968			default:
1969				delay = filedelay;
1970			}
1971			vn_syncer_add_to_worklist(bo, delay);
1972		}
1973		buf_vlist_add(bp, bo, BX_VNDIRTY);
1974	} else {
1975		buf_vlist_add(bp, bo, BX_VNCLEAN);
1976
1977		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1978			mtx_lock(&sync_mtx);
1979			LIST_REMOVE(bo, bo_synclist);
1980			syncer_worklist_len--;
1981			mtx_unlock(&sync_mtx);
1982			bo->bo_flag &= ~BO_ONWORKLST;
1983		}
1984	}
1985#ifdef INVARIANTS
1986	bv = &bo->bo_clean;
1987	bp = TAILQ_FIRST(&bv->bv_hd);
1988	KASSERT(bp == NULL || bp->b_bufobj == bo,
1989	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
1990	bp = TAILQ_LAST(&bv->bv_hd, buflists);
1991	KASSERT(bp == NULL || bp->b_bufobj == bo,
1992	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
1993	bv = &bo->bo_dirty;
1994	bp = TAILQ_FIRST(&bv->bv_hd);
1995	KASSERT(bp == NULL || bp->b_bufobj == bo,
1996	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
1997	bp = TAILQ_LAST(&bv->bv_hd, buflists);
1998	KASSERT(bp == NULL || bp->b_bufobj == bo,
1999	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2000#endif
2001	BO_UNLOCK(bo);
2002}
2003
2004/*
2005 * Increment the use and hold counts on the vnode, taking care to reference
2006 * the driver's usecount if this is a chardev.  The vholdl() will remove
2007 * the vnode from the free list if it is presently free.  Requires the
2008 * vnode interlock and returns with it held.
2009 */
2010static void
2011v_incr_usecount(struct vnode *vp)
2012{
2013
2014	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2015	vp->v_usecount++;
2016	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2017		dev_lock();
2018		vp->v_rdev->si_usecount++;
2019		dev_unlock();
2020	}
2021	vholdl(vp);
2022}
2023
2024/*
2025 * Turn a holdcnt into a use+holdcnt such that only one call to
2026 * v_decr_usecount is needed.
2027 */
2028static void
2029v_upgrade_usecount(struct vnode *vp)
2030{
2031
2032	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2033	vp->v_usecount++;
2034	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2035		dev_lock();
2036		vp->v_rdev->si_usecount++;
2037		dev_unlock();
2038	}
2039}
2040
2041/*
2042 * Decrement the vnode use and hold count along with the driver's usecount
2043 * if this is a chardev.  The vdropl() below releases the vnode interlock
2044 * as it may free the vnode.
2045 */
2046static void
2047v_decr_usecount(struct vnode *vp)
2048{
2049
2050	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2051	VNASSERT(vp->v_usecount > 0, vp,
2052	    ("v_decr_usecount: negative usecount"));
2053	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2054	vp->v_usecount--;
2055	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2056		dev_lock();
2057		vp->v_rdev->si_usecount--;
2058		dev_unlock();
2059	}
2060	vdropl(vp);
2061}
2062
2063/*
2064 * Decrement only the use count and driver use count.  This is intended to
2065 * be paired with a follow-on vdropl() to release the remaining hold count.
2066 * In this way we may vgone() a vnode with a 0 usecount without risk of
2067 * having it end up on a free list because the hold count is kept above 0.
2068 */
2069static void
2070v_decr_useonly(struct vnode *vp)
2071{
2072
2073	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2074	VNASSERT(vp->v_usecount > 0, vp,
2075	    ("v_decr_useonly: negative usecount"));
2076	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2077	vp->v_usecount--;
2078	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2079		dev_lock();
2080		vp->v_rdev->si_usecount--;
2081		dev_unlock();
2082	}
2083}
2084
2085/*
2086 * Grab a particular vnode from the free list, increment its
2087 * reference count and lock it.  VI_DOOMED is set if the vnode
2088 * is being destroyed.  Only callers who specify LK_RETRY will
2089 * see doomed vnodes.  If inactive processing was delayed in
2090 * vput try to do it here.
2091 */
2092int
2093vget(struct vnode *vp, int flags, struct thread *td)
2094{
2095	int error;
2096
2097	error = 0;
2098	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
2099	    ("vget: invalid lock operation"));
2100	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
2101
2102	if ((flags & LK_INTERLOCK) == 0)
2103		VI_LOCK(vp);
2104	vholdl(vp);
2105	if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) {
2106		vdrop(vp);
2107		CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
2108		    vp);
2109		return (error);
2110	}
2111	if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
2112		panic("vget: vn_lock failed to return ENOENT\n");
2113	VI_LOCK(vp);
2114	/* Upgrade our holdcnt to a usecount. */
2115	v_upgrade_usecount(vp);
2116	/*
2117	 * We don't guarantee that any particular close will
2118	 * trigger inactive processing so just make a best effort
2119	 * here at preventing a reference to a removed file.  If
2120	 * we don't succeed no harm is done.
2121	 */
2122	if (vp->v_iflag & VI_OWEINACT) {
2123		if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
2124		    (flags & LK_NOWAIT) == 0)
2125			vinactive(vp, td);
2126		vp->v_iflag &= ~VI_OWEINACT;
2127	}
2128	VI_UNLOCK(vp);
2129	return (0);
2130}
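/*
 * Illustrative sketch, not part of the original source: callers that
 * already hold the vnode interlock pass LK_INTERLOCK so that vget()
 * consumes it, and balance the reference with vput() once the lock is
 * no longer needed (vfs_msync() below uses this pattern).
 *
 *	VI_LOCK(vp);
 *	error = vget(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, curthread);
 *	if (error == 0) {
 *		... operate on the locked, referenced vnode ...
 *		vput(vp);
 *	}
 */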
2131
2132/*
2133 * Increase the reference count of a vnode.
2134 */
2135void
2136vref(struct vnode *vp)
2137{
2138
2139	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2140	VI_LOCK(vp);
2141	v_incr_usecount(vp);
2142	VI_UNLOCK(vp);
2143}
2144
2145/*
2146 * Return reference count of a vnode.
2147 *
2148 * The results of this call are only guaranteed when some mechanism other
2149 * than the VI lock is used to stop other processes from gaining references
2150 * to the vnode.  This may be the case if the caller holds the only reference.
2151 * This is also useful when stale data is acceptable as race conditions may
2152 * be accounted for by some other means.
2153 */
2154int
2155vrefcnt(struct vnode *vp)
2156{
2157	int usecnt;
2158
2159	VI_LOCK(vp);
2160	usecnt = vp->v_usecount;
2161	VI_UNLOCK(vp);
2162
2163	return (usecnt);
2164}
2165
2166#define	VPUTX_VRELE	1
2167#define	VPUTX_VPUT	2
2168#define	VPUTX_VUNREF	3
2169
2170static void
2171vputx(struct vnode *vp, int func)
2172{
2173	int error;
2174
2175	KASSERT(vp != NULL, ("vputx: null vp"));
2176	if (func == VPUTX_VUNREF)
2177		ASSERT_VOP_LOCKED(vp, "vunref");
2178	else if (func == VPUTX_VPUT)
2179		ASSERT_VOP_LOCKED(vp, "vput");
2180	else
2181		KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
2182	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2183	VI_LOCK(vp);
2184
2185	/* Skip this v_writecount check if we're going to panic below. */
2186	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
2187	    ("vputx: missed vn_close"));
2188	error = 0;
2189
2190	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2191	    vp->v_usecount == 1)) {
2192		if (func == VPUTX_VPUT)
2193			VOP_UNLOCK(vp, 0);
2194		v_decr_usecount(vp);
2195		return;
2196	}
2197
2198	if (vp->v_usecount != 1) {
2199		vprint("vputx: negative ref count", vp);
2200		panic("vputx: negative ref cnt");
2201	}
2202	CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp);
2203	/*
2204	 * We want to hold the vnode until the inactive finishes to
2205	 * prevent vgone() races.  We drop the use count here and the
2206	 * hold count below when we're done.
2207	 */
2208	v_decr_useonly(vp);
2209	/*
2210	 * We must call VOP_INACTIVE with the node locked. Mark
2211	 * as VI_DOINGINACT to avoid recursion.
2212	 */
2213	vp->v_iflag |= VI_OWEINACT;
2214	switch (func) {
2215	case VPUTX_VRELE:
2216		error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
2217		VI_LOCK(vp);
2218		break;
2219	case VPUTX_VPUT:
2220		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2221			error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
2222			    LK_NOWAIT);
2223			VI_LOCK(vp);
2224		}
2225		break;
2226	case VPUTX_VUNREF:
2227		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2228			error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
2229			VI_LOCK(vp);
2230		}
2231		break;
2232	}
2233	if (vp->v_usecount > 0)
2234		vp->v_iflag &= ~VI_OWEINACT;
2235	if (error == 0) {
2236		if (vp->v_iflag & VI_OWEINACT)
2237			vinactive(vp, curthread);
2238		if (func != VPUTX_VUNREF)
2239			VOP_UNLOCK(vp, 0);
2240	}
2241	vdropl(vp);
2242}
2243
2244/*
2245 * Vnode put/release.
2246 * If count drops to zero, call inactive routine and return to freelist.
2247 */
2248void
2249vrele(struct vnode *vp)
2250{
2251
2252	vputx(vp, VPUTX_VRELE);
2253}
2254
2255/*
2256 * Release an already locked vnode.  This gives the same effects as
2257 * unlock+vrele(), but takes less time and avoids releasing and
2258 * re-acquiring the lock (as vrele() acquires the lock internally).
2259 */
2260void
2261vput(struct vnode *vp)
2262{
2263
2264	vputx(vp, VPUTX_VPUT);
2265}
2266
2267/*
2268 * Release an exclusively locked vnode. Do not unlock the vnode lock.
2269 */
2270void
2271vunref(struct vnode *vp)
2272{
2273
2274	vputx(vp, VPUTX_VUNREF);
2275}
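/*
 * Illustrative sketch, not part of the original source: the three release
 * functions above differ only in their locking contract.  For a vnode
 * obtained with vget(vp, LK_EXCLUSIVE | LK_RETRY, curthread), either of
 * the following drops the reference; vput() also drops the lock, while
 * vunref() leaves it to the caller.
 *
 *	vput(vp);			(unlock and release)
 *
 *	vunref(vp);			(release, keep the lock)
 *	VOP_UNLOCK(vp, 0);
 */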
2276
2277/*
2278 * Somebody doesn't want the vnode recycled.
2279 */
2280void
2281vhold(struct vnode *vp)
2282{
2283
2284	VI_LOCK(vp);
2285	vholdl(vp);
2286	VI_UNLOCK(vp);
2287}
2288
2289/*
2290 * Increase the hold count and activate if this is the first reference.
2291 */
2292void
2293vholdl(struct vnode *vp)
2294{
2295	struct mount *mp;
2296
2297	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2298	vp->v_holdcnt++;
2299	if (!VSHOULDBUSY(vp))
2300		return;
2301	ASSERT_VI_LOCKED(vp, "vholdl");
2302	VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free"));
2303	VNASSERT(vp->v_op != NULL, vp, ("vholdl: vnode already reclaimed."));
2304	/*
2305	 * Remove a vnode from the free list, mark it as in use,
2306	 * and put it on the active list.
2307	 */
2308	mtx_lock(&vnode_free_list_mtx);
2309	TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
2310	freevnodes--;
2311	vp->v_iflag &= ~(VI_FREE|VI_AGE);
2312	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
2313	    ("Activating already active vnode"));
2314	vp->v_iflag |= VI_ACTIVE;
2315	mp = vp->v_mount;
2316	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
2317	mp->mnt_activevnodelistsize++;
2318	mtx_unlock(&vnode_free_list_mtx);
2319}
2320
2321/*
2322 * Note that there is one less who cares about this vnode.
2323 * vdrop() is the opposite of vhold().
2324 */
2325void
2326vdrop(struct vnode *vp)
2327{
2328
2329	VI_LOCK(vp);
2330	vdropl(vp);
2331}
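/*
 * Illustrative sketch, not part of the original source: vhold()/vdrop()
 * bracket regions where the caller must keep the vnode from being
 * recycled while it drops its own locks, as sync_vnode() does above:
 *
 *	vhold(vp);
 *	... release locks and possibly sleep; vp cannot go away ...
 *	vdrop(vp);
 */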
2332
2333/*
2334 * Drop the hold count of the vnode.  If this is the last reference to
2335 * the vnode we place it on the free list unless it has been vgone'd
2336 * (marked VI_DOOMED) in which case we will free it.
2337 */
2338void
2339vdropl(struct vnode *vp)
2340{
2341	struct bufobj *bo;
2342	struct mount *mp;
2343	int active;
2344
2345	ASSERT_VI_LOCKED(vp, "vdropl");
2346	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2347	if (vp->v_holdcnt <= 0)
2348		panic("vdrop: holdcnt %d", vp->v_holdcnt);
2349	vp->v_holdcnt--;
2350	VNASSERT(vp->v_holdcnt >= vp->v_usecount, vp,
2351	    ("hold count less than use count"));
2352	if (vp->v_holdcnt > 0) {
2353		VI_UNLOCK(vp);
2354		return;
2355	}
2356	if ((vp->v_iflag & VI_DOOMED) == 0) {
2357		/*
2358		 * Mark a vnode as free: remove it from its active list
2359		 * and put it up for recycling on the freelist.
2360		 */
2361		VNASSERT(vp->v_op != NULL, vp,
2362		    ("vdropl: vnode already reclaimed."));
2363		VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2364		    ("vnode already free"));
2365		VNASSERT(VSHOULDFREE(vp), vp,
2366		    ("vdropl: freeing when we shouldn't"));
2367		active = vp->v_iflag & VI_ACTIVE;
2368		vp->v_iflag &= ~VI_ACTIVE;
2369		mp = vp->v_mount;
2370		mtx_lock(&vnode_free_list_mtx);
2371		if (active) {
2372			TAILQ_REMOVE(&mp->mnt_activevnodelist, vp,
2373			    v_actfreelist);
2374			mp->mnt_activevnodelistsize--;
2375		}
2376		if (vp->v_iflag & VI_AGE) {
2377			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_actfreelist);
2378		} else {
2379			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
2380		}
2381		freevnodes++;
2382		vp->v_iflag &= ~VI_AGE;
2383		vp->v_iflag |= VI_FREE;
2384		mtx_unlock(&vnode_free_list_mtx);
2385		VI_UNLOCK(vp);
2386		return;
2387	}
2388	/*
2389	 * The vnode has been marked for destruction, so free it.
2390	 */
2391	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
2392	atomic_subtract_long(&numvnodes, 1);
2393	bo = &vp->v_bufobj;
2394	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2395	    ("cleaned vnode still on the free list."));
2396	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
2397	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
2398	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
2399	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
2400	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
2401	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
2402	VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
2403	    ("clean blk trie not empty"));
2404	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
2405	VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
2406	    ("dirty blk trie not empty"));
2407	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
2408	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
2409	VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
2410	VI_UNLOCK(vp);
2411#ifdef MAC
2412	mac_vnode_destroy(vp);
2413#endif
2414	if (vp->v_pollinfo != NULL)
2415		destroy_vpollinfo(vp->v_pollinfo);
2416#ifdef INVARIANTS
2417	/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
2418	vp->v_op = NULL;
2419#endif
2420	rangelock_destroy(&vp->v_rl);
2421	lockdestroy(vp->v_vnlock);
2422	mtx_destroy(&vp->v_interlock);
2423	rw_destroy(BO_LOCKPTR(bo));
2424	uma_zfree(vnode_zone, vp);
2425}
2426
2427/*
2428 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
2429 * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
2430 * OWEINACT tracks whether a vnode missed a call to inactive due to a
2431 * failed lock upgrade.
2432 */
2433void
2434vinactive(struct vnode *vp, struct thread *td)
2435{
2436	struct vm_object *obj;
2437
2438	ASSERT_VOP_ELOCKED(vp, "vinactive");
2439	ASSERT_VI_LOCKED(vp, "vinactive");
2440	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
2441	    ("vinactive: recursed on VI_DOINGINACT"));
2442	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2443	vp->v_iflag |= VI_DOINGINACT;
2444	vp->v_iflag &= ~VI_OWEINACT;
2445	VI_UNLOCK(vp);
2446	/*
2447	 * Before moving off the active list, we must be sure that any
2448	 * modified pages are on the vnode's dirty list since these will
2449	 * no longer be checked once the vnode is on the inactive list.
2450	 * Because the vnode vm object keeps a hold reference on the vnode
2451	 * if there is at least one resident non-cached page, the vnode
2452	 * cannot leave the active list without the page cleanup done.
2453	 */
2454	obj = vp->v_object;
2455	if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) {
2456		VM_OBJECT_WLOCK(obj);
2457		vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC);
2458		VM_OBJECT_WUNLOCK(obj);
2459	}
2460	VOP_INACTIVE(vp, td);
2461	VI_LOCK(vp);
2462	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
2463	    ("vinactive: lost VI_DOINGINACT"));
2464	vp->v_iflag &= ~VI_DOINGINACT;
2465}
2466
2467/*
2468 * Remove any vnodes in the vnode table belonging to mount point mp.
2469 *
2470 * If FORCECLOSE is not specified, there should not be any active ones;
2471 * return an error if any are found (nb: this is a user error, not a
2472 * system error). If FORCECLOSE is specified, detach any active vnodes
2473 * that are found.
2474 *
2475 * If WRITECLOSE is set, only flush out regular file vnodes open for
2476 * writing.
2477 *
2478 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2479 *
2480 * `rootrefs' specifies the base reference count for the root vnode
2481 * of this filesystem. The root vnode is considered busy if its
2482 * v_usecount exceeds this value. On a successful return, vflush()
2483 * will call vrele() on the root vnode exactly rootrefs times.
2484 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2485 * be zero.
2486 */
2487#ifdef DIAGNOSTIC
2488static int busyprt = 0;		/* print out busy vnodes */
2489SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
2490#endif
2491
2492int
2493vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
2494{
2495	struct vnode *vp, *mvp, *rootvp = NULL;
2496	struct vattr vattr;
2497	int busy = 0, error;
2498
2499	CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
2500	    rootrefs, flags);
2501	if (rootrefs > 0) {
2502		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2503		    ("vflush: bad args"));
2504		/*
2505		 * Get the filesystem root vnode. We can vput() it
2506		 * immediately, since with rootrefs > 0, it won't go away.
2507		 */
2508		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
2509			CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
2510			    __func__, error);
2511			return (error);
2512		}
2513		vput(rootvp);
2514	}
2515loop:
2516	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2517		vholdl(vp);
2518		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
2519		if (error) {
2520			vdrop(vp);
2521			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2522			goto loop;
2523		}
2524		/*
2525		 * Skip over vnodes marked VV_SYSTEM.
2526		 */
2527		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2528			VOP_UNLOCK(vp, 0);
2529			vdrop(vp);
2530			continue;
2531		}
2532		/*
2533		 * If WRITECLOSE is set, flush out unlinked but still open
2534		 * files (even if open only for reading) and regular file
2535		 * vnodes open for writing.
2536		 */
2537		if (flags & WRITECLOSE) {
2538			if (vp->v_object != NULL) {
2539				VM_OBJECT_WLOCK(vp->v_object);
2540				vm_object_page_clean(vp->v_object, 0, 0, 0);
2541				VM_OBJECT_WUNLOCK(vp->v_object);
2542			}
2543			error = VOP_FSYNC(vp, MNT_WAIT, td);
2544			if (error != 0) {
2545				VOP_UNLOCK(vp, 0);
2546				vdrop(vp);
2547				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2548				return (error);
2549			}
2550			error = VOP_GETATTR(vp, &vattr, td->td_ucred);
2551			VI_LOCK(vp);
2552
2553			if ((vp->v_type == VNON ||
2554			    (error == 0 && vattr.va_nlink > 0)) &&
2555			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
2556				VOP_UNLOCK(vp, 0);
2557				vdropl(vp);
2558				continue;
2559			}
2560		} else
2561			VI_LOCK(vp);
2562		/*
2563		 * With v_usecount == 0, all we need to do is clear out the
2564		 * vnode data structures and we are done.
2565		 *
2566		 * If FORCECLOSE is set, forcibly close the vnode.
2567		 */
2568		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
2569			VNASSERT(vp->v_usecount == 0 ||
2570			    (vp->v_type != VCHR && vp->v_type != VBLK), vp,
2571			    ("device VNODE %p is FORCECLOSED", vp));
2572			vgonel(vp);
2573		} else {
2574			busy++;
2575#ifdef DIAGNOSTIC
2576			if (busyprt)
2577				vprint("vflush: busy vnode", vp);
2578#endif
2579		}
2580		VOP_UNLOCK(vp, 0);
2581		vdropl(vp);
2582	}
2583	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2584		/*
2585		 * If just the root vnode is busy, and if its refcount
2586		 * is equal to `rootrefs', then go ahead and kill it.
2587		 */
2588		VI_LOCK(rootvp);
2589		KASSERT(busy > 0, ("vflush: not busy"));
2590		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
2591		    ("vflush: usecount %d < rootrefs %d",
2592		     rootvp->v_usecount, rootrefs));
2593		if (busy == 1 && rootvp->v_usecount == rootrefs) {
2594			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
2595			vgone(rootvp);
2596			VOP_UNLOCK(rootvp, 0);
2597			busy = 0;
2598		} else
2599			VI_UNLOCK(rootvp);
2600	}
2601	if (busy) {
2602		CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
2603		    busy);
2604		return (EBUSY);
2605	}
2606	for (; rootrefs > 0; rootrefs--)
2607		vrele(rootvp);
2608	return (0);
2609}
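/*
 * Illustrative sketch, not part of the original source: a filesystem's
 * unmount routine typically flushes its vnodes with vflush(), forcing
 * the flush only when MNT_FORCE was requested.  rootrefs is 0 here
 * because this hypothetical filesystem keeps no extra references on its
 * root vnode.
 *
 *	flags = (mntflags & MNT_FORCE) != 0 ? FORCECLOSE : 0;
 *	error = vflush(mp, 0, flags, td);
 *	if (error != 0)
 *		return (error);
 */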
2610
2611/*
2612 * Recycle an unused vnode to the front of the free list.
2613 */
2614int
2615vrecycle(struct vnode *vp)
2616{
2617	int recycled;
2618
2619	ASSERT_VOP_ELOCKED(vp, "vrecycle");
2620	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2621	recycled = 0;
2622	VI_LOCK(vp);
2623	if (vp->v_usecount == 0) {
2624		recycled = 1;
2625		vgonel(vp);
2626	}
2627	VI_UNLOCK(vp);
2628	return (recycled);
2629}
2630
2631/*
2632 * Eliminate all activity associated with a vnode
2633 * in preparation for reuse.
2634 */
2635void
2636vgone(struct vnode *vp)
2637{
2638	VI_LOCK(vp);
2639	vgonel(vp);
2640	VI_UNLOCK(vp);
2641}
2642
2643static void
2644notify_lowervp_vfs_dummy(struct mount *mp __unused,
2645    struct vnode *lowervp __unused)
2646{
2647}
2648
2649/*
2650 * Notify upper mounts about reclaimed or unlinked vnode.
2651 */
2652void
2653vfs_notify_upper(struct vnode *vp, int event)
2654{
2655	static struct vfsops vgonel_vfsops = {
2656		.vfs_reclaim_lowervp = notify_lowervp_vfs_dummy,
2657		.vfs_unlink_lowervp = notify_lowervp_vfs_dummy,
2658	};
2659	struct mount *mp, *ump, *mmp;
2660
2661	mp = vp->v_mount;
2662	if (mp == NULL)
2663		return;
2664
2665	MNT_ILOCK(mp);
2666	if (TAILQ_EMPTY(&mp->mnt_uppers))
2667		goto unlock;
2668	MNT_IUNLOCK(mp);
2669	mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
2670	mmp->mnt_op = &vgonel_vfsops;
2671	mmp->mnt_kern_flag |= MNTK_MARKER;
2672	MNT_ILOCK(mp);
2673	mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
2674	for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
2675		if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
2676			ump = TAILQ_NEXT(ump, mnt_upper_link);
2677			continue;
2678		}
2679		TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
2680		MNT_IUNLOCK(mp);
2681		switch (event) {
2682		case VFS_NOTIFY_UPPER_RECLAIM:
2683			VFS_RECLAIM_LOWERVP(ump, vp);
2684			break;
2685		case VFS_NOTIFY_UPPER_UNLINK:
2686			VFS_UNLINK_LOWERVP(ump, vp);
2687			break;
2688		default:
2689			KASSERT(0, ("invalid event %d", event));
2690			break;
2691		}
2692		MNT_ILOCK(mp);
2693		ump = TAILQ_NEXT(mmp, mnt_upper_link);
2694		TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
2695	}
2696	free(mmp, M_TEMP);
2697	mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
2698	if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
2699		mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
2700		wakeup(&mp->mnt_uppers);
2701	}
2702unlock:
2703	MNT_IUNLOCK(mp);
2704}
2705
2706/*
2707 * vgone, with the vp interlock held.
2708 */
2709void
2710vgonel(struct vnode *vp)
2711{
2712	struct thread *td;
2713	int oweinact;
2714	int active;
2715	struct mount *mp;
2716
2717	ASSERT_VOP_ELOCKED(vp, "vgonel");
2718	ASSERT_VI_LOCKED(vp, "vgonel");
2719	VNASSERT(vp->v_holdcnt, vp,
2720	    ("vgonel: vp %p has no reference.", vp));
2721	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2722	td = curthread;
2723
2724	/*
2725	 * Don't vgonel if we're already doomed.
2726	 */
2727	if (vp->v_iflag & VI_DOOMED)
2728		return;
2729	vp->v_iflag |= VI_DOOMED;
2730
2731	/*
2732	 * Check to see if the vnode is in use.  If so, we have to call
2733	 * VOP_CLOSE() and VOP_INACTIVE().
2734	 */
2735	active = vp->v_usecount;
2736	oweinact = (vp->v_iflag & VI_OWEINACT);
2737	VI_UNLOCK(vp);
2738	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
2739
2740	/*
2741	 * Clean out any buffers associated with the vnode.
2742	 * If the flush fails, just toss the buffers.
2743	 */
2744	mp = NULL;
2745	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
2746		(void) vn_start_secondary_write(vp, &mp, V_WAIT);
2747	if (vinvalbuf(vp, V_SAVE, 0, 0) != 0)
2748		vinvalbuf(vp, 0, 0, 0);
2749
2750	/*
2751	 * If purging an active vnode, it must be closed and
2752	 * deactivated before being reclaimed.
2753	 */
2754	if (active)
2755		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2756	if (oweinact || active) {
2757		VI_LOCK(vp);
2758		if ((vp->v_iflag & VI_DOINGINACT) == 0)
2759			vinactive(vp, td);
2760		VI_UNLOCK(vp);
2761	}
2762	if (vp->v_type == VSOCK)
2763		vfs_unp_reclaim(vp);
2764	/*
2765	 * Reclaim the vnode.
2766	 */
2767	if (VOP_RECLAIM(vp, td))
2768		panic("vgone: cannot reclaim");
2769	if (mp != NULL)
2770		vn_finished_secondary_write(mp);
2771	VNASSERT(vp->v_object == NULL, vp,
2772	    ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
2773	/*
2774	 * Clear the advisory locks and wake up waiting threads.
2775	 */
2776	(void)VOP_ADVLOCKPURGE(vp);
2777	/*
2778	 * Delete from old mount point vnode list.
2779	 */
2780	delmntque(vp);
2781	cache_purge(vp);
2782	/*
2783	 * Done with purge, reset to the standard lock and invalidate
2784	 * the vnode.
2785	 */
2786	VI_LOCK(vp);
2787	vp->v_vnlock = &vp->v_lock;
2788	vp->v_op = &dead_vnodeops;
2789	vp->v_tag = "none";
2790	vp->v_type = VBAD;
2791}
2792
2793/*
2794 * Calculate the total number of references to a special device.
2795 */
2796int
2797vcount(struct vnode *vp)
2798{
2799	int count;
2800
2801	dev_lock();
2802	count = vp->v_rdev->si_usecount;
2803	dev_unlock();
2804	return (count);
2805}
2806
2807/*
2808 * Same as above, but using the struct cdev * as the argument.
2809 */
2810int
2811count_dev(struct cdev *dev)
2812{
2813	int count;
2814
2815	dev_lock();
2816	count = dev->si_usecount;
2817	dev_unlock();
2818	return (count);
2819}
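/*
 * Illustrative sketch, not part of the original source: these counters
 * let device-backed code ask whether the device is still open through
 * some other vnode, e.g. a hypothetical exclusive-use check:
 *
 *	if (vcount(vp) > 1)
 *		return (EBUSY);
 */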
2820
2821/*
2822 * Print out a description of a vnode.
2823 */
2824static char *typename[] =
2825{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
2826 "VMARKER"};
2827
2828void
2829vn_printf(struct vnode *vp, const char *fmt, ...)
2830{
2831	va_list ap;
2832	char buf[256], buf2[16];
2833	u_long flags;
2834
2835	va_start(ap, fmt);
2836	vprintf(fmt, ap);
2837	va_end(ap);
2838	printf("%p: ", (void *)vp);
2839	printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
2840	printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
2841	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
2842	buf[0] = '\0';
2843	buf[1] = '\0';
2844	if (vp->v_vflag & VV_ROOT)
2845		strlcat(buf, "|VV_ROOT", sizeof(buf));
2846	if (vp->v_vflag & VV_ISTTY)
2847		strlcat(buf, "|VV_ISTTY", sizeof(buf));
2848	if (vp->v_vflag & VV_NOSYNC)
2849		strlcat(buf, "|VV_NOSYNC", sizeof(buf));
2850	if (vp->v_vflag & VV_ETERNALDEV)
2851		strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
2852	if (vp->v_vflag & VV_CACHEDLABEL)
2853		strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
2854	if (vp->v_vflag & VV_TEXT)
2855		strlcat(buf, "|VV_TEXT", sizeof(buf));
2856	if (vp->v_vflag & VV_COPYONWRITE)
2857		strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
2858	if (vp->v_vflag & VV_SYSTEM)
2859		strlcat(buf, "|VV_SYSTEM", sizeof(buf));
2860	if (vp->v_vflag & VV_PROCDEP)
2861		strlcat(buf, "|VV_PROCDEP", sizeof(buf));
2862	if (vp->v_vflag & VV_NOKNOTE)
2863		strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
2864	if (vp->v_vflag & VV_DELETED)
2865		strlcat(buf, "|VV_DELETED", sizeof(buf));
2866	if (vp->v_vflag & VV_MD)
2867		strlcat(buf, "|VV_MD", sizeof(buf));
2868	if (vp->v_vflag & VV_FORCEINSMQ)
2869		strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
2870	flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
2871	    VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
2872	    VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ);
2873	if (flags != 0) {
2874		snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
2875		strlcat(buf, buf2, sizeof(buf));
2876	}
2877	if (vp->v_iflag & VI_MOUNT)
2878		strlcat(buf, "|VI_MOUNT", sizeof(buf));
2879	if (vp->v_iflag & VI_AGE)
2880		strlcat(buf, "|VI_AGE", sizeof(buf));
2881	if (vp->v_iflag & VI_DOOMED)
2882		strlcat(buf, "|VI_DOOMED", sizeof(buf));
2883	if (vp->v_iflag & VI_FREE)
2884		strlcat(buf, "|VI_FREE", sizeof(buf));
2885	if (vp->v_iflag & VI_ACTIVE)
2886		strlcat(buf, "|VI_ACTIVE", sizeof(buf));
2887	if (vp->v_iflag & VI_DOINGINACT)
2888		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
2889	if (vp->v_iflag & VI_OWEINACT)
2890		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
2891	flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
2892	    VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT);
2893	if (flags != 0) {
2894		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
2895		strlcat(buf, buf2, sizeof(buf));
2896	}
2897	printf("    flags (%s)\n", buf + 1);
2898	if (mtx_owned(VI_MTX(vp)))
2899		printf(" VI_LOCKed");
2900	if (vp->v_object != NULL)
2901		printf("    v_object %p ref %d pages %d "
2902		    "cleanbuf %d dirtybuf %d\n",
2903		    vp->v_object, vp->v_object->ref_count,
2904		    vp->v_object->resident_page_count,
2905		    vp->v_bufobj.bo_clean.bv_cnt,
2906		    vp->v_bufobj.bo_dirty.bv_cnt);
2907	printf("    ");
2908	lockmgr_printinfo(vp->v_vnlock);
2909	if (vp->v_data != NULL)
2910		VOP_PRINT(vp);
2911}
2912
2913#ifdef DDB
2914/*
2915 * List all of the locked vnodes in the system.
2916 * Called when debugging the kernel.
2917 */
2918DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
2919{
2920	struct mount *mp;
2921	struct vnode *vp;
2922
2923	/*
2924	 * Note: because this is DDB, we can't obey the locking semantics
2925	 * for these structures, which means we could catch an inconsistent
2926	 * state and dereference a nasty pointer.  Not much to be done
2927	 * about that.
2928	 */
2929	db_printf("Locked vnodes\n");
2930	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2931		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2932			if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
2933				vprint("", vp);
2934		}
2935	}
2936}
2937
2938/*
2939 * Show details about the given vnode.
2940 */
2941DB_SHOW_COMMAND(vnode, db_show_vnode)
2942{
2943	struct vnode *vp;
2944
2945	if (!have_addr)
2946		return;
2947	vp = (struct vnode *)addr;
2948	vn_printf(vp, "vnode ");
2949}
2950
2951/*
2952 * Show details about the given mount point.
2953 */
2954DB_SHOW_COMMAND(mount, db_show_mount)
2955{
2956	struct mount *mp;
2957	struct vfsopt *opt;
2958	struct statfs *sp;
2959	struct vnode *vp;
2960	char buf[512];
2961	uint64_t mflags;
2962	u_int flags;
2963
2964	if (!have_addr) {
2965		/* No address given, print short info about all mount points. */
2966		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2967			db_printf("%p %s on %s (%s)\n", mp,
2968			    mp->mnt_stat.f_mntfromname,
2969			    mp->mnt_stat.f_mntonname,
2970			    mp->mnt_stat.f_fstypename);
2971			if (db_pager_quit)
2972				break;
2973		}
2974		db_printf("\nMore info: show mount <addr>\n");
2975		return;
2976	}
2977
2978	mp = (struct mount *)addr;
2979	db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
2980	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
2981
2982	buf[0] = '\0';
2983	mflags = mp->mnt_flag;
2984#define	MNT_FLAG(flag)	do {						\
2985	if (mflags & (flag)) {						\
2986		if (buf[0] != '\0')					\
2987			strlcat(buf, ", ", sizeof(buf));		\
2988		strlcat(buf, (#flag) + 4, sizeof(buf));			\
2989		mflags &= ~(flag);					\
2990	}								\
2991} while (0)
2992	MNT_FLAG(MNT_RDONLY);
2993	MNT_FLAG(MNT_SYNCHRONOUS);
2994	MNT_FLAG(MNT_NOEXEC);
2995	MNT_FLAG(MNT_NOSUID);
2996	MNT_FLAG(MNT_NFS4ACLS);
2997	MNT_FLAG(MNT_UNION);
2998	MNT_FLAG(MNT_ASYNC);
2999	MNT_FLAG(MNT_SUIDDIR);
3000	MNT_FLAG(MNT_SOFTDEP);
3001	MNT_FLAG(MNT_NOSYMFOLLOW);
3002	MNT_FLAG(MNT_GJOURNAL);
3003	MNT_FLAG(MNT_MULTILABEL);
3004	MNT_FLAG(MNT_ACLS);
3005	MNT_FLAG(MNT_NOATIME);
3006	MNT_FLAG(MNT_NOCLUSTERR);
3007	MNT_FLAG(MNT_NOCLUSTERW);
3008	MNT_FLAG(MNT_SUJ);
3009	MNT_FLAG(MNT_EXRDONLY);
3010	MNT_FLAG(MNT_EXPORTED);
3011	MNT_FLAG(MNT_DEFEXPORTED);
3012	MNT_FLAG(MNT_EXPORTANON);
3013	MNT_FLAG(MNT_EXKERB);
3014	MNT_FLAG(MNT_EXPUBLIC);
3015	MNT_FLAG(MNT_LOCAL);
3016	MNT_FLAG(MNT_QUOTA);
3017	MNT_FLAG(MNT_ROOTFS);
3018	MNT_FLAG(MNT_USER);
3019	MNT_FLAG(MNT_IGNORE);
3020	MNT_FLAG(MNT_UPDATE);
3021	MNT_FLAG(MNT_DELEXPORT);
3022	MNT_FLAG(MNT_RELOAD);
3023	MNT_FLAG(MNT_FORCE);
3024	MNT_FLAG(MNT_SNAPSHOT);
3025	MNT_FLAG(MNT_BYFSID);
3026#undef MNT_FLAG
3027	if (mflags != 0) {
3028		if (buf[0] != '\0')
3029			strlcat(buf, ", ", sizeof(buf));
3030		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3031		    "0x%016jx", mflags);
3032	}
3033	db_printf("    mnt_flag = %s\n", buf);
3034
3035	buf[0] = '\0';
3036	flags = mp->mnt_kern_flag;
3037#define	MNT_KERN_FLAG(flag)	do {					\
3038	if (flags & (flag)) {						\
3039		if (buf[0] != '\0')					\
3040			strlcat(buf, ", ", sizeof(buf));		\
3041		strlcat(buf, (#flag) + 5, sizeof(buf));			\
3042		flags &= ~(flag);					\
3043	}								\
3044} while (0)
3045	MNT_KERN_FLAG(MNTK_UNMOUNTF);
3046	MNT_KERN_FLAG(MNTK_ASYNC);
3047	MNT_KERN_FLAG(MNTK_SOFTDEP);
3048	MNT_KERN_FLAG(MNTK_NOINSMNTQ);
3049	MNT_KERN_FLAG(MNTK_DRAINING);
3050	MNT_KERN_FLAG(MNTK_REFEXPIRE);
3051	MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
3052	MNT_KERN_FLAG(MNTK_SHARED_WRITES);
3053	MNT_KERN_FLAG(MNTK_NO_IOPF);
3054	MNT_KERN_FLAG(MNTK_VGONE_UPPER);
3055	MNT_KERN_FLAG(MNTK_VGONE_WAITER);
3056	MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
3057	MNT_KERN_FLAG(MNTK_MARKER);
3058	MNT_KERN_FLAG(MNTK_NOASYNC);
3059	MNT_KERN_FLAG(MNTK_UNMOUNT);
3060	MNT_KERN_FLAG(MNTK_MWAIT);
3061	MNT_KERN_FLAG(MNTK_SUSPEND);
3062	MNT_KERN_FLAG(MNTK_SUSPEND2);
3063	MNT_KERN_FLAG(MNTK_SUSPENDED);
3064	MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
3065	MNT_KERN_FLAG(MNTK_NOKNOTE);
3066#undef MNT_KERN_FLAG
3067	if (flags != 0) {
3068		if (buf[0] != '\0')
3069			strlcat(buf, ", ", sizeof(buf));
3070		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3071		    "0x%08x", flags);
3072	}
3073	db_printf("    mnt_kern_flag = %s\n", buf);
3074
3075	db_printf("    mnt_opt = ");
3076	opt = TAILQ_FIRST(mp->mnt_opt);
3077	if (opt != NULL) {
3078		db_printf("%s", opt->name);
3079		opt = TAILQ_NEXT(opt, link);
3080		while (opt != NULL) {
3081			db_printf(", %s", opt->name);
3082			opt = TAILQ_NEXT(opt, link);
3083		}
3084	}
3085	db_printf("\n");
3086
3087	sp = &mp->mnt_stat;
3088	db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
3089	    "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
3090	    "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
3091	    "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
3092	    (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
3093	    (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
3094	    (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
3095	    (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
3096	    (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
3097	    (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
3098	    (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
3099	    (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
3100
3101	db_printf("    mnt_cred = { uid=%u ruid=%u",
3102	    (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
3103	if (jailed(mp->mnt_cred))
3104		db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
3105	db_printf(" }\n");
3106	db_printf("    mnt_ref = %d\n", mp->mnt_ref);
3107	db_printf("    mnt_gen = %d\n", mp->mnt_gen);
3108	db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
3109	db_printf("    mnt_activevnodelistsize = %d\n",
3110	    mp->mnt_activevnodelistsize);
3111	db_printf("    mnt_writeopcount = %d\n", mp->mnt_writeopcount);
3112	db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
3113	db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
3114	db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
3115	db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
3116	db_printf("    mnt_secondary_accwrites = %d\n",
3117	    mp->mnt_secondary_accwrites);
3118	db_printf("    mnt_gjprovider = %s\n",
3119	    mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
3120
3121	db_printf("\n\nList of active vnodes\n");
3122	TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
3123		if (vp->v_type != VMARKER) {
3124			vn_printf(vp, "vnode ");
3125			if (db_pager_quit)
3126				break;
3127		}
3128	}
3129	db_printf("\n\nList of inactive vnodes\n");
3130	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3131		if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) {
3132			vn_printf(vp, "vnode ");
3133			if (db_pager_quit)
3134				break;
3135		}
3136	}
3137}
3138#endif	/* DDB */
3139
3140/*
3141 * Fill in a struct xvfsconf based on a struct vfsconf.
3142 */
3143static int
3144vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
3145{
3146	struct xvfsconf xvfsp;
3147
3148	bzero(&xvfsp, sizeof(xvfsp));
3149	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
3150	xvfsp.vfc_typenum = vfsp->vfc_typenum;
3151	xvfsp.vfc_refcount = vfsp->vfc_refcount;
3152	xvfsp.vfc_flags = vfsp->vfc_flags;
3153	/*
3154	 * These are unused in userland; we keep them
3155	 * to avoid breaking binary compatibility.
3156	 */
3157	xvfsp.vfc_vfsops = NULL;
3158	xvfsp.vfc_next = NULL;
3159	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3160}
3161
3162#ifdef COMPAT_FREEBSD32
3163struct xvfsconf32 {
3164	uint32_t	vfc_vfsops;
3165	char		vfc_name[MFSNAMELEN];
3166	int32_t		vfc_typenum;
3167	int32_t		vfc_refcount;
3168	int32_t		vfc_flags;
3169	uint32_t	vfc_next;
3170};
3171
3172static int
3173vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
3174{
3175	struct xvfsconf32 xvfsp;
3176
3177	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
3178	xvfsp.vfc_typenum = vfsp->vfc_typenum;
3179	xvfsp.vfc_refcount = vfsp->vfc_refcount;
3180	xvfsp.vfc_flags = vfsp->vfc_flags;
3181	xvfsp.vfc_vfsops = 0;
3182	xvfsp.vfc_next = 0;
3183	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3184}
3185#endif
3186
3187/*
3188 * Top level filesystem related information gathering.
3189 */
3190static int
3191sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
3192{
3193	struct vfsconf *vfsp;
3194	int error;
3195
3196	error = 0;
3197	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3198#ifdef COMPAT_FREEBSD32
3199		if (req->flags & SCTL_MASK32)
3200			error = vfsconf2x32(req, vfsp);
3201		else
3202#endif
3203			error = vfsconf2x(req, vfsp);
3204		if (error)
3205			break;
3206	}
3207	return (error);
3208}
3209
3210SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD,
3211    NULL, 0, sysctl_vfs_conflist,
3212    "S,xvfsconf", "List of all configured filesystems");
3213
3214#ifndef BURN_BRIDGES
3215static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
3216
3217static int
3218vfs_sysctl(SYSCTL_HANDLER_ARGS)
3219{
3220	int *name = (int *)arg1 - 1;	/* XXX */
3221	u_int namelen = arg2 + 1;	/* XXX */
3222	struct vfsconf *vfsp;
3223
3224	log(LOG_WARNING, "userland calling deprecated sysctl, "
3225	    "please rebuild world\n");
3226
3227#if 1 || defined(COMPAT_PRELITE2)
3228	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
3229	if (namelen == 1)
3230		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
3231#endif
3232
3233	switch (name[1]) {
3234	case VFS_MAXTYPENUM:
3235		if (namelen != 2)
3236			return (ENOTDIR);
3237		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
3238	case VFS_CONF:
3239		if (namelen != 3)
3240			return (ENOTDIR);	/* overloaded */
3241		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
3242			if (vfsp->vfc_typenum == name[2])
3243				break;
3244		if (vfsp == NULL)
3245			return (EOPNOTSUPP);
3246#ifdef COMPAT_FREEBSD32
3247		if (req->flags & SCTL_MASK32)
3248			return (vfsconf2x32(req, vfsp));
3249		else
3250#endif
3251			return (vfsconf2x(req, vfsp));
3252	}
3253	return (EOPNOTSUPP);
3254}
3255
3256static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP,
3257    vfs_sysctl, "Generic filesystem");
3258
3259#if 1 || defined(COMPAT_PRELITE2)
3260
3261static int
3262sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
3263{
3264	int error;
3265	struct vfsconf *vfsp;
3266	struct ovfsconf ovfs;
3267
3268	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3269		bzero(&ovfs, sizeof(ovfs));
3270		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
3271		strcpy(ovfs.vfc_name, vfsp->vfc_name);
3272		ovfs.vfc_index = vfsp->vfc_typenum;
3273		ovfs.vfc_refcount = vfsp->vfc_refcount;
3274		ovfs.vfc_flags = vfsp->vfc_flags;
3275		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
3276		if (error)
3277			return error;
3278	}
3279	return 0;
3280}
3281
3282#endif /* 1 || COMPAT_PRELITE2 */
3283#endif /* !BURN_BRIDGES */
3284
3285#define KINFO_VNODESLOP		10
3286#ifdef notyet
3287/*
3288 * Dump vnode list (via sysctl).
3289 */
3290/* ARGSUSED */
3291static int
3292sysctl_vnode(SYSCTL_HANDLER_ARGS)
3293{
3294	struct xvnode *xvn;
3295	struct mount *mp;
3296	struct vnode *vp;
3297	int error, len, n;
3298
3299	/*
3300	 * Stale numvnodes access is not fatal here.
3301	 */
3302	req->lock = 0;
3303	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
3304	if (!req->oldptr)
3305		/* Make an estimate */
3306		return (SYSCTL_OUT(req, 0, len));
3307
3308	error = sysctl_wire_old_buffer(req, 0);
3309	if (error != 0)
3310		return (error);
3311	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
3312	n = 0;
3313	mtx_lock(&mountlist_mtx);
3314	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3315		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
3316			continue;
3317		MNT_ILOCK(mp);
3318		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3319			if (n == len)
3320				break;
3321			vref(vp);
3322			xvn[n].xv_size = sizeof *xvn;
3323			xvn[n].xv_vnode = vp;
3324			xvn[n].xv_id = 0;	/* XXX compat */
3325#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
3326			XV_COPY(usecount);
3327			XV_COPY(writecount);
3328			XV_COPY(holdcnt);
3329			XV_COPY(mount);
3330			XV_COPY(numoutput);
3331			XV_COPY(type);
3332#undef XV_COPY
3333			xvn[n].xv_flag = vp->v_vflag;
3334
3335			switch (vp->v_type) {
3336			case VREG:
3337			case VDIR:
3338			case VLNK:
3339				break;
3340			case VBLK:
3341			case VCHR:
3342				if (vp->v_rdev == NULL) {
3343					vrele(vp);
3344					continue;
3345				}
3346				xvn[n].xv_dev = dev2udev(vp->v_rdev);
3347				break;
3348			case VSOCK:
3349				xvn[n].xv_socket = vp->v_socket;
3350				break;
3351			case VFIFO:
3352				xvn[n].xv_fifo = vp->v_fifoinfo;
3353				break;
3354			case VNON:
3355			case VBAD:
3356			default:
3357				/* shouldn't happen? */
3358				vrele(vp);
3359				continue;
3360			}
3361			vrele(vp);
3362			++n;
3363		}
3364		MNT_IUNLOCK(mp);
3365		mtx_lock(&mountlist_mtx);
3366		vfs_unbusy(mp);
3367		if (n == len)
3368			break;
3369	}
3370	mtx_unlock(&mountlist_mtx);
3371
3372	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
3373	free(xvn, M_TEMP);
3374	return (error);
3375}
3376
3377SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
3378    0, 0, sysctl_vnode, "S,xvnode", "");
3379#endif
3380
3381/*
3382 * Unmount all filesystems. The list is traversed in reverse order
3383 * of mounting to avoid dependencies.
3384 */
3385void
3386vfs_unmountall(void)
3387{
3388	struct mount *mp;
3389	struct thread *td;
3390	int error;
3391
3392	CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
3393	td = curthread;
3394
3395	/*
3396	 * Since this only runs when rebooting, it is not interlocked.
3397	 */
3398	while (!TAILQ_EMPTY(&mountlist)) {
3399		mp = TAILQ_LAST(&mountlist, mntlist);
3400		error = dounmount(mp, MNT_FORCE, td);
3401		if (error) {
3402			TAILQ_REMOVE(&mountlist, mp, mnt_list);
3403			/*
3404			 * XXX: Due to the way in which we mount the root
3405			 * file system off of devfs, devfs will generate a
3406			 * "busy" warning when we try to unmount it before
3407			 * the root.  Don't print a warning as a result in
3408			 * order to avoid false positive errors that may
3409			 * cause needless upset.
3410			 */
3411			if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) {
3412				printf("unmount of %s failed (",
3413				    mp->mnt_stat.f_mntonname);
3414				if (error == EBUSY)
3415					printf("BUSY)\n");
3416				else
3417					printf("%d)\n", error);
3418			}
3419		} else {
3420			/* The unmount has removed mp from the mountlist */
3421		}
3422	}
3423}
3424
3425/*
3426 * Perform msync on all vnodes under a mount point.
3427 * The mount point must be locked.
3428 */
3429void
3430vfs_msync(struct mount *mp, int flags)
3431{
3432	struct vnode *vp, *mvp;
3433	struct vm_object *obj;
3434
3435	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
3436	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
3437		obj = vp->v_object;
3438		if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 &&
3439		    (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
3440			if (!vget(vp,
3441			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3442			    curthread)) {
3443				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
3444					vput(vp);
3445					continue;
3446				}
3447
3448				obj = vp->v_object;
3449				if (obj != NULL) {
3450					VM_OBJECT_WLOCK(obj);
3451					vm_object_page_clean(obj, 0, 0,
3452					    flags == MNT_WAIT ?
3453					    OBJPC_SYNC : OBJPC_NOSYNC);
3454					VM_OBJECT_WUNLOCK(obj);
3455				}
3456				vput(vp);
3457			}
3458		} else
3459			VI_UNLOCK(vp);
3460	}
3461}
3462
3463static void
3464destroy_vpollinfo_free(struct vpollinfo *vi)
3465{
3466
3467	knlist_destroy(&vi->vpi_selinfo.si_note);
3468	mtx_destroy(&vi->vpi_lock);
3469	uma_zfree(vnodepoll_zone, vi);
3470}
3471
3472static void
3473destroy_vpollinfo(struct vpollinfo *vi)
3474{
3475
3476	knlist_clear(&vi->vpi_selinfo.si_note, 1);
3477	seldrain(&vi->vpi_selinfo);
3478	destroy_vpollinfo_free(vi);
3479}
3480
3481/*
3482 * Initialize per-vnode helper structure to hold poll-related state.
3483 */
3484void
3485v_addpollinfo(struct vnode *vp)
3486{
3487	struct vpollinfo *vi;
3488
3489	if (vp->v_pollinfo != NULL)
3490		return;
3491	vi = uma_zalloc(vnodepoll_zone, M_WAITOK);
3492	mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
3493	knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
3494	    vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
3495	VI_LOCK(vp);
3496	if (vp->v_pollinfo != NULL) {
3497		VI_UNLOCK(vp);
3498		destroy_vpollinfo_free(vi);
3499		return;
3500	}
3501	vp->v_pollinfo = vi;
3502	VI_UNLOCK(vp);
3503}
3504
3505/*
3506 * Record a process's interest in events which might happen to
3507 * a vnode.  Because poll uses the historic select-style interface
3508 * internally, this routine serves as both the ``check for any
3509 * pending events'' and the ``record my interest in future events''
3510 * functions.  (These are done together, while the lock is held,
3511 * to avoid race conditions.)
3512 */
3513int
3514vn_pollrecord(struct vnode *vp, struct thread *td, int events)
3515{
3516
3517	v_addpollinfo(vp);
3518	mtx_lock(&vp->v_pollinfo->vpi_lock);
3519	if (vp->v_pollinfo->vpi_revents & events) {
3520		/*
3521		 * This leaves events we are not interested
3522		 * in available for the other process which
3523		 * presumably had requested them
3524		 * (otherwise they would never have been
3525		 * recorded).
3526		 */
3527		events &= vp->v_pollinfo->vpi_revents;
3528		vp->v_pollinfo->vpi_revents &= ~events;
3529
3530		mtx_unlock(&vp->v_pollinfo->vpi_lock);
3531		return (events);
3532	}
3533	vp->v_pollinfo->vpi_events |= events;
3534	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3535	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3536	return (0);
3537}
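/*
 * Illustrative sketch, not part of the original source: a filesystem
 * with no special poll semantics can implement VOP_POLL by handing the
 * request to vn_pollrecord(); the a_* names are the fields of struct
 * vop_poll_args.
 *
 *	return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
 */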
3538
3539/*
3540 * Routine to create and manage a filesystem syncer vnode.
3541 */
3542#define sync_close ((int (*)(struct  vop_close_args *))nullop)
3543static int	sync_fsync(struct  vop_fsync_args *);
3544static int	sync_inactive(struct  vop_inactive_args *);
3545static int	sync_reclaim(struct  vop_reclaim_args *);
3546
3547static struct vop_vector sync_vnodeops = {
3548	.vop_bypass =	VOP_EOPNOTSUPP,
3549	.vop_close =	sync_close,		/* close */
3550	.vop_fsync =	sync_fsync,		/* fsync */
3551	.vop_inactive =	sync_inactive,	/* inactive */
3552	.vop_reclaim =	sync_reclaim,	/* reclaim */
3553	.vop_lock1 =	vop_stdlock,	/* lock */
3554	.vop_unlock =	vop_stdunlock,	/* unlock */
3555	.vop_islocked =	vop_stdislocked,	/* islocked */
3556};
3557
3558/*
3559 * Create a new filesystem syncer vnode for the specified mount point.
3560 */
3561void
3562vfs_allocate_syncvnode(struct mount *mp)
3563{
3564	struct vnode *vp;
3565	struct bufobj *bo;
3566	static long start, incr, next;
3567	int error;
3568
3569	/* Allocate a new vnode */
3570	error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
3571	if (error != 0)
3572		panic("vfs_allocate_syncvnode: getnewvnode() failed");
3573	vp->v_type = VNON;
3574	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3575	vp->v_vflag |= VV_FORCEINSMQ;
3576	error = insmntque(vp, mp);
3577	if (error != 0)
3578		panic("vfs_allocate_syncvnode: insmntque() failed");
3579	vp->v_vflag &= ~VV_FORCEINSMQ;
3580	VOP_UNLOCK(vp, 0);
3581	/*
3582	 * Place the vnode onto the syncer worklist. We attempt to
3583	 * scatter them about on the list so that they will go off
3584	 * at evenly distributed times even if all the filesystems
3585	 * are mounted at once.
3586	 */
3587	next += incr;
3588	if (next == 0 || next > syncer_maxdelay) {
3589		start /= 2;
3590		incr /= 2;
3591		if (start == 0) {
3592			start = syncer_maxdelay / 2;
3593			incr = syncer_maxdelay;
3594		}
3595		next = start;
3596	}
3597	bo = &vp->v_bufobj;
3598	BO_LOCK(bo);
3599	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
3600	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
3601	mtx_lock(&sync_mtx);
3602	sync_vnode_count++;
3603	if (mp->mnt_syncer == NULL) {
3604		mp->mnt_syncer = vp;
3605		vp = NULL;
3606	}
3607	mtx_unlock(&sync_mtx);
3608	BO_UNLOCK(bo);
3609	if (vp != NULL) {
3610		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3611		vgone(vp);
3612		vput(vp);
3613	}
3614}
3615
3616void
3617vfs_deallocate_syncvnode(struct mount *mp)
3618{
3619	struct vnode *vp;
3620
3621	mtx_lock(&sync_mtx);
3622	vp = mp->mnt_syncer;
3623	if (vp != NULL)
3624		mp->mnt_syncer = NULL;
3625	mtx_unlock(&sync_mtx);
3626	if (vp != NULL)
3627		vrele(vp);
3628}
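/*
 * Illustrative sketch, not part of the original source: mount update
 * code is expected to keep a syncer vnode only on writable mounts,
 * along the (simplified, hypothetical) lines of:
 *
 *	if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL)
 *		vfs_allocate_syncvnode(mp);
 *	else if ((mp->mnt_flag & MNT_RDONLY) != 0 && mp->mnt_syncer != NULL)
 *		vfs_deallocate_syncvnode(mp);
 */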
3629
3630/*
3631 * Do a lazy sync of the filesystem.
3632 */
3633static int
3634sync_fsync(struct vop_fsync_args *ap)
3635{
3636	struct vnode *syncvp = ap->a_vp;
3637	struct mount *mp = syncvp->v_mount;
3638	int error, save;
3639	struct bufobj *bo;
3640
3641	/*
3642	 * We only need to do something if this is a lazy evaluation.
3643	 */
3644	if (ap->a_waitfor != MNT_LAZY)
3645		return (0);
3646
3647	/*
3648	 * Move ourselves to the back of the sync list.
3649	 */
3650	bo = &syncvp->v_bufobj;
3651	BO_LOCK(bo);
3652	vn_syncer_add_to_worklist(bo, syncdelay);
3653	BO_UNLOCK(bo);
3654
3655	/*
3656	 * Walk the list of vnodes pushing all that are dirty and
3657	 * not already on the sync list.
3658	 */
3659	if (vfs_busy(mp, MBF_NOWAIT) != 0)
3660		return (0);
3661	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3662		vfs_unbusy(mp);
3663		return (0);
3664	}
3665	save = curthread_pflags_set(TDP_SYNCIO);
3666	vfs_msync(mp, MNT_NOWAIT);
3667	error = VFS_SYNC(mp, MNT_LAZY);
3668	curthread_pflags_restore(save);
3669	vn_finished_write(mp);
3670	vfs_unbusy(mp);
3671	return (error);
3672}
3673
3674/*
3675 * The syncer vnode is no longer referenced.
3676 */
3677static int
3678sync_inactive(struct vop_inactive_args *ap)
3679{
3680
3681	vgone(ap->a_vp);
3682	return (0);
3683}
3684
3685/*
3686 * The syncer vnode is no longer needed and is being decommissioned.
3687 *
3688 * Modifications to the worklist must be protected by sync_mtx.
3689 */
3690static int
3691sync_reclaim(struct vop_reclaim_args *ap)
3692{
3693	struct vnode *vp = ap->a_vp;
3694	struct bufobj *bo;
3695
3696	bo = &vp->v_bufobj;
3697	BO_LOCK(bo);
3698	mtx_lock(&sync_mtx);
3699	if (vp->v_mount->mnt_syncer == vp)
3700		vp->v_mount->mnt_syncer = NULL;
3701	if (bo->bo_flag & BO_ONWORKLST) {
3702		LIST_REMOVE(bo, bo_synclist);
3703		syncer_worklist_len--;
3704		sync_vnode_count--;
3705		bo->bo_flag &= ~BO_ONWORKLST;
3706	}
3707	mtx_unlock(&sync_mtx);
3708	BO_UNLOCK(bo);
3709
3710	return (0);
3711}
3712
3713/*
3714 * Check if vnode represents a disk device
3715 */
3716int
3717vn_isdisk(struct vnode *vp, int *errp)
3718{
3719	int error;
3720
3721	error = 0;
3722	dev_lock();
3723	if (vp->v_type != VCHR)
3724		error = ENOTBLK;
3725	else if (vp->v_rdev == NULL)
3726		error = ENXIO;
3727	else if (vp->v_rdev->si_devsw == NULL)
3728		error = ENXIO;
3729	else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
3730		error = ENOTBLK;
3731	dev_unlock();
3732	if (errp != NULL)
3733		*errp = error;
3734	return (error == 0);
3735}
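/*
 * Illustrative sketch, not part of the original source: vn_isdisk()
 * returns a boolean and reports the reason through the optional errp
 * argument, so callers commonly use it as a guard:
 *
 *	if (!vn_isdisk(vp, &error))
 *		return (error);
 */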
3736
3737/*
3738 * Common filesystem object access control check routine.  Accepts a
3739 * vnode's type, "mode", uid and gid, requested access mode, credentials,
3740 * and optional call-by-reference privused argument allowing vaccess()
3741 * to indicate to the caller whether privilege was used to satisfy the
3742 * request (obsoleted).  Returns 0 on success, or an errno on failure.
3743 */
3744int
3745vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
3746    accmode_t accmode, struct ucred *cred, int *privused)
3747{
3748	accmode_t dac_granted;
3749	accmode_t priv_granted;
3750
3751	KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
3752	    ("invalid bit in accmode"));
3753	KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
3754	    ("VAPPEND without VWRITE"));
3755
3756	/*
3757	 * Look for a normal, non-privileged way to access the file/directory
3758	 * as requested.  If it exists, go with that.
3759	 */
3760
3761	if (privused != NULL)
3762		*privused = 0;
3763
3764	dac_granted = 0;
3765
3766	/* Check the owner. */
3767	if (cred->cr_uid == file_uid) {
3768		dac_granted |= VADMIN;
3769		if (file_mode & S_IXUSR)
3770			dac_granted |= VEXEC;
3771		if (file_mode & S_IRUSR)
3772			dac_granted |= VREAD;
3773		if (file_mode & S_IWUSR)
3774			dac_granted |= (VWRITE | VAPPEND);
3775
3776		if ((accmode & dac_granted) == accmode)
3777			return (0);
3778
3779		goto privcheck;
3780	}
3781
3782	/* Otherwise, check the groups (first match) */
3783	if (groupmember(file_gid, cred)) {
3784		if (file_mode & S_IXGRP)
3785			dac_granted |= VEXEC;
3786		if (file_mode & S_IRGRP)
3787			dac_granted |= VREAD;
3788		if (file_mode & S_IWGRP)
3789			dac_granted |= (VWRITE | VAPPEND);
3790
3791		if ((accmode & dac_granted) == accmode)
3792			return (0);
3793
3794		goto privcheck;
3795	}
3796
3797	/* Otherwise, check everyone else. */
3798	if (file_mode & S_IXOTH)
3799		dac_granted |= VEXEC;
3800	if (file_mode & S_IROTH)
3801		dac_granted |= VREAD;
3802	if (file_mode & S_IWOTH)
3803		dac_granted |= (VWRITE | VAPPEND);
3804	if ((accmode & dac_granted) == accmode)
3805		return (0);
3806
3807privcheck:
3808	/*
3809	 * Build a privilege mask to determine if the set of privileges
3810	 * satisfies the requirements when combined with the granted mask
3811	 * from above.  For each privilege, if the privilege is required,
3812	 * bitwise or the request type onto the priv_granted mask.
3813	 */
3814	priv_granted = 0;
3815
3816	if (type == VDIR) {
3817		/*
3818		 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
3819		 * requests, instead of PRIV_VFS_EXEC.
3820		 */
3821		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3822		    !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
3823			priv_granted |= VEXEC;
3824	} else {
3825		/*
3826		 * Ensure that at least one execute bit is on. Otherwise,
3827		 * a privileged user will always succeed, and we don't want
3828		 * this to happen unless the file really is executable.
3829		 */
3830		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3831		    (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
3832		    !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
3833			priv_granted |= VEXEC;
3834	}
3835
3836	if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
3837	    !priv_check_cred(cred, PRIV_VFS_READ, 0))
3838		priv_granted |= VREAD;
3839
3840	if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3841	    !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
3842		priv_granted |= (VWRITE | VAPPEND);
3843
3844	if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3845	    !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
3846		priv_granted |= VADMIN;
3847
3848	if ((accmode & (priv_granted | dac_granted)) == accmode) {
3849		/* XXX audit: privilege used */
3850		if (privused != NULL)
3851			*privused = 1;
3852		return (0);
3853	}
3854
3855	return ((accmode & VADMIN) ? EPERM : EACCES);
3856}
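
/*
 * Usage sketch: a minimal VOP_ACCESS implementation that delegates the
 * UNIX permission check to vaccess() above.  "struct examplefs_node"
 * and its fields are hypothetical stand-ins for a filesystem's
 * per-inode metadata.
 */
struct examplefs_node {
	mode_t	en_mode;
	uid_t	en_uid;
	gid_t	en_gid;
};

static int
examplefs_access(struct vop_access_args *ap)
{
	struct examplefs_node *np = ap->a_vp->v_data;

	return (vaccess(ap->a_vp->v_type, np->en_mode, np->en_uid,
	    np->en_gid, ap->a_accmode, ap->a_cred, NULL));
}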
3857
3858/*
3859 * Credential check based on process requesting service, and per-attribute
3860 * permissions.
3861 */
3862int
3863extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
3864    struct thread *td, accmode_t accmode)
3865{
3866
3867	/*
3868	 * Kernel-invoked operations always succeed.
3869	 */
3870	if (cred == NOCRED)
3871		return (0);
3872
3873	/*
3874	 * Do not allow privileged processes in jail to directly manipulate
3875	 * system attributes.
3876	 */
3877	switch (attrnamespace) {
3878	case EXTATTR_NAMESPACE_SYSTEM:
3879		/* Potentially should be: return (EPERM); */
3880		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
3881	case EXTATTR_NAMESPACE_USER:
3882		return (VOP_ACCESS(vp, accmode, cred, td));
3883	default:
3884		return (EPERM);
3885	}
3886}
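
/*
 * Usage sketch (hypothetical): a VOP_GETEXTATTR implementation runs
 * this check before reading attribute data; a write path would pass
 * VWRITE instead of VREAD.
 */
static int
examplefs_getextattr_check(struct vop_getextattr_args *ap)
{

	return (extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD));
}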
3887
3888#ifdef DEBUG_VFS_LOCKS
3889/*
3890 * This only exists to suppress warnings from unlocked specfs accesses.  It is
3891 * no longer ok to have an unlocked VFS.
3892 */
3893#define	IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL ||		\
3894	(vp)->v_type == VCHR ||	(vp)->v_type == VBAD)
3895
3896int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
3897SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
3898    "Drop into debugger on lock violation");
3899
3900int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
3901SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
3902    0, "Check for interlock across VOPs");
3903
3904int vfs_badlock_print = 1;	/* Print lock violations. */
3905SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
3906    0, "Print lock violations");
3907
3908#ifdef KDB
3909int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
3910SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
3911    &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
3912#endif
3913
3914static void
3915vfs_badlock(const char *msg, const char *str, struct vnode *vp)
3916{
3917
3918#ifdef KDB
3919	if (vfs_badlock_backtrace)
3920		kdb_backtrace();
3921#endif
3922	if (vfs_badlock_print)
3923		printf("%s: %p %s\n", str, (void *)vp, msg);
3924	if (vfs_badlock_ddb)
3925		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
3926}
3927
3928void
3929assert_vi_locked(struct vnode *vp, const char *str)
3930{
3931
3932	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
3933		vfs_badlock("interlock is not locked but should be", str, vp);
3934}
3935
3936void
3937assert_vi_unlocked(struct vnode *vp, const char *str)
3938{
3939
3940	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
3941		vfs_badlock("interlock is locked but should not be", str, vp);
3942}
3943
3944void
3945assert_vop_locked(struct vnode *vp, const char *str)
3946{
3947	int locked;
3948
3949	if (!IGNORE_LOCK(vp)) {
3950		locked = VOP_ISLOCKED(vp);
3951		if (locked == 0 || locked == LK_EXCLOTHER)
3952			vfs_badlock("is not locked but should be", str, vp);
3953	}
3954}
3955
3956void
3957assert_vop_unlocked(struct vnode *vp, const char *str)
3958{
3959
3960	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
3961		vfs_badlock("is locked but should not be", str, vp);
3962}
3963
3964void
3965assert_vop_elocked(struct vnode *vp, const char *str)
3966{
3967
3968	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
3969		vfs_badlock("is not exclusive locked but should be", str, vp);
3970}
3971
3972#if 0
3973void
3974assert_vop_elocked_other(struct vnode *vp, const char *str)
3975{
3976
3977	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER)
3978		vfs_badlock("is not exclusive locked by another thread",
3979		    str, vp);
3980}
3981
3982void
3983assert_vop_slocked(struct vnode *vp, const char *str)
3984{
3985
3986	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED)
3987		vfs_badlock("is not locked shared but should be", str, vp);
3988}
3989#endif /* 0 */
3990#endif /* DEBUG_VFS_LOCKS */
3991
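/*
 * Helper for filesystems that abort a rename: drop the references and
 * locks handed in by VOP_RENAME() callers (the locked target vnodes
 * are vput'ed, the source vnodes vrele'd, taking care not to unlock
 * tdvp twice when it equals tvp).
 */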
3992void
3993vop_rename_fail(struct vop_rename_args *ap)
3994{
3995
3996	if (ap->a_tvp != NULL)
3997		vput(ap->a_tvp);
3998	if (ap->a_tdvp == ap->a_tvp)
3999		vrele(ap->a_tdvp);
4000	else
4001		vput(ap->a_tdvp);
4002	vrele(ap->a_fdvp);
4003	vrele(ap->a_fvp);
4004}
4005
4006void
4007vop_rename_pre(void *ap)
4008{
4009	struct vop_rename_args *a = ap;
4010
4011#ifdef DEBUG_VFS_LOCKS
4012	if (a->a_tvp)
4013		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
4014	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
4015	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
4016	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
4017
4018	/* Check the source (from). */
4019	if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
4020	    (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
4021		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
4022	if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
4023		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
4024
4025	/* Check the target. */
4026	if (a->a_tvp)
4027		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
4028	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
4029#endif
4030	if (a->a_tdvp != a->a_fdvp)
4031		vhold(a->a_fdvp);
4032	if (a->a_tvp != a->a_fvp)
4033		vhold(a->a_fvp);
4034	vhold(a->a_tdvp);
4035	if (a->a_tvp)
4036		vhold(a->a_tvp);
4037}
4038
4039void
4040vop_strategy_pre(void *ap)
4041{
4042#ifdef DEBUG_VFS_LOCKS
4043	struct vop_strategy_args *a;
4044	struct buf *bp;
4045
4046	a = ap;
4047	bp = a->a_bp;
4048
4049	/*
4050	 * Cluster ops lock their component buffers but not the IO container.
4051	 */
4052	if ((bp->b_flags & B_CLUSTER) != 0)
4053		return;
4054
4055	if (panicstr == NULL && !BUF_ISLOCKED(bp)) {
4056		if (vfs_badlock_print)
4057			printf(
4058			    "VOP_STRATEGY: bp is not locked but should be\n");
4059		if (vfs_badlock_ddb)
4060			kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
4061	}
4062#endif
4063}
4064
4065void
4066vop_lock_pre(void *ap)
4067{
4068#ifdef DEBUG_VFS_LOCKS
4069	struct vop_lock1_args *a = ap;
4070
4071	if ((a->a_flags & LK_INTERLOCK) == 0)
4072		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
4073	else
4074		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
4075#endif
4076}
4077
4078void
4079vop_lock_post(void *ap, int rc)
4080{
4081#ifdef DEBUG_VFS_LOCKS
4082	struct vop_lock1_args *a = ap;
4083
4084	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
4085	if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
4086		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
4087#endif
4088}
4089
4090void
4091vop_unlock_pre(void *ap)
4092{
4093#ifdef DEBUG_VFS_LOCKS
4094	struct vop_unlock_args *a = ap;
4095
4096	if (a->a_flags & LK_INTERLOCK)
4097		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
4098	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
4099#endif
4100}
4101
4102void
4103vop_unlock_post(void *ap, int rc)
4104{
4105#ifdef DEBUG_VFS_LOCKS
4106	struct vop_unlock_args *a = ap;
4107
4108	if (a->a_flags & LK_INTERLOCK)
4109		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
4110#endif
4111}
4112
4113void
4114vop_create_post(void *ap, int rc)
4115{
4116	struct vop_create_args *a = ap;
4117
4118	if (!rc)
4119		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4120}
4121
4122void
4123vop_deleteextattr_post(void *ap, int rc)
4124{
4125	struct vop_deleteextattr_args *a = ap;
4126
4127	if (!rc)
4128		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4129}
4130
4131void
4132vop_link_post(void *ap, int rc)
4133{
4134	struct vop_link_args *a = ap;
4135
4136	if (!rc) {
4137		VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
4138		VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
4139	}
4140}
4141
4142void
4143vop_mkdir_post(void *ap, int rc)
4144{
4145	struct vop_mkdir_args *a = ap;
4146
4147	if (!rc)
4148		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4149}
4150
4151void
4152vop_mknod_post(void *ap, int rc)
4153{
4154	struct vop_mknod_args *a = ap;
4155
4156	if (!rc)
4157		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4158}
4159
4160void
4161vop_remove_post(void *ap, int rc)
4162{
4163	struct vop_remove_args *a = ap;
4164
4165	if (!rc) {
4166		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4167		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4168	}
4169}
4170
4171void
4172vop_rename_post(void *ap, int rc)
4173{
4174	struct vop_rename_args *a = ap;
4175
4176	if (!rc) {
4177		VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
4178		VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
4179		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
4180		if (a->a_tvp)
4181			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
4182	}
4183	if (a->a_tdvp != a->a_fdvp)
4184		vdrop(a->a_fdvp);
4185	if (a->a_tvp != a->a_fvp)
4186		vdrop(a->a_fvp);
4187	vdrop(a->a_tdvp);
4188	if (a->a_tvp)
4189		vdrop(a->a_tvp);
4190}
4191
4192void
4193vop_rmdir_post(void *ap, int rc)
4194{
4195	struct vop_rmdir_args *a = ap;
4196
4197	if (!rc) {
4198		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4199		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4200	}
4201}
4202
4203void
4204vop_setattr_post(void *ap, int rc)
4205{
4206	struct vop_setattr_args *a = ap;
4207
4208	if (!rc)
4209		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4210}
4211
4212void
4213vop_setextattr_post(void *ap, int rc)
4214{
4215	struct vop_setextattr_args *a = ap;
4216
4217	if (!rc)
4218		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4219}
4220
4221void
4222vop_symlink_post(void *ap, int rc)
4223{
4224	struct vop_symlink_args *a = ap;
4225
4226	if (!rc)
4227		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4228}
4229
4230static struct knlist fs_knlist;
4231
4232static void
4233vfs_event_init(void *arg)
4234{
4235	knlist_init_mtx(&fs_knlist, NULL);
4236}
4237/* XXX - correct order? */
4238SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
4239
4240void
4241vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
4242{
4243
4244	KNOTE_UNLOCKED(&fs_knlist, event);
4245}
4246
4247static int	filt_fsattach(struct knote *kn);
4248static void	filt_fsdetach(struct knote *kn);
4249static int	filt_fsevent(struct knote *kn, long hint);
4250
4251struct filterops fs_filtops = {
4252	.f_isfd = 0,
4253	.f_attach = filt_fsattach,
4254	.f_detach = filt_fsdetach,
4255	.f_event = filt_fsevent
4256};
4257
4258static int
4259filt_fsattach(struct knote *kn)
4260{
4261
4262	kn->kn_flags |= EV_CLEAR;
4263	knlist_add(&fs_knlist, kn, 0);
4264	return (0);
4265}
4266
4267static void
4268filt_fsdetach(struct knote *kn)
4269{
4270
4271	knlist_remove(&fs_knlist, kn, 0);
4272}
4273
4274static int
4275filt_fsevent(struct knote *kn, long hint)
4276{
4277
4278	kn->kn_fflags |= hint;
4279	return (kn->kn_fflags != 0);
4280}
4281
4282static int
4283sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
4284{
4285	struct vfsidctl vc;
4286	int error;
4287	struct mount *mp;
4288
4289	error = SYSCTL_IN(req, &vc, sizeof(vc));
4290	if (error)
4291		return (error);
4292	if (vc.vc_vers != VFS_CTL_VERS1)
4293		return (EINVAL);
4294	mp = vfs_getvfs(&vc.vc_fsid);
4295	if (mp == NULL)
4296		return (ENOENT);
4297	/* ensure that a specific sysctl goes to the right filesystem. */
4298	if (strcmp(vc.vc_fstypename, "*") != 0 &&
4299	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
4300		vfs_rel(mp);
4301		return (EINVAL);
4302	}
4303	VCTLTOREQ(&vc, req);
4304	error = VFS_SYSCTL(mp, vc.vc_op, req);
4305	vfs_rel(mp);
4306	return (error);
4307}
4308
4309SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
4310    NULL, 0, sysctl_vfs_ctl, "",
4311    "Sysctl by fsid");
4312
4313/*
4314 * Function to initialize a va_filerev field sensibly.
4315 * XXX: Wouldn't a random number make a lot more sense ??
4316 */
4317u_quad_t
4318init_va_filerev(void)
4319{
4320	struct bintime bt;
4321
4322	getbinuptime(&bt);
4323	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
4324}
4325
4326static int	filt_vfsread(struct knote *kn, long hint);
4327static int	filt_vfswrite(struct knote *kn, long hint);
4328static int	filt_vfsvnode(struct knote *kn, long hint);
4329static void	filt_vfsdetach(struct knote *kn);
4330static struct filterops vfsread_filtops = {
4331	.f_isfd = 1,
4332	.f_detach = filt_vfsdetach,
4333	.f_event = filt_vfsread
4334};
4335static struct filterops vfswrite_filtops = {
4336	.f_isfd = 1,
4337	.f_detach = filt_vfsdetach,
4338	.f_event = filt_vfswrite
4339};
4340static struct filterops vfsvnode_filtops = {
4341	.f_isfd = 1,
4342	.f_detach = filt_vfsdetach,
4343	.f_event = filt_vfsvnode
4344};
4345
4346static void
4347vfs_knllock(void *arg)
4348{
4349	struct vnode *vp = arg;
4350
4351	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4352}
4353
4354static void
4355vfs_knlunlock(void *arg)
4356{
4357	struct vnode *vp = arg;
4358
4359	VOP_UNLOCK(vp, 0);
4360}
4361
4362static void
4363vfs_knl_assert_locked(void *arg)
4364{
4365#ifdef DEBUG_VFS_LOCKS
4366	struct vnode *vp = arg;
4367
4368	ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
4369#endif
4370}
4371
4372static void
4373vfs_knl_assert_unlocked(void *arg)
4374{
4375#ifdef DEBUG_VFS_LOCKS
4376	struct vnode *vp = arg;
4377
4378	ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
4379#endif
4380}
4381
4382int
4383vfs_kqfilter(struct vop_kqfilter_args *ap)
4384{
4385	struct vnode *vp = ap->a_vp;
4386	struct knote *kn = ap->a_kn;
4387	struct knlist *knl;
4388
4389	switch (kn->kn_filter) {
4390	case EVFILT_READ:
4391		kn->kn_fop = &vfsread_filtops;
4392		break;
4393	case EVFILT_WRITE:
4394		kn->kn_fop = &vfswrite_filtops;
4395		break;
4396	case EVFILT_VNODE:
4397		kn->kn_fop = &vfsvnode_filtops;
4398		break;
4399	default:
4400		return (EINVAL);
4401	}
4402
4403	kn->kn_hook = (caddr_t)vp;
4404
4405	v_addpollinfo(vp);
4406	if (vp->v_pollinfo == NULL)
4407		return (ENOMEM);
4408	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
4409	vhold(vp);
4410	knlist_add(knl, kn, 0);
4411
4412	return (0);
4413}
4414
4415/*
4416 * Detach knote from vnode
4417 */
4418static void
4419filt_vfsdetach(struct knote *kn)
4420{
4421	struct vnode *vp = (struct vnode *)kn->kn_hook;
4422
4423	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
4424	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
4425	vdrop(vp);
4426}
4427
4428/*ARGSUSED*/
4429static int
4430filt_vfsread(struct knote *kn, long hint)
4431{
4432	struct vnode *vp = (struct vnode *)kn->kn_hook;
4433	struct vattr va;
4434	int res;
4435
4436	/*
4437	 * filesystem is gone, so set the EOF flag and schedule
4438	 * the knote for deletion.
4439	 */
4440	if (hint == NOTE_REVOKE) {
4441		VI_LOCK(vp);
4442		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4443		VI_UNLOCK(vp);
4444		return (1);
4445	}
4446
4447	if (VOP_GETATTR(vp, &va, curthread->td_ucred))
4448		return (0);
4449
4450	VI_LOCK(vp);
4451	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
4452	res = (kn->kn_data != 0);
4453	VI_UNLOCK(vp);
4454	return (res);
4455}
4456
4457/*ARGSUSED*/
4458static int
4459filt_vfswrite(struct knote *kn, long hint)
4460{
4461	struct vnode *vp = (struct vnode *)kn->kn_hook;
4462
4463	VI_LOCK(vp);
4464
4465	/*
4466	 * filesystem is gone, so set the EOF flag and schedule
4467	 * the knote for deletion.
4468	 */
4469	if (hint == NOTE_REVOKE)
4470		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4471
4472	kn->kn_data = 0;
4473	VI_UNLOCK(vp);
4474	return (1);
4475}
4476
4477static int
4478filt_vfsvnode(struct knote *kn, long hint)
4479{
4480	struct vnode *vp = (struct vnode *)kn->kn_hook;
4481	int res;
4482
4483	VI_LOCK(vp);
4484	if (kn->kn_sfflags & hint)
4485		kn->kn_fflags |= hint;
4486	if (hint == NOTE_REVOKE) {
4487		kn->kn_flags |= EV_EOF;
4488		VI_UNLOCK(vp);
4489		return (1);
4490	}
4491	res = (kn->kn_fflags != 0);
4492	VI_UNLOCK(vp);
4493	return (res);
4494}
4495
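/*
 * Helper for VOP_READDIR implementations: copy one directory entry into
 * the caller's uio and, when cookies were requested, grow the cookie
 * array by one slot and record "off" (the offset of the next entry).
 * Fails with ENAMETOOLONG when the entry does not fit in the space
 * remaining in the uio.
 */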
4496int
4497vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
4498{
4499	int error;
4500
4501	if (dp->d_reclen > ap->a_uio->uio_resid)
4502		return (ENAMETOOLONG);
4503	error = uiomove(dp, dp->d_reclen, ap->a_uio);
4504	if (error) {
4505		if (ap->a_ncookies != NULL) {
4506			if (ap->a_cookies != NULL)
4507				free(ap->a_cookies, M_TEMP);
4508			ap->a_cookies = NULL;
4509			*ap->a_ncookies = 0;
4510		}
4511		return (error);
4512	}
4513	if (ap->a_ncookies == NULL)
4514		return (0);
4515
4516	KASSERT(ap->a_cookies,
4517	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
4518
4519	*ap->a_cookies = realloc(*ap->a_cookies,
4520	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
4521	(*ap->a_cookies)[*ap->a_ncookies] = off;
4522	return (0);
4523}
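
/*
 * Usage sketch: a hypothetical VOP_READDIR implementation emitting one
 * synthetic "." entry through vfs_read_dirent(); the fileno and name
 * are illustrative only.
 */
static int
examplefs_emit_dot(struct vop_readdir_args *ap, off_t next_off)
{
	struct dirent de;

	bzero(&de, sizeof(de));
	de.d_fileno = 2;		/* illustrative inode number */
	de.d_type = DT_DIR;
	de.d_namlen = 1;
	de.d_name[0] = '.';
	de.d_name[1] = '\0';
	de.d_reclen = GENERIC_DIRSIZ(&de);
	return (vfs_read_dirent(ap, &de, next_off));
}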
4524
4525/*
4526 * Mark the access time of the file for update if the filesystem
4527 * supports VOP_MARKATIME.  This functionality is used by execve and
4528 * mmap, so we want to avoid the I/O implied by directly setting
4529 * va_atime for the sake of efficiency.
4530 */
4531void
4532vfs_mark_atime(struct vnode *vp, struct ucred *cred)
4533{
4534	struct mount *mp;
4535
4536	mp = vp->v_mount;
4537	ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
4538	if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
4539		(void)VOP_MARKATIME(vp);
4540}
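
/*
 * Usage sketch: callers such as the mmap and execve paths request a
 * deferred access-time update while holding the vnode lock; a shared
 * lock is sufficient for the assertion above.
 */
static void
example_note_access(struct vnode *vp, struct ucred *cred)
{

	(void)vn_lock(vp, LK_SHARED | LK_RETRY);
	vfs_mark_atime(vp, cred);
	VOP_UNLOCK(vp, 0);
}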
4541
4542/*
4543 * The purpose of this routine is to remove granularity from accmode_t,
4544 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
4545 * VADMIN and VAPPEND.
4546 *
4547 * If it returns 0, the caller is supposed to continue with the usual
4548 * access checks using 'accmode' as modified by this routine.  If it
4549 * returns nonzero value, the caller is supposed to return that value
4550 * as errno.
4551 *
4552 * Note that after this routine runs, accmode may be zero.
4553 */
4554int
4555vfs_unixify_accmode(accmode_t *accmode)
4556{
4557	/*
4558	 * There is no way to specify an explicit "deny" rule using
4559	 * file mode or POSIX.1e ACLs.
4560	 */
4561	if (*accmode & VEXPLICIT_DENY) {
4562		*accmode = 0;
4563		return (0);
4564	}
4565
4566	/*
4567	 * None of these can be translated into usual access bits.
4568	 * Also, the common case for NFSv4 ACLs is to not contain
4569	 * either of these bits. Caller should check for VWRITE
4570	 * on the containing directory instead.
4571	 */
4572	if (*accmode & (VDELETE_CHILD | VDELETE))
4573		return (EPERM);
4574
4575	if (*accmode & VADMIN_PERMS) {
4576		*accmode &= ~VADMIN_PERMS;
4577		*accmode |= VADMIN;
4578	}
4579
4580	/*
4581	 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
4582	 * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
4583	 */
4584	*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
4585
4586	return (0);
4587}
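
/*
 * Usage sketch of the contract above: reduce the fine-grained accmode
 * first, return any mandated error, then run the usual UNIX check on
 * whatever bits remain (possibly none).
 */
static int
example_check_access(struct vnode *vp, accmode_t accmode, mode_t mode,
    uid_t uid, gid_t gid, struct ucred *cred)
{
	int error;

	error = vfs_unixify_accmode(&accmode);
	if (error != 0)
		return (error);
	if (accmode == 0)
		return (0);
	return (vaccess(vp->v_type, mode, uid, gid, accmode, cred, NULL));
}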
4588
4589/*
4590 * These are helper functions for filesystems to traverse all
4591 * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
4592 *
4593 * This interface replaces MNT_VNODE_FOREACH.
4594 */
4595
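/*
 * Usage sketch: filesystems normally consume these helpers through the
 * MNT_VNODE_FOREACH_ALL() macro in sys/mount.h, which hands back each
 * vnode with its interlock held; the loop body must release that
 * interlock (directly or via vget(..., LK_INTERLOCK, ...)) before the
 * next iteration.
 */
static void
example_walk_all_vnodes(struct mount *mp)
{
	struct vnode *vp, *mvp;

	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
		if (vp->v_type != VREG) {
			VI_UNLOCK(vp);
			continue;
		}
		/* ... inspect or vget() the vnode here ... */
		VI_UNLOCK(vp);
	}
}
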
4596MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
4597
4598struct vnode *
4599__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
4600{
4601	struct vnode *vp;
4602
4603	if (should_yield())
4604		kern_yield(PRI_USER);
4605	MNT_ILOCK(mp);
4606	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4607	vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
4608	while (vp != NULL && (vp->v_type == VMARKER ||
4609	    (vp->v_iflag & VI_DOOMED) != 0))
4610		vp = TAILQ_NEXT(vp, v_nmntvnodes);
4611
4612	/* Check if we are done */
4613	if (vp == NULL) {
4614		__mnt_vnode_markerfree_all(mvp, mp);
4615		/* MNT_IUNLOCK(mp); -- done in above function */
4616		mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
4617		return (NULL);
4618	}
4619	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4620	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4621	VI_LOCK(vp);
4622	MNT_IUNLOCK(mp);
4623	return (vp);
4624}
4625
4626struct vnode *
4627__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
4628{
4629	struct vnode *vp;
4630
4631	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
4632	MNT_ILOCK(mp);
4633	MNT_REF(mp);
4634	(*mvp)->v_type = VMARKER;
4635
4636	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
4637	while (vp != NULL && (vp->v_type == VMARKER ||
4638	    (vp->v_iflag & VI_DOOMED) != 0))
4639		vp = TAILQ_NEXT(vp, v_nmntvnodes);
4640
4641	/* Check if we are done */
4642	if (vp == NULL) {
4643		MNT_REL(mp);
4644		MNT_IUNLOCK(mp);
4645		free(*mvp, M_VNODE_MARKER);
4646		*mvp = NULL;
4647		return (NULL);
4648	}
4649	(*mvp)->v_mount = mp;
4650	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4651	VI_LOCK(vp);
4652	MNT_IUNLOCK(mp);
4653	return (vp);
4654}
4655
4656
4657void
4658__mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
4659{
4660
4661	if (*mvp == NULL) {
4662		MNT_IUNLOCK(mp);
4663		return;
4664	}
4665
4666	mtx_assert(MNT_MTX(mp), MA_OWNED);
4667
4668	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4669	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4670	MNT_REL(mp);
4671	MNT_IUNLOCK(mp);
4672	free(*mvp, M_VNODE_MARKER);
4673	*mvp = NULL;
4674}
4675
4676/*
4677 * These are helper functions for filesystems to traverse their
4678 * active vnodes.  See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
4679 */
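
/*
 * Usage sketch: the MNT_VNODE_FOREACH_ACTIVE() macro in sys/mount.h
 * wraps these helpers and visits only vnodes on the mount's active
 * list, again returning each one with its interlock held.
 */
static int
example_count_active(struct mount *mp)
{
	struct vnode *vp, *mvp;
	int count;

	count = 0;
	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
		count++;
		VI_UNLOCK(vp);
	}
	return (count);
}
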
4680static void
4681mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
4682{
4683
4684	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4685
4686	MNT_ILOCK(mp);
4687	MNT_REL(mp);
4688	MNT_IUNLOCK(mp);
4689	free(*mvp, M_VNODE_MARKER);
4690	*mvp = NULL;
4691}
4692
4693static struct vnode *
4694mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
4695{
4696	struct vnode *vp, *nvp;
4697
4698	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
4699	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4700restart:
4701	vp = TAILQ_NEXT(*mvp, v_actfreelist);
4702	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
4703	while (vp != NULL) {
4704		if (vp->v_type == VMARKER) {
4705			vp = TAILQ_NEXT(vp, v_actfreelist);
4706			continue;
4707		}
4708		if (!VI_TRYLOCK(vp)) {
4709			if (mp_ncpus == 1 || should_yield()) {
4710				TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
4711				mtx_unlock(&vnode_free_list_mtx);
4712				pause("vnacti", 1);
4713				mtx_lock(&vnode_free_list_mtx);
4714				goto restart;
4715			}
4716			continue;
4717		}
4718		KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
4719		KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
4720		    ("alien vnode on the active list %p %p", vp, mp));
4721		if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0)
4722			break;
4723		nvp = TAILQ_NEXT(vp, v_actfreelist);
4724		VI_UNLOCK(vp);
4725		vp = nvp;
4726	}
4727
4728	/* Check if we are done */
4729	if (vp == NULL) {
4730		mtx_unlock(&vnode_free_list_mtx);
4731		mnt_vnode_markerfree_active(mvp, mp);
4732		return (NULL);
4733	}
4734	TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
4735	mtx_unlock(&vnode_free_list_mtx);
4736	ASSERT_VI_LOCKED(vp, "active iter");
4737	KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
4738	return (vp);
4739}
4740
4741struct vnode *
4742__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
4743{
4744
4745	if (should_yield())
4746		kern_yield(PRI_USER);
4747	mtx_lock(&vnode_free_list_mtx);
4748	return (mnt_vnode_next_active(mvp, mp));
4749}
4750
4751struct vnode *
4752__mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
4753{
4754	struct vnode *vp;
4755
4756	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
4757	MNT_ILOCK(mp);
4758	MNT_REF(mp);
4759	MNT_IUNLOCK(mp);
4760	(*mvp)->v_type = VMARKER;
4761	(*mvp)->v_mount = mp;
4762
4763	mtx_lock(&vnode_free_list_mtx);
4764	vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
4765	if (vp == NULL) {
4766		mtx_unlock(&vnode_free_list_mtx);
4767		mnt_vnode_markerfree_active(mvp, mp);
4768		return (NULL);
4769	}
4770	TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
4771	return (mnt_vnode_next_active(mvp, mp));
4772}
4773
4774void
4775__mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
4776{
4777
4778	if (*mvp == NULL)
4779		return;
4780
4781	mtx_lock(&vnode_free_list_mtx);
4782	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
4783	mtx_unlock(&vnode_free_list_mtx);
4784	mnt_vnode_markerfree_active(mvp, mp);
4785}
4786