1/*-
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
35 */
36
37/*
38 * External virtual filesystem routines
39 */
40
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD: head/sys/kern/vfs_subr.c 223677 2011-06-29 16:40:41Z alc $");
43
44#include "opt_ddb.h"
45#include "opt_watchdog.h"
46
47#include <sys/param.h>
48#include <sys/systm.h>
49#include <sys/bio.h>
50#include <sys/buf.h>
51#include <sys/condvar.h>
52#include <sys/conf.h>
53#include <sys/dirent.h>
54#include <sys/event.h>
55#include <sys/eventhandler.h>
56#include <sys/extattr.h>
57#include <sys/file.h>
58#include <sys/fcntl.h>
59#include <sys/jail.h>
60#include <sys/kdb.h>
61#include <sys/kernel.h>
62#include <sys/kthread.h>
63#include <sys/lockf.h>
64#include <sys/malloc.h>
65#include <sys/mount.h>
66#include <sys/namei.h>
67#include <sys/priv.h>
68#include <sys/reboot.h>
69#include <sys/sched.h>
70#include <sys/sleepqueue.h>
71#include <sys/stat.h>
72#include <sys/sysctl.h>
73#include <sys/syslog.h>
74#include <sys/vmmeter.h>
75#include <sys/vnode.h>
76#ifdef SW_WATCHDOG
77#include <sys/watchdog.h>
78#endif
79
80#include <machine/stdarg.h>
81
82#include <security/mac/mac_framework.h>
83
84#include <vm/vm.h>
85#include <vm/vm_object.h>
86#include <vm/vm_extern.h>
87#include <vm/pmap.h>
88#include <vm/vm_map.h>
89#include <vm/vm_page.h>
90#include <vm/vm_kern.h>
91#include <vm/uma.h>
92
93#ifdef DDB
94#include <ddb/ddb.h>
95#endif
96
97#define	WI_MPSAFEQ	0
98#define	WI_GIANTQ	1
99
100static void	delmntque(struct vnode *vp);
101static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
102		    int slpflag, int slptimeo);
103static void	syncer_shutdown(void *arg, int howto);
104static int	vtryrecycle(struct vnode *vp);
105static void	vbusy(struct vnode *vp);
106static void	vinactive(struct vnode *, struct thread *);
107static void	v_incr_usecount(struct vnode *);
108static void	v_decr_usecount(struct vnode *);
109static void	v_decr_useonly(struct vnode *);
110static void	v_upgrade_usecount(struct vnode *);
111static void	vfree(struct vnode *);
112static void	vnlru_free(int);
113static void	vgonel(struct vnode *);
114static void	vfs_knllock(void *arg);
115static void	vfs_knlunlock(void *arg);
116static void	vfs_knl_assert_locked(void *arg);
117static void	vfs_knl_assert_unlocked(void *arg);
118static void	destroy_vpollinfo(struct vpollinfo *vi);
119
120/*
121 * Number of vnodes in existence.  Increased whenever getnewvnode()
122 * allocates a new vnode, decreased when vdestroy() is called on a
123 * VI_DOOMED vnode.
124 */
125static unsigned long	numvnodes;
126
127SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
128    "Number of vnodes in existence");
129
130/*
131 * Conversion tables for conversion from vnode types to inode formats
132 * and back.
133 */
134enum vtype iftovt_tab[16] = {
135	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
136	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
137};
138int vttoif_tab[10] = {
139	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
140	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
141};
142
143/*
144 * List of vnodes that are ready for recycling.
145 */
146static TAILQ_HEAD(freelst, vnode) vnode_free_list;
147
148/*
149 * Free vnode target.  Free vnodes may simply be files which have been stat'd
150 * but not read.  This is somewhat common, and a small cache of such files
151 * should be kept to avoid recreation costs.
152 */
153static u_long wantfreevnodes;
154SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
155/* Number of vnodes in the free list. */
156static u_long freevnodes;
157SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0,
158    "Number of vnodes in the free list");
159
160static int vlru_allow_cache_src;
161SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
162    &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");
163
164/*
165 * Various variables used for debugging the new implementation of
166 * reassignbuf().
167 * XXX these are probably of (very) limited utility now.
168 */
169static int reassignbufcalls;
170SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
171    "Number of calls to reassignbuf");
172
173/*
174 * Cache for the mount type id assigned to NFS.  This is used for
175 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
176 */
177int	nfs_mount_type = -1;
178
179/* To keep more than one thread at a time from running vfs_getnewfsid */
180static struct mtx mntid_mtx;
181
182/*
183 * Lock for any access to the following:
184 *	vnode_free_list
185 *	numvnodes
186 *	freevnodes
187 */
188static struct mtx vnode_free_list_mtx;
189
190/* Publicly exported FS */
191struct nfs_public nfs_pub;
192
193/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
194static uma_zone_t vnode_zone;
195static uma_zone_t vnodepoll_zone;
196
197/*
198 * The workitem queue.
199 *
200 * It is useful to delay writes of file data and filesystem metadata
201 * for tens of seconds so that quickly created and deleted files need
202 * not waste disk bandwidth being created and removed. To realize this,
203 * we append vnodes to a "workitem" queue. When running with a soft
204 * updates implementation, most pending metadata dependencies should
205 * not wait for more than a few seconds. Thus, metadata written to block
206 * devices is delayed only about half the time that file data is delayed.
207 * Similarly, directory updates are more critical, so they are delayed only
208 * about a third of the time that file data is delayed. Thus, there are
209 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
210 * one each second (driven off the filesystem syncer process). The
211 * syncer_delayno variable indicates the next queue that is to be processed.
212 * Items that need to be processed soon are placed in this queue:
213 *
214 *	syncer_workitem_pending[syncer_delayno]
215 *
216 * A delay of fifteen seconds is done by placing the request fifteen
217 * entries later in the queue:
218 *
219 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
220 *
221 */
222static int syncer_delayno;
223static long syncer_mask;
224LIST_HEAD(synclist, bufobj);
225static struct synclist *syncer_workitem_pending[2];
226/*
227 * The sync_mtx protects:
228 *	bo->bo_synclist
229 *	sync_vnode_count
230 *	syncer_delayno
231 *	syncer_state
232 *	syncer_workitem_pending
233 *	syncer_worklist_len
234 *	rushjob
235 */
236static struct mtx sync_mtx;
237static struct cv sync_wakeup;
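
/*
 * A minimal sketch of the slot arithmetic described above (an example
 * only; nothing calls it): a bufobj that should be processed "delay"
 * seconds from now is filed "delay" slots past the slot currently
 * being drained.
 */
static __unused struct synclist *
example_syncer_slot(int delay)
{

	mtx_assert(&sync_mtx, MA_OWNED);
	return (&syncer_workitem_pending[WI_MPSAFEQ]
	    [(syncer_delayno + delay) & syncer_mask]);
}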
238
239#define SYNCER_MAXDELAY		32
240static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
241static int syncdelay = 30;		/* max time to delay syncing data */
242static int filedelay = 30;		/* time to delay syncing files */
243SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
244    "Time to delay syncing files (in seconds)");
245static int dirdelay = 29;		/* time to delay syncing directories */
246SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
247    "Time to delay syncing directories (in seconds)");
248static int metadelay = 28;		/* time to delay syncing metadata */
249SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
250    "Time to delay syncing metadata (in seconds)");
251static int rushjob;		/* number of slots to run ASAP */
252static int stat_rush_requests;	/* number of times I/O speeded up */
253SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
254    "Number of times I/O speeded up (rush requests)");
255
256/*
257 * When shutting down the syncer, run it at four times normal speed.
258 */
259#define SYNCER_SHUTDOWN_SPEEDUP		4
260static int sync_vnode_count;
261static int syncer_worklist_len;
262static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
263    syncer_state;
264
265/*
266 * Number of vnodes we want to exist at any one time.  This is mostly used
267 * to size hash tables in vnode-related code.  It is normally not used in
268 * getnewvnode(), as wantfreevnodes is normally nonzero.
269 *
270 * XXX desiredvnodes is historical cruft and should not exist.
271 */
272int desiredvnodes;
273SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
274    &desiredvnodes, 0, "Maximum number of vnodes");
275SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
276    &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
277static int vnlru_nowhere;
278SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
279    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
280
281/*
282 * Macros to control when a vnode is freed and recycled.  All require
283 * the vnode interlock.
284 */
285#define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
286#define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
287#define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)
288
289
290/*
291 * Initialize the vnode management data structures.
292 *
293 * Reevaluate the following cap on the number of vnodes after the physical
294 * memory size exceeds 512GB.  In the limit, as the physical memory size
295 * grows, the ratio of physical pages to vnodes approaches sixteen to one.
296 */
297#ifndef	MAXVNODES_MAX
298#define	MAXVNODES_MAX	(512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
299#endif
300static void
301vntblinit(void *dummy __unused)
302{
303	int physvnodes, virtvnodes;
304
305	/*
306	 * Desiredvnodes is a function of the physical memory size and the
307	 * kernel's heap size.  Generally speaking, it scales with the
308	 * physical memory size.  The ratio of desiredvnodes to physical pages
309	 * is one to four until desiredvnodes exceeds 98,304.  Thereafter, the
310	 * marginal ratio of desiredvnodes to physical pages is one to
311	 * sixteen.  However, desiredvnodes is limited by the kernel's heap
312	 * size.  The memory required by desiredvnodes vnodes and vm objects
313	 * may not exceed one seventh of the kernel's heap size.
314	 */
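	/*
	 * For instance, assuming 4KB pages and ignoring the virtvnodes
	 * clamp, a machine with 1GB of physical memory (262144 pages)
	 * gets roughly maxproc + 262144 / 4 = maxproc + 65536 vnodes
	 * (the 1:4 region), while a 16GB machine (4194304 pages) gets
	 * roughly maxproc + 4194304 / 16 + 73728 = maxproc + 335872
	 * vnodes, reflecting the 1:16 marginal ratio.
	 */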
315	physvnodes = maxproc + cnt.v_page_count / 16 + 3 * min(98304 * 4,
316	    cnt.v_page_count) / 16;
317	virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
318	    sizeof(struct vnode)));
319	desiredvnodes = min(physvnodes, virtvnodes);
320	if (desiredvnodes > MAXVNODES_MAX) {
321		if (bootverbose)
322			printf("Reducing kern.maxvnodes %d -> %d\n",
323			    desiredvnodes, MAXVNODES_MAX);
324		desiredvnodes = MAXVNODES_MAX;
325	}
326	wantfreevnodes = desiredvnodes / 4;
327	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
328	TAILQ_INIT(&vnode_free_list);
329	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
330	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
331	    NULL, NULL, UMA_ALIGN_PTR, 0);
332	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
333	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
334	/*
335	 * Initialize the filesystem syncer.
336	 */
337	syncer_workitem_pending[WI_MPSAFEQ] = hashinit(syncer_maxdelay, M_VNODE,
338	    &syncer_mask);
339	syncer_workitem_pending[WI_GIANTQ] = hashinit(syncer_maxdelay, M_VNODE,
340	    &syncer_mask);
341	syncer_maxdelay = syncer_mask + 1;
342	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
343	cv_init(&sync_wakeup, "syncer");
344}
345SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
346
347
348/*
349 * Mark a mount point as busy. Used to synchronize access and to delay
350 * unmounting. Note that mountlist_mtx is not released on failure.
351 */
352int
353vfs_busy(struct mount *mp, int flags)
354{
355
356	MPASS((flags & ~MBF_MASK) == 0);
357	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
358
359	MNT_ILOCK(mp);
360	MNT_REF(mp);
361	/*
362	 * If the mount point is currently being unmounted, sleep until the
363	 * mount point's fate is decided.  If the thread doing the unmounting
364	 * fails, it will clear the MNTK_UNMOUNT flag before waking us up,
365	 * indicating that this mount point has survived the unmount attempt
366	 * and vfs_busy should retry.  Otherwise the unmounting thread will set
367	 * the MNTK_REFEXPIRE flag in addition to MNTK_UNMOUNT, indicating that
368	 * the mount point is about to be really destroyed.  vfs_busy needs to
369	 * release its reference on the mount point in this case and return
370	 * with ENOENT, telling the caller that the mount point it tried to
371	 * busy is no longer valid.
372	 */
373	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
374		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
375			MNT_REL(mp);
376			MNT_IUNLOCK(mp);
377			CTR1(KTR_VFS, "%s: failed busying before sleeping",
378			    __func__);
379			return (ENOENT);
380		}
381		if (flags & MBF_MNTLSTLOCK)
382			mtx_unlock(&mountlist_mtx);
383		mp->mnt_kern_flag |= MNTK_MWAIT;
384		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
385		if (flags & MBF_MNTLSTLOCK)
386			mtx_lock(&mountlist_mtx);
387		MNT_ILOCK(mp);
388	}
389	if (flags & MBF_MNTLSTLOCK)
390		mtx_unlock(&mountlist_mtx);
391	mp->mnt_lockref++;
392	MNT_IUNLOCK(mp);
393	return (0);
394}
395
396/*
397 * Free a busy filesystem.
398 */
399void
400vfs_unbusy(struct mount *mp)
401{
402
403	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
404	MNT_ILOCK(mp);
405	MNT_REL(mp);
406	KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
407	mp->mnt_lockref--;
408	if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
409		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
410		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
411		mp->mnt_kern_flag &= ~MNTK_DRAINING;
412		wakeup(&mp->mnt_lockref);
413	}
414	MNT_IUNLOCK(mp);
415}
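
/*
 * A sketch of the canonical vfs_busy()/vfs_unbusy() pattern (the helper
 * and its callback argument are illustrative only): walk the mount list,
 * busying each mount point so that it cannot be unmounted while it is
 * being examined.  This mirrors the loop in vnlru_proc() below.
 */
static __unused void
example_walk_mounts(void (*visit)(struct mount *))
{
	struct mount *mp, *nmp;

	mtx_lock(&mountlist_mtx);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		/* mountlist_mtx was dropped by vfs_busy() on success. */
		visit(mp);
		mtx_lock(&mountlist_mtx);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	mtx_unlock(&mountlist_mtx);
}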
416
417/*
418 * Lookup a mount point by filesystem identifier.
419 */
420struct mount *
421vfs_getvfs(fsid_t *fsid)
422{
423	struct mount *mp;
424
425	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
426	mtx_lock(&mountlist_mtx);
427	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
428		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
429		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
430			vfs_ref(mp);
431			mtx_unlock(&mountlist_mtx);
432			return (mp);
433		}
434	}
435	mtx_unlock(&mountlist_mtx);
436	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
437	return ((struct mount *) 0);
438}
439
440/*
441 * Lookup a mount point by filesystem identifier, busying it before
442 * returning.
443 */
444struct mount *
445vfs_busyfs(fsid_t *fsid)
446{
447	struct mount *mp;
448	int error;
449
450	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
451	mtx_lock(&mountlist_mtx);
452	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
453		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
454		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
455			error = vfs_busy(mp, MBF_MNTLSTLOCK);
456			if (error) {
457				mtx_unlock(&mountlist_mtx);
458				return (NULL);
459			}
460			return (mp);
461		}
462	}
463	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
464	mtx_unlock(&mountlist_mtx);
465	return ((struct mount *) 0);
466}
467
468/*
469 * Check if a user can access privileged mount options.
470 */
471int
472vfs_suser(struct mount *mp, struct thread *td)
473{
474	int error;
475
476	/*
477	 * If the thread is jailed, but this is not a jail-friendly file
478	 * system, deny immediately.
479	 */
480	if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
481		return (EPERM);
482
483	/*
484	 * If the file system was mounted outside the jail of the calling
485	 * thread, deny immediately.
486	 */
487	if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
488		return (EPERM);
489
490	/*
491	 * If file system supports delegated administration, we don't check
492	 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
493	 * by the file system itself.
494	 * If this is not the user that did the original mount, we check for
495	 * the PRIV_VFS_MOUNT_OWNER privilege.
496	 */
497	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
498	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
499		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
500			return (error);
501	}
502	return (0);
503}
504
505/*
506 * Get a new unique fsid.  Try to make its val[0] unique, since this value
507 * will be used to create fake device numbers for stat().  Also try (but
508 * not so hard) make its val[0] unique mod 2^16, since some emulators only
509 * support 16-bit device numbers.  We end up with unique val[0]'s for the
510 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
511 *
512 * Keep in mind that several mounts may be running in parallel.  Starting
513 * the search one past where the previous search terminated is both a
514 * micro-optimization and a defense against returning the same fsid to
515 * different mounts.
516 */
517void
518vfs_getnewfsid(struct mount *mp)
519{
520	static uint16_t mntid_base;
521	struct mount *nmp;
522	fsid_t tfsid;
523	int mtype;
524
525	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
526	mtx_lock(&mntid_mtx);
527	mtype = mp->mnt_vfc->vfc_typenum;
528	tfsid.val[1] = mtype;
529	mtype = (mtype & 0xFF) << 24;
530	for (;;) {
531		tfsid.val[0] = makedev(255,
532		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
533		mntid_base++;
534		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
535			break;
536		vfs_rel(nmp);
537	}
538	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
539	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
540	mtx_unlock(&mntid_mtx);
541}
542
543/*
544 * Knob to control the precision of file timestamps:
545 *
546 *   0 = seconds only; nanoseconds zeroed.
547 *   1 = seconds and nanoseconds, accurate within 1/HZ.
548 *   2 = seconds and nanoseconds, truncated to microseconds.
549 * >=3 = seconds and nanoseconds, maximum precision.
550 */
551enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
552
553static int timestamp_precision = TSP_SEC;
554SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
555    &timestamp_precision, 0, "File timestamp precision (0: seconds, "
556    "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
557    "3+: sec + ns (max. precision))");
558
559/*
560 * Get a current timestamp.
561 */
562void
563vfs_timestamp(struct timespec *tsp)
564{
565	struct timeval tv;
566
567	switch (timestamp_precision) {
568	case TSP_SEC:
569		tsp->tv_sec = time_second;
570		tsp->tv_nsec = 0;
571		break;
572	case TSP_HZ:
573		getnanotime(tsp);
574		break;
575	case TSP_USEC:
576		microtime(&tv);
577		TIMEVAL_TO_TIMESPEC(&tv, tsp);
578		break;
579	case TSP_NSEC:
580	default:
581		nanotime(tsp);
582		break;
583	}
584}
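
/*
 * A sketch of typical use (the vattr argument stands in for whatever
 * in-core structure a filesystem actually stamps): record the current
 * time, at the precision selected above, as the modification and
 * change times.
 */
static __unused void
example_mark_modified(struct vattr *vap)
{
	struct timespec ts;

	vfs_timestamp(&ts);
	vap->va_mtime = ts;
	vap->va_ctime = ts;
}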
585
586/*
587 * Set vnode attributes to VNOVAL
588 */
589void
590vattr_null(struct vattr *vap)
591{
592
593	vap->va_type = VNON;
594	vap->va_size = VNOVAL;
595	vap->va_bytes = VNOVAL;
596	vap->va_mode = VNOVAL;
597	vap->va_nlink = VNOVAL;
598	vap->va_uid = VNOVAL;
599	vap->va_gid = VNOVAL;
600	vap->va_fsid = VNOVAL;
601	vap->va_fileid = VNOVAL;
602	vap->va_blocksize = VNOVAL;
603	vap->va_rdev = VNOVAL;
604	vap->va_atime.tv_sec = VNOVAL;
605	vap->va_atime.tv_nsec = VNOVAL;
606	vap->va_mtime.tv_sec = VNOVAL;
607	vap->va_mtime.tv_nsec = VNOVAL;
608	vap->va_ctime.tv_sec = VNOVAL;
609	vap->va_ctime.tv_nsec = VNOVAL;
610	vap->va_birthtime.tv_sec = VNOVAL;
611	vap->va_birthtime.tv_nsec = VNOVAL;
612	vap->va_flags = VNOVAL;
613	vap->va_gen = VNOVAL;
614	vap->va_vaflags = 0;
615}
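
/*
 * A sketch of the intended calling convention (the helper itself is an
 * example only): reset every attribute to VNOVAL and then fill in just
 * the fields being changed, here the size, before handing the vattr to
 * VOP_SETATTR() on a locked vnode.
 */
static __unused int
example_set_size(struct vnode *vp, off_t size, struct ucred *cred)
{
	struct vattr va;

	ASSERT_VOP_ELOCKED(vp, "example_set_size");
	vattr_null(&va);
	va.va_size = size;
	return (VOP_SETATTR(vp, &va, cred));
}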
616
617/*
618 * This routine is called when we have too many vnodes.  It attempts
619 * to free <count> vnodes and will potentially free vnodes that still
620 * have VM backing store (VM backing store is typically the cause
621 * of a vnode blowout so we want to do this).  Therefore, this operation
622 * is not considered cheap.
623 *
624 * A number of conditions may prevent a vnode from being reclaimed:
625 * the buffer cache may have references on the vnode, a directory
626 * vnode may still have references due to the namei cache representing
627 * underlying files, or the vnode may be in active use.  It is not
628 * desirable to reuse such vnodes.  These conditions may cause the
629 * number of vnodes to reach some minimum value regardless of what
630 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
631 */
632static int
633vlrureclaim(struct mount *mp)
634{
635	struct vnode *vp;
636	int done;
637	int trigger;
638	int usevnodes;
639	int count;
640
641	/*
642	 * Calculate the trigger point, don't allow user
643	 * screwups to blow us up.   This prevents us from
644	 * recycling vnodes with lots of resident pages.  We
645	 * aren't trying to free memory, we are trying to
646	 * free vnodes.
647	 */
648	usevnodes = desiredvnodes;
649	if (usevnodes <= 0)
650		usevnodes = 1;
651	trigger = cnt.v_page_count * 2 / usevnodes;
652	done = 0;
653	vn_start_write(NULL, &mp, V_WAIT);
654	MNT_ILOCK(mp);
655	count = mp->mnt_nvnodelistsize / 10 + 1;
656	while (count != 0) {
657		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
658		while (vp != NULL && vp->v_type == VMARKER)
659			vp = TAILQ_NEXT(vp, v_nmntvnodes);
660		if (vp == NULL)
661			break;
662		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
663		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
664		--count;
665		if (!VI_TRYLOCK(vp))
666			goto next_iter;
667		/*
668		 * If it's been deconstructed already, it's still
669		 * referenced, or it exceeds the trigger, skip it.
670		 */
671		if (vp->v_usecount ||
672		    (!vlru_allow_cache_src &&
673			!LIST_EMPTY(&(vp)->v_cache_src)) ||
674		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
675		    vp->v_object->resident_page_count > trigger)) {
676			VI_UNLOCK(vp);
677			goto next_iter;
678		}
679		MNT_IUNLOCK(mp);
680		vholdl(vp);
681		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
682			vdrop(vp);
683			goto next_iter_mntunlocked;
684		}
685		VI_LOCK(vp);
686		/*
687		 * v_usecount may have been bumped after VOP_LOCK() dropped
688		 * the vnode interlock and before it was locked again.
689		 *
690		 * It is not necessary to recheck VI_DOOMED because it can
691		 * only be set by another thread that holds both the vnode
692		 * lock and vnode interlock.  If another thread has the
693		 * vnode lock before we get to VOP_LOCK() and obtains the
694		 * vnode interlock after VOP_LOCK() drops the vnode
695		 * interlock, the other thread will be unable to drop the
696		 * vnode lock before our VOP_LOCK() call fails.
697		 */
698		if (vp->v_usecount ||
699		    (!vlru_allow_cache_src &&
700			!LIST_EMPTY(&(vp)->v_cache_src)) ||
701		    (vp->v_object != NULL &&
702		    vp->v_object->resident_page_count > trigger)) {
703			VOP_UNLOCK(vp, LK_INTERLOCK);
704			goto next_iter_mntunlocked;
705		}
706		KASSERT((vp->v_iflag & VI_DOOMED) == 0,
707		    ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
708		vgonel(vp);
709		VOP_UNLOCK(vp, 0);
710		vdropl(vp);
711		done++;
712next_iter_mntunlocked:
713		if (!should_yield())
714			goto relock_mnt;
715		goto yield;
716next_iter:
717		if (!should_yield())
718			continue;
719		MNT_IUNLOCK(mp);
720yield:
721		kern_yield(PRI_UNCHANGED);
722relock_mnt:
723		MNT_ILOCK(mp);
724	}
725	MNT_IUNLOCK(mp);
726	vn_finished_write(mp);
727	return (done);
728}
729
730/*
731 * Attempt to keep the free list at wantfreevnodes length.
732 */
733static void
734vnlru_free(int count)
735{
736	struct vnode *vp;
737	int vfslocked;
738
739	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
740	for (; count > 0; count--) {
741		vp = TAILQ_FIRST(&vnode_free_list);
742		/*
743		 * The list can be modified while the free_list_mtx
744		 * has been dropped and vp could be NULL here.
745		 */
746		if (!vp)
747			break;
748		VNASSERT(vp->v_op != NULL, vp,
749		    ("vnlru_free: vnode already reclaimed."));
750		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
751		/*
752		 * Don't recycle if we can't get the interlock.
753		 */
754		if (!VI_TRYLOCK(vp)) {
755			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
756			continue;
757		}
758		VNASSERT(VCANRECYCLE(vp), vp,
759		    ("vp inconsistent on freelist"));
760		freevnodes--;
761		vp->v_iflag &= ~VI_FREE;
762		vholdl(vp);
763		mtx_unlock(&vnode_free_list_mtx);
764		VI_UNLOCK(vp);
765		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
766		vtryrecycle(vp);
767		VFS_UNLOCK_GIANT(vfslocked);
768		 * If the recycle succeeded, this vdrop will actually free
769		 * the vnode.  If not, it will simply place it back on
770		 * the vnode.  If not it will simply place it back on
771		 * the free list.
772		 */
773		vdrop(vp);
774		mtx_lock(&vnode_free_list_mtx);
775	}
776}
777/*
778 * Attempt to recycle vnodes in a context that is always safe to block.
779 * Calling vlrureclaim() from the bowels of filesystem code has some
780 * interesting deadlock problems.
781 */
782static struct proc *vnlruproc;
783static int vnlruproc_sig;
784
785static void
786vnlru_proc(void)
787{
788	struct mount *mp, *nmp;
789	int done, vfslocked;
790	struct proc *p = vnlruproc;
791
792	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
793	    SHUTDOWN_PRI_FIRST);
794
795	for (;;) {
796		kproc_suspend_check(p);
797		mtx_lock(&vnode_free_list_mtx);
798		if (freevnodes > wantfreevnodes)
799			vnlru_free(freevnodes - wantfreevnodes);
800		if (numvnodes <= desiredvnodes * 9 / 10) {
801			vnlruproc_sig = 0;
802			wakeup(&vnlruproc_sig);
803			msleep(vnlruproc, &vnode_free_list_mtx,
804			    PVFS|PDROP, "vlruwt", hz);
805			continue;
806		}
807		mtx_unlock(&vnode_free_list_mtx);
808		done = 0;
809		mtx_lock(&mountlist_mtx);
810		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
811			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
812				nmp = TAILQ_NEXT(mp, mnt_list);
813				continue;
814			}
815			vfslocked = VFS_LOCK_GIANT(mp);
816			done += vlrureclaim(mp);
817			VFS_UNLOCK_GIANT(vfslocked);
818			mtx_lock(&mountlist_mtx);
819			nmp = TAILQ_NEXT(mp, mnt_list);
820			vfs_unbusy(mp);
821		}
822		mtx_unlock(&mountlist_mtx);
823		if (done == 0) {
824#if 0
825			/* These messages are temporary debugging aids */
826			if (vnlru_nowhere < 5)
827				printf("vnlru process getting nowhere..\n");
828			else if (vnlru_nowhere == 5)
829				printf("vnlru process messages stopped.\n");
830#endif
831			vnlru_nowhere++;
832			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
833		} else
834			kern_yield(PRI_UNCHANGED);
835	}
836}
837
838static struct kproc_desc vnlru_kp = {
839	"vnlru",
840	vnlru_proc,
841	&vnlruproc
842};
843SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
844    &vnlru_kp);
845
846/*
847 * Routines having to do with the management of the vnode table.
848 */
849
850void
851vdestroy(struct vnode *vp)
852{
853	struct bufobj *bo;
854
855	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
856	mtx_lock(&vnode_free_list_mtx);
857	numvnodes--;
858	mtx_unlock(&vnode_free_list_mtx);
859	bo = &vp->v_bufobj;
860	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
861	    ("cleaned vnode still on the free list."));
862	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
863	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
864	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
865	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
866	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
867	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
868	VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL"));
869	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
870	VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL"));
871	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
872	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
873	VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
874	VI_UNLOCK(vp);
875#ifdef MAC
876	mac_vnode_destroy(vp);
877#endif
878	if (vp->v_pollinfo != NULL)
879		destroy_vpollinfo(vp->v_pollinfo);
880#ifdef INVARIANTS
881	/* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */
882	vp->v_op = NULL;
883#endif
884	lockdestroy(vp->v_vnlock);
885	mtx_destroy(&vp->v_interlock);
886	mtx_destroy(BO_MTX(bo));
887	uma_zfree(vnode_zone, vp);
888}
889
890/*
891 * Try to recycle a freed vnode.  We abort if anyone picks up a reference
892 * before we actually vgone().  This function must be called with the vnode
893 * held to prevent the vnode from being returned to the free list midway
894 * through vgone().
895 */
896static int
897vtryrecycle(struct vnode *vp)
898{
899	struct mount *vnmp;
900
901	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
902	VNASSERT(vp->v_holdcnt, vp,
903	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
904	/*
905	 * This vnode may be found and locked via some other list; if so, we
906	 * can't recycle it yet.
907	 */
908	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
909		CTR2(KTR_VFS,
910		    "%s: impossible to recycle, vp %p lock is already held",
911		    __func__, vp);
912		return (EWOULDBLOCK);
913	}
914	/*
915	 * Don't recycle if its filesystem is being suspended.
916	 */
917	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
918		VOP_UNLOCK(vp, 0);
919		CTR2(KTR_VFS,
920		    "%s: impossible to recycle, cannot start the write for %p",
921		    __func__, vp);
922		return (EBUSY);
923	}
924	/*
925	 * If we got this far, we need to acquire the interlock and see if
926	 * anyone picked up this vnode from another list.  If not, we will
927	 * mark it with DOOMED via vgonel() so that anyone who does find it
928	 * will skip over it.
929	 */
930	VI_LOCK(vp);
931	if (vp->v_usecount) {
932		VOP_UNLOCK(vp, LK_INTERLOCK);
933		vn_finished_write(vnmp);
934		CTR2(KTR_VFS,
935		    "%s: impossible to recycle, %p is already referenced",
936		    __func__, vp);
937		return (EBUSY);
938	}
939	if ((vp->v_iflag & VI_DOOMED) == 0)
940		vgonel(vp);
941	VOP_UNLOCK(vp, LK_INTERLOCK);
942	vn_finished_write(vnmp);
943	return (0);
944}
945
946/*
947 * Return the next vnode from the free list.
948 */
949int
950getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
951    struct vnode **vpp)
952{
953	struct vnode *vp = NULL;
954	struct bufobj *bo;
955
956	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
957	mtx_lock(&vnode_free_list_mtx);
958	/*
959	 * Lend our context to reclaim vnodes if they've exceeded the max.
960	 */
961	if (freevnodes > wantfreevnodes)
962		vnlru_free(1);
963	/*
964	 * Wait for available vnodes.
965	 */
966	if (numvnodes > desiredvnodes) {
967		if (mp != NULL && (mp->mnt_kern_flag & MNTK_SUSPEND)) {
968			/*
969			 * The file system is being suspended; we cannot risk a
970			 * deadlock here, so allocate a new vnode anyway.
971			 */
972			if (freevnodes > wantfreevnodes)
973				vnlru_free(freevnodes - wantfreevnodes);
974			goto alloc;
975		}
976		if (vnlruproc_sig == 0) {
977			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
978			wakeup(vnlruproc);
979		}
980		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
981		    "vlruwk", hz);
982#if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
983		if (numvnodes > desiredvnodes) {
984			mtx_unlock(&vnode_free_list_mtx);
985			return (ENFILE);
986		}
987#endif
988	}
989alloc:
990	numvnodes++;
991	mtx_unlock(&vnode_free_list_mtx);
992	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
993	/*
994	 * Setup locks.
995	 */
996	vp->v_vnlock = &vp->v_lock;
997	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
998	/*
999	 * By default, don't allow shared locks unless filesystems
1000	 * opt-in.
1001	 */
1002	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE);
1003	/*
1004	 * Initialize bufobj.
1005	 */
1006	bo = &vp->v_bufobj;
1007	bo->__bo_vnode = vp;
1008	mtx_init(BO_MTX(bo), "bufobj interlock", NULL, MTX_DEF);
1009	bo->bo_ops = &buf_ops_bio;
1010	bo->bo_private = vp;
1011	TAILQ_INIT(&bo->bo_clean.bv_hd);
1012	TAILQ_INIT(&bo->bo_dirty.bv_hd);
1013	/*
1014	 * Initialize namecache.
1015	 */
1016	LIST_INIT(&vp->v_cache_src);
1017	TAILQ_INIT(&vp->v_cache_dst);
1018	/*
1019	 * Finalize various vnode identity bits.
1020	 */
1021	vp->v_type = VNON;
1022	vp->v_tag = tag;
1023	vp->v_op = vops;
1024	v_incr_usecount(vp);
1025	vp->v_data = 0;
1026#ifdef MAC
1027	mac_vnode_init(vp);
1028	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
1029		mac_vnode_associate_singlelabel(mp, vp);
1030	else if (mp == NULL && vops != &dead_vnodeops)
1031		printf("NULL mp in getnewvnode()\n");
1032#endif
1033	if (mp != NULL) {
1034		bo->bo_bsize = mp->mnt_stat.f_iosize;
1035		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
1036			vp->v_vflag |= VV_NOKNOTE;
1037	}
1038
1039	*vpp = vp;
1040	return (0);
1041}
1042
1043/*
1044 * Delete from old mount point vnode list, if on one.
1045 */
1046static void
1047delmntque(struct vnode *vp)
1048{
1049	struct mount *mp;
1050
1051	mp = vp->v_mount;
1052	if (mp == NULL)
1053		return;
1054	MNT_ILOCK(mp);
1055	vp->v_mount = NULL;
1056	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
1057		("bad mount point vnode list size"));
1058	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1059	mp->mnt_nvnodelistsize--;
1060	MNT_REL(mp);
1061	MNT_IUNLOCK(mp);
1062}
1063
1064static void
1065insmntque_stddtr(struct vnode *vp, void *dtr_arg)
1066{
1067
1068	vp->v_data = NULL;
1069	vp->v_op = &dead_vnodeops;
1070	/* XXX a non-MPSAFE fs may still call insmntque with the vnode
1071	   unlocked */
1072	if (!VOP_ISLOCKED(vp))
1073		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1074	vgone(vp);
1075	vput(vp);
1076}
1077
1078/*
1079 * Insert into list of vnodes for the new mount point, if available.
1080 */
1081int
1082insmntque1(struct vnode *vp, struct mount *mp,
1083	void (*dtr)(struct vnode *, void *), void *dtr_arg)
1084{
1085	int locked;
1086
1087	KASSERT(vp->v_mount == NULL,
1088		("insmntque: vnode already on per mount vnode list"));
1089	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
1090#ifdef DEBUG_VFS_LOCKS
1091	if (!VFS_NEEDSGIANT(mp))
1092		ASSERT_VOP_ELOCKED(vp,
1093		    "insmntque: mp-safe fs and non-locked vp");
1094#endif
1095	MNT_ILOCK(mp);
1096	if ((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
1097	    ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
1098	     mp->mnt_nvnodelistsize == 0)) {
1099		locked = VOP_ISLOCKED(vp);
1100		if (!locked || (locked == LK_EXCLUSIVE &&
1101		     (vp->v_vflag & VV_FORCEINSMQ) == 0)) {
1102			MNT_IUNLOCK(mp);
1103			if (dtr != NULL)
1104				dtr(vp, dtr_arg);
1105			return (EBUSY);
1106		}
1107	}
1108	vp->v_mount = mp;
1109	MNT_REF(mp);
1110	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1111	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
1112		("neg mount point vnode list size"));
1113	mp->mnt_nvnodelistsize++;
1114	MNT_IUNLOCK(mp);
1115	return (0);
1116}
1117
1118int
1119insmntque(struct vnode *vp, struct mount *mp)
1120{
1121
1122	return (insmntque1(vp, mp, insmntque_stddtr, NULL));
1123}
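
/*
 * A sketch of how a filesystem typically pairs getnewvnode() and
 * insmntque() (the tag and the missing inode/hash setup are
 * simplifications): allocate the vnode, lock it, and only then put it
 * on the mount's vnode list.  If insmntque() fails, the default
 * destructor has already disposed of the vnode, so it must not be
 * touched again.
 */
static __unused int
example_new_vnode(struct mount *mp, struct vop_vector *vops,
    struct vnode **vpp)
{
	struct vnode *vp;
	int error;

	error = getnewvnode("example", mp, vops, &vp);
	if (error != 0)
		return (error);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = insmntque(vp, mp);
	if (error != 0)
		return (error);
	*vpp = vp;
	return (0);
}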
1124
1125/*
1126 * Flush out and invalidate all buffers associated with a bufobj
1127 * Called with the underlying object locked.
1128 */
1129int
1130bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
1131{
1132	int error;
1133
1134	BO_LOCK(bo);
1135	if (flags & V_SAVE) {
1136		error = bufobj_wwait(bo, slpflag, slptimeo);
1137		if (error) {
1138			BO_UNLOCK(bo);
1139			return (error);
1140		}
1141		if (bo->bo_dirty.bv_cnt > 0) {
1142			BO_UNLOCK(bo);
1143			if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
1144				return (error);
1145			/*
1146			 * XXX We could save a lock/unlock if this was only
1147			 * enabled under INVARIANTS
1148			 */
1149			BO_LOCK(bo);
1150			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
1151				panic("vinvalbuf: dirty bufs");
1152		}
1153	}
1154	/*
1155	 * If you alter this loop please notice that the interlock is dropped
1156	 * and reacquired in flushbuflist.  Special care is needed to ensure
1157	 * that no race conditions occur from this.
1158	 */
1159	do {
1160		error = flushbuflist(&bo->bo_clean,
1161		    flags, bo, slpflag, slptimeo);
1162		if (error == 0)
1163			error = flushbuflist(&bo->bo_dirty,
1164			    flags, bo, slpflag, slptimeo);
1165		if (error != 0 && error != EAGAIN) {
1166			BO_UNLOCK(bo);
1167			return (error);
1168		}
1169	} while (error != 0);
1170
1171	/*
1172	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
1173	 * have write I/O in-progress but if there is a VM object then the
1174	 * VM object can also have read-I/O in-progress.
1175	 */
1176	do {
1177		bufobj_wwait(bo, 0, 0);
1178		BO_UNLOCK(bo);
1179		if (bo->bo_object != NULL) {
1180			VM_OBJECT_LOCK(bo->bo_object);
1181			vm_object_pip_wait(bo->bo_object, "bovlbx");
1182			VM_OBJECT_UNLOCK(bo->bo_object);
1183		}
1184		BO_LOCK(bo);
1185	} while (bo->bo_numoutput > 0);
1186	BO_UNLOCK(bo);
1187
1188	/*
1189	 * Destroy the copy in the VM cache, too.
1190	 */
1191	if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL)) == 0) {
1192		VM_OBJECT_LOCK(bo->bo_object);
1193		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
1194		    OBJPR_CLEANONLY : 0);
1195		VM_OBJECT_UNLOCK(bo->bo_object);
1196	}
1197
1198#ifdef INVARIANTS
1199	BO_LOCK(bo);
1200	if ((flags & (V_ALT | V_NORMAL)) == 0 &&
1201	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
1202		panic("vinvalbuf: flush failed");
1203	BO_UNLOCK(bo);
1204#endif
1205	return (0);
1206}
1207
1208/*
1209 * Flush out and invalidate all buffers associated with a vnode.
1210 * Called with the underlying object locked.
1211 */
1212int
1213vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
1214{
1215
1216	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
1217	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
1218	return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
1219}
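
/*
 * A sketch of a common caller pattern (illustrative only): a reclaim or
 * revoke path writes any dirty buffers synchronously and then discards
 * every buffer and cached page belonging to the vnode.
 */
static __unused int
example_flush_and_discard(struct vnode *vp)
{

	ASSERT_VOP_LOCKED(vp, "example_flush_and_discard");
	return (vinvalbuf(vp, V_SAVE, 0, 0));
}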
1220
1221/*
1222 * Flush out buffers on the specified list.
1223 *
1224 */
1225static int
1226flushbuflist( struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
1227    int slptimeo)
1228{
1229	struct buf *bp, *nbp;
1230	int retval, error;
1231	daddr_t lblkno;
1232	b_xflags_t xflags;
1233
1234	ASSERT_BO_LOCKED(bo);
1235
1236	retval = 0;
1237	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
1238		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1239		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1240			continue;
1241		}
1242		lblkno = 0;
1243		xflags = 0;
1244		if (nbp != NULL) {
1245			lblkno = nbp->b_lblkno;
1246			xflags = nbp->b_xflags &
1247				(BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN);
1248		}
1249		retval = EAGAIN;
1250		error = BUF_TIMELOCK(bp,
1251		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo),
1252		    "flushbuf", slpflag, slptimeo);
1253		if (error) {
1254			BO_LOCK(bo);
1255			return (error != ENOLCK ? error : EAGAIN);
1256		}
1257		KASSERT(bp->b_bufobj == bo,
1258		    ("bp %p wrong b_bufobj %p should be %p",
1259		    bp, bp->b_bufobj, bo));
1260		if (bp->b_bufobj != bo) {	/* XXX: necessary ? */
1261			BUF_UNLOCK(bp);
1262			BO_LOCK(bo);
1263			return (EAGAIN);
1264		}
1265		/*
1266		 * XXX Since there are no node locks for NFS, I
1267		 * believe there is a slight chance that a delayed
1268		 * write will occur while sleeping just above, so
1269		 * check for it.
1270		 */
1271		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1272		    (flags & V_SAVE)) {
1273			BO_LOCK(bo);
1274			bremfree(bp);
1275			BO_UNLOCK(bo);
1276			bp->b_flags |= B_ASYNC;
1277			bwrite(bp);
1278			BO_LOCK(bo);
1279			return (EAGAIN);	/* XXX: why not loop ? */
1280		}
1281		BO_LOCK(bo);
1282		bremfree(bp);
1283		BO_UNLOCK(bo);
1284		bp->b_flags |= (B_INVAL | B_RELBUF);
1285		bp->b_flags &= ~B_ASYNC;
1286		brelse(bp);
1287		BO_LOCK(bo);
1288		if (nbp != NULL &&
1289		    (nbp->b_bufobj != bo ||
1290		     nbp->b_lblkno != lblkno ||
1291		     (nbp->b_xflags &
1292		      (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags))
1293			break;			/* nbp invalid */
1294	}
1295	return (retval);
1296}
1297
1298/*
1299 * Truncate a file's buffer and pages to a specified length.  This
1300 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1301 * sync activity.
1302 */
1303int
1304vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td,
1305    off_t length, int blksize)
1306{
1307	struct buf *bp, *nbp;
1308	int anyfreed;
1309	int trunclbn;
1310	struct bufobj *bo;
1311
1312	CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__,
1313	    vp, cred, blksize, (uintmax_t)length);
1314
1315	/*
1316	 * Round up to the *next* lbn.
1317	 */
1318	trunclbn = (length + blksize - 1) / blksize;
1319
1320	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1321restart:
1322	bo = &vp->v_bufobj;
1323	BO_LOCK(bo);
1324	anyfreed = 1;
1325	for (;anyfreed;) {
1326		anyfreed = 0;
1327		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
1328			if (bp->b_lblkno < trunclbn)
1329				continue;
1330			if (BUF_LOCK(bp,
1331			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1332			    BO_MTX(bo)) == ENOLCK)
1333				goto restart;
1334
1335			BO_LOCK(bo);
1336			bremfree(bp);
1337			BO_UNLOCK(bo);
1338			bp->b_flags |= (B_INVAL | B_RELBUF);
1339			bp->b_flags &= ~B_ASYNC;
1340			brelse(bp);
1341			anyfreed = 1;
1342
1343			BO_LOCK(bo);
1344			if (nbp != NULL &&
1345			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1346			    (nbp->b_vp != vp) ||
1347			    (nbp->b_flags & B_DELWRI))) {
1348				BO_UNLOCK(bo);
1349				goto restart;
1350			}
1351		}
1352
1353		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1354			if (bp->b_lblkno < trunclbn)
1355				continue;
1356			if (BUF_LOCK(bp,
1357			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1358			    BO_MTX(bo)) == ENOLCK)
1359				goto restart;
1360			BO_LOCK(bo);
1361			bremfree(bp);
1362			BO_UNLOCK(bo);
1363			bp->b_flags |= (B_INVAL | B_RELBUF);
1364			bp->b_flags &= ~B_ASYNC;
1365			brelse(bp);
1366			anyfreed = 1;
1367
1368			BO_LOCK(bo);
1369			if (nbp != NULL &&
1370			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1371			    (nbp->b_vp != vp) ||
1372			    (nbp->b_flags & B_DELWRI) == 0)) {
1373				BO_UNLOCK(bo);
1374				goto restart;
1375			}
1376		}
1377	}
1378
1379	if (length > 0) {
1380restartsync:
1381		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1382			if (bp->b_lblkno > 0)
1383				continue;
1384			/*
1385			 * Since we hold the vnode lock this should only
1386			 * fail if we're racing with the buf daemon.
1387			 */
1388			if (BUF_LOCK(bp,
1389			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1390			    BO_MTX(bo)) == ENOLCK) {
1391				goto restart;
1392			}
1393			VNASSERT((bp->b_flags & B_DELWRI), vp,
1394			    ("buf(%p) on dirty queue without DELWRI", bp));
1395
1396			BO_LOCK(bo);
1397			bremfree(bp);
1398			BO_UNLOCK(bo);
1399			bawrite(bp);
1400			BO_LOCK(bo);
1401			goto restartsync;
1402		}
1403	}
1404
1405	bufobj_wwait(bo, 0, 0);
1406	BO_UNLOCK(bo);
1407	vnode_pager_setsize(vp, length);
1408
1409	return (0);
1410}
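
/*
 * A sketch of the usual ordering in a filesystem's truncate path (the
 * helper is illustrative only; blksize is whatever block size the
 * filesystem uses for its buffers): buffers beyond the new end of file
 * are discarded before the on-disk block pointers are freed, so stale
 * data can never be written back.
 */
static __unused int
example_truncate_buffers(struct vnode *vp, struct ucred *cred,
    struct thread *td, off_t length, int blksize)
{

	ASSERT_VOP_LOCKED(vp, "example_truncate_buffers");
	return (vtruncbuf(vp, cred, td, length, blksize));
}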
1411
1412/*
1413 * buf_splay() - splay tree core for the clean/dirty list of buffers in
1414 *		 a vnode.
1415 *
1416 *	NOTE: We have to deal with the special case of a background bitmap
1417 *	buffer, a situation where two buffers will have the same logical
1418 *	block offset.  We want (1) only the foreground buffer to be accessed
1419 *	in a lookup and (2) to differentiate between the foreground and
1420 *	background buffer in the splay tree algorithm because the splay
1421 *	tree cannot normally handle multiple entities with the same 'index'.
1422 *	We accomplish this by adding differentiating flags to the splay tree's
1423 *	numerical domain.
1424 */
1425static
1426struct buf *
1427buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
1428{
1429	struct buf dummy;
1430	struct buf *lefttreemax, *righttreemin, *y;
1431
1432	if (root == NULL)
1433		return (NULL);
1434	lefttreemax = righttreemin = &dummy;
1435	for (;;) {
1436		if (lblkno < root->b_lblkno ||
1437		    (lblkno == root->b_lblkno &&
1438		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1439			if ((y = root->b_left) == NULL)
1440				break;
1441			if (lblkno < y->b_lblkno) {
1442				/* Rotate right. */
1443				root->b_left = y->b_right;
1444				y->b_right = root;
1445				root = y;
1446				if ((y = root->b_left) == NULL)
1447					break;
1448			}
1449			/* Link into the new root's right tree. */
1450			righttreemin->b_left = root;
1451			righttreemin = root;
1452		} else if (lblkno > root->b_lblkno ||
1453		    (lblkno == root->b_lblkno &&
1454		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
1455			if ((y = root->b_right) == NULL)
1456				break;
1457			if (lblkno > y->b_lblkno) {
1458				/* Rotate left. */
1459				root->b_right = y->b_left;
1460				y->b_left = root;
1461				root = y;
1462				if ((y = root->b_right) == NULL)
1463					break;
1464			}
1465			/* Link into the new root's left tree. */
1466			lefttreemax->b_right = root;
1467			lefttreemax = root;
1468		} else {
1469			break;
1470		}
1471		root = y;
1472	}
1473	/* Assemble the new root. */
1474	lefttreemax->b_right = root->b_left;
1475	righttreemin->b_left = root->b_right;
1476	root->b_left = dummy.b_right;
1477	root->b_right = dummy.b_left;
1478	return (root);
1479}
1480
1481static void
1482buf_vlist_remove(struct buf *bp)
1483{
1484	struct buf *root;
1485	struct bufv *bv;
1486
1487	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1488	ASSERT_BO_LOCKED(bp->b_bufobj);
1489	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
1490	    (BX_VNDIRTY|BX_VNCLEAN),
1491	    ("buf_vlist_remove: Buf %p is on two lists", bp));
1492	if (bp->b_xflags & BX_VNDIRTY)
1493		bv = &bp->b_bufobj->bo_dirty;
1494	else
1495		bv = &bp->b_bufobj->bo_clean;
1496	if (bp != bv->bv_root) {
1497		root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
1498		KASSERT(root == bp, ("splay lookup failed in remove"));
1499	}
1500	if (bp->b_left == NULL) {
1501		root = bp->b_right;
1502	} else {
1503		root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
1504		root->b_right = bp->b_right;
1505	}
1506	bv->bv_root = root;
1507	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
1508	bv->bv_cnt--;
1509	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1510}
1511
1512/*
1513 * Add the buffer to the sorted clean or dirty block list using a
1514 * splay tree algorithm.
1515 *
1516 * NOTE: xflags is passed as a constant, optimizing this inline function!
1517 */
1518static void
1519buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
1520{
1521	struct buf *root;
1522	struct bufv *bv;
1523
1524	ASSERT_BO_LOCKED(bo);
1525	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1526	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
1527	bp->b_xflags |= xflags;
1528	if (xflags & BX_VNDIRTY)
1529		bv = &bo->bo_dirty;
1530	else
1531		bv = &bo->bo_clean;
1532
1533	root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
1534	if (root == NULL) {
1535		bp->b_left = NULL;
1536		bp->b_right = NULL;
1537		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
1538	} else if (bp->b_lblkno < root->b_lblkno ||
1539	    (bp->b_lblkno == root->b_lblkno &&
1540	    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1541		bp->b_left = root->b_left;
1542		bp->b_right = root;
1543		root->b_left = NULL;
1544		TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
1545	} else {
1546		bp->b_right = root->b_right;
1547		bp->b_left = root;
1548		root->b_right = NULL;
1549		TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs);
1550	}
1551	bv->bv_cnt++;
1552	bv->bv_root = bp;
1553}
1554
1555/*
1556 * Lookup a buffer using the splay tree.  Note that we specifically avoid
1557 * shadow buffers used in background bitmap writes.
1558 *
1559 * This code isn't quite as efficient as it could be because we are maintaining
1560 * two sorted lists and do not know which list the block resides in.
1561 *
1562 * During a "make buildworld" the desired buffer is found at one of
1563 * the roots more than 60% of the time.  Thus, checking both roots
1564 * before performing either splay eliminates unnecessary splays on the
1565 * first tree splayed.
1566 */
1567struct buf *
1568gbincore(struct bufobj *bo, daddr_t lblkno)
1569{
1570	struct buf *bp;
1571
1572	ASSERT_BO_LOCKED(bo);
1573	if ((bp = bo->bo_clean.bv_root) != NULL &&
1574	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1575		return (bp);
1576	if ((bp = bo->bo_dirty.bv_root) != NULL &&
1577	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1578		return (bp);
1579	if ((bp = bo->bo_clean.bv_root) != NULL) {
1580		bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp);
1581		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1582			return (bp);
1583	}
1584	if ((bp = bo->bo_dirty.bv_root) != NULL) {
1585		bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp);
1586		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1587			return (bp);
1588	}
1589	return (NULL);
1590}
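
/*
 * A sketch of how a caller such as getblk() uses the lookup (error
 * handling abbreviated; the helper is an example only): find the buffer
 * while holding the bufobj lock, then lock the buffer itself and let
 * BUF_TIMELOCK() drop the bufobj interlock.
 */
static __unused struct buf *
example_lookup_buf(struct bufobj *bo, daddr_t lblkno)
{
	struct buf *bp;

	BO_LOCK(bo);
	bp = gbincore(bo, lblkno);
	if (bp == NULL) {
		BO_UNLOCK(bo);
		return (NULL);
	}
	if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
	    BO_MTX(bo), "exlookup", 0, 0) != 0)
		return (NULL);
	return (bp);
}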
1591
1592/*
1593 * Associate a buffer with a vnode.
1594 */
1595void
1596bgetvp(struct vnode *vp, struct buf *bp)
1597{
1598	struct bufobj *bo;
1599
1600	bo = &vp->v_bufobj;
1601	ASSERT_BO_LOCKED(bo);
1602	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
1603
1604	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
1605	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
1606	    ("bgetvp: bp already attached! %p", bp));
1607
1608	vhold(vp);
1609	if (VFS_NEEDSGIANT(vp->v_mount) || bo->bo_flag & BO_NEEDSGIANT)
1610		bp->b_flags |= B_NEEDSGIANT;
1611	bp->b_vp = vp;
1612	bp->b_bufobj = bo;
1613	/*
1614	 * Insert onto list for new vnode.
1615	 */
1616	buf_vlist_add(bp, bo, BX_VNCLEAN);
1617}
1618
1619/*
1620 * Disassociate a buffer from a vnode.
1621 */
1622void
1623brelvp(struct buf *bp)
1624{
1625	struct bufobj *bo;
1626	struct vnode *vp;
1627
1628	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1629	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1630
1631	/*
1632	 * Delete from old vnode list, if on one.
1633	 */
1634	vp = bp->b_vp;		/* XXX */
1635	bo = bp->b_bufobj;
1636	BO_LOCK(bo);
1637	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1638		buf_vlist_remove(bp);
1639	else
1640		panic("brelvp: Buffer %p not on queue.", bp);
1641	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1642		bo->bo_flag &= ~BO_ONWORKLST;
1643		mtx_lock(&sync_mtx);
1644		LIST_REMOVE(bo, bo_synclist);
1645		syncer_worklist_len--;
1646		mtx_unlock(&sync_mtx);
1647	}
1648	bp->b_flags &= ~B_NEEDSGIANT;
1649	bp->b_vp = NULL;
1650	bp->b_bufobj = NULL;
1651	BO_UNLOCK(bo);
1652	vdrop(vp);
1653}
1654
1655/*
1656 * Add an item to the syncer work queue.
1657 */
1658static void
1659vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
1660{
1661	int queue, slot;
1662
1663	ASSERT_BO_LOCKED(bo);
1664
1665	mtx_lock(&sync_mtx);
1666	if (bo->bo_flag & BO_ONWORKLST)
1667		LIST_REMOVE(bo, bo_synclist);
1668	else {
1669		bo->bo_flag |= BO_ONWORKLST;
1670		syncer_worklist_len++;
1671	}
1672
1673	if (delay > syncer_maxdelay - 2)
1674		delay = syncer_maxdelay - 2;
1675	slot = (syncer_delayno + delay) & syncer_mask;
1676
1677	queue = VFS_NEEDSGIANT(bo->__bo_vnode->v_mount) ? WI_GIANTQ :
1678	    WI_MPSAFEQ;
1679	LIST_INSERT_HEAD(&syncer_workitem_pending[queue][slot], bo,
1680	    bo_synclist);
1681	mtx_unlock(&sync_mtx);
1682}
1683
1684static int
1685sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
1686{
1687	int error, len;
1688
1689	mtx_lock(&sync_mtx);
1690	len = syncer_worklist_len - sync_vnode_count;
1691	mtx_unlock(&sync_mtx);
1692	error = SYSCTL_OUT(req, &len, sizeof(len));
1693	return (error);
1694}
1695
1696SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
1697    sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
1698
1699static struct proc *updateproc;
1700static void sched_sync(void);
1701static struct kproc_desc up_kp = {
1702	"syncer",
1703	sched_sync,
1704	&updateproc
1705};
1706SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
1707
1708static int
1709sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
1710{
1711	struct vnode *vp;
1712	struct mount *mp;
1713
1714	*bo = LIST_FIRST(slp);
1715	if (*bo == NULL)
1716		return (0);
1717	vp = (*bo)->__bo_vnode;	/* XXX */
1718	if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
1719		return (1);
1720	/*
1721	 * We use vhold in case the vnode does not
1722	 * successfully sync.  vhold prevents the vnode from
1723	 * going away when we unlock the sync_mtx so that
1724	 * we can acquire the vnode interlock.
1725	 */
1726	vholdl(vp);
1727	mtx_unlock(&sync_mtx);
1728	VI_UNLOCK(vp);
1729	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1730		vdrop(vp);
1731		mtx_lock(&sync_mtx);
1732		return (*bo == LIST_FIRST(slp));
1733	}
1734	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1735	(void) VOP_FSYNC(vp, MNT_LAZY, td);
1736	VOP_UNLOCK(vp, 0);
1737	vn_finished_write(mp);
1738	BO_LOCK(*bo);
1739	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
1740		/*
1741		 * Put us back on the worklist.  The worklist
1742		 * routine will remove us from our current
1743		 * position and then add us back in at a later
1744		 * position.
1745		 */
1746		vn_syncer_add_to_worklist(*bo, syncdelay);
1747	}
1748	BO_UNLOCK(*bo);
1749	vdrop(vp);
1750	mtx_lock(&sync_mtx);
1751	return (0);
1752}
1753
1754/*
1755 * System filesystem synchronizer daemon.
1756 */
1757static void
1758sched_sync(void)
1759{
1760	struct synclist *gnext, *next;
1761	struct synclist *gslp, *slp;
1762	struct bufobj *bo;
1763	long starttime;
1764	struct thread *td = curthread;
1765	int last_work_seen;
1766	int net_worklist_len;
1767	int syncer_final_iter;
1768	int first_printf;
1769	int error;
1770
1771	last_work_seen = 0;
1772	syncer_final_iter = 0;
1773	first_printf = 1;
1774	syncer_state = SYNCER_RUNNING;
1775	starttime = time_uptime;
1776	td->td_pflags |= TDP_NORUNNINGBUF;
1777
1778	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
1779	    SHUTDOWN_PRI_LAST);
1780
1781	mtx_lock(&sync_mtx);
1782	for (;;) {
1783		if (syncer_state == SYNCER_FINAL_DELAY &&
1784		    syncer_final_iter == 0) {
1785			mtx_unlock(&sync_mtx);
1786			kproc_suspend_check(td->td_proc);
1787			mtx_lock(&sync_mtx);
1788		}
1789		net_worklist_len = syncer_worklist_len - sync_vnode_count;
1790		if (syncer_state != SYNCER_RUNNING &&
1791		    starttime != time_uptime) {
1792			if (first_printf) {
1793				printf("\nSyncing disks, vnodes remaining...");
1794				first_printf = 0;
1795			}
1796			printf("%d ", net_worklist_len);
1797		}
1798		starttime = time_uptime;
1799
1800		/*
1801		 * Push files whose dirty time has expired.  Be careful
1802		 * of interrupt race on slp queue.
1803		 *
1804		 * Skip over empty worklist slots when shutting down.
1805		 */
1806		do {
1807			slp = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno];
1808			gslp = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno];
1809			syncer_delayno += 1;
1810			if (syncer_delayno == syncer_maxdelay)
1811				syncer_delayno = 0;
1812			next = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno];
1813			gnext = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno];
1814			/*
1815			 * If the worklist has wrapped since it was
1816			 * emptied of all but syncer vnodes,
1817			 * switch to the FINAL_DELAY state and run
1818			 * for one more second.
1819			 */
1820			if (syncer_state == SYNCER_SHUTTING_DOWN &&
1821			    net_worklist_len == 0 &&
1822			    last_work_seen == syncer_delayno) {
1823				syncer_state = SYNCER_FINAL_DELAY;
1824				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
1825			}
1826		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
1827		    LIST_EMPTY(gslp) && syncer_worklist_len > 0);
1828
1829		/*
1830		 * Keep track of the last time there was anything
1831		 * on the worklist other than syncer vnodes.
1832		 * Return to the SHUTTING_DOWN state if any
1833		 * new work appears.
1834		 */
1835		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
1836			last_work_seen = syncer_delayno;
1837		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
1838			syncer_state = SYNCER_SHUTTING_DOWN;
1839		while (!LIST_EMPTY(slp)) {
1840			error = sync_vnode(slp, &bo, td);
1841			if (error == 1) {
1842				LIST_REMOVE(bo, bo_synclist);
1843				LIST_INSERT_HEAD(next, bo, bo_synclist);
1844				continue;
1845			}
1846#ifdef SW_WATCHDOG
1847			if (first_printf == 0)
1848				wdog_kern_pat(WD_LASTVAL);
1849#endif
1850		}
1851		if (!LIST_EMPTY(gslp)) {
1852			mtx_unlock(&sync_mtx);
1853			mtx_lock(&Giant);
1854			mtx_lock(&sync_mtx);
1855			while (!LIST_EMPTY(gslp)) {
1856				error = sync_vnode(gslp, &bo, td);
1857				if (error == 1) {
1858					LIST_REMOVE(bo, bo_synclist);
1859					LIST_INSERT_HEAD(gnext, bo,
1860					    bo_synclist);
1861					continue;
1862				}
1863			}
1864			mtx_unlock(&Giant);
1865		}
1866		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
1867			syncer_final_iter--;
1868		/*
1869		 * The variable rushjob allows the kernel to speed up the
1870		 * processing of the filesystem syncer process. A rushjob
1871		 * value of N tells the filesystem syncer to process the next
1872		 * N seconds worth of work on its queue ASAP. Currently rushjob
1873		 * is used by the soft update code to speed up the filesystem
1874		 * syncer process when the incore state is getting so far
1875		 * ahead of the disk that the kernel memory pool is being
1876		 * threatened with exhaustion.
1877		 */
1878		if (rushjob > 0) {
1879			rushjob -= 1;
1880			continue;
1881		}
1882		/*
1883		 * Just sleep for a short period of time between
1884		 * iterations when shutting down to allow some I/O
1885		 * to happen.
1886		 *
1887		 * If it has taken us less than a second to process the
1888		 * current work, then wait. Otherwise start right over
1889		 * again. We can still lose time if any single round
1890		 * takes more than two seconds, but it does not really
1891		 * matter as we are just trying to generally pace the
1892		 * filesystem activity.
1893		 */
1894		if (syncer_state != SYNCER_RUNNING ||
1895		    time_uptime == starttime) {
1896			thread_lock(td);
1897			sched_prio(td, PPAUSE);
1898			thread_unlock(td);
1899		}
1900		if (syncer_state != SYNCER_RUNNING)
1901			cv_timedwait(&sync_wakeup, &sync_mtx,
1902			    hz / SYNCER_SHUTDOWN_SPEEDUP);
1903		else if (time_uptime == starttime)
1904			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
1905	}
1906}
1907
1908/*
1909 * Request the syncer daemon to speed up its work.
1910 * We never push it to speed up more than half of its
1911 * normal turn time, otherwise it could take over the cpu.
1912 */
1913int
1914speedup_syncer(void)
1915{
1916	int ret = 0;
1917
1918	mtx_lock(&sync_mtx);
1919	if (rushjob < syncdelay / 2) {
1920		rushjob += 1;
1921		stat_rush_requests += 1;
1922		ret = 1;
1923	}
1924	mtx_unlock(&sync_mtx);
1925	cv_broadcast(&sync_wakeup);
1926	return (ret);
1927}
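
/*
 * A minimal usage sketch (illustrative only; the "resource_shortage"
 * condition is a placeholder).  Code that notices in-core dirty state
 * getting too far ahead of the disk can ask for a faster pass:
 *
 *	if (resource_shortage)
 *		(void)speedup_syncer();
 *
 * The return value reports whether the request actually shortened the
 * schedule, i.e. whether rushjob was still below syncdelay / 2.
 */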
1928
1929/*
1930 * Tell the syncer to speed up its work and run through its
1931 * worklist several times, then tell it to shut down.
1932 */
1933static void
1934syncer_shutdown(void *arg, int howto)
1935{
1936
1937	if (howto & RB_NOSYNC)
1938		return;
1939	mtx_lock(&sync_mtx);
1940	syncer_state = SYNCER_SHUTTING_DOWN;
1941	rushjob = 0;
1942	mtx_unlock(&sync_mtx);
1943	cv_broadcast(&sync_wakeup);
1944	kproc_shutdown(arg, howto);
1945}
1946
1947/*
1948 * Reassign a buffer from one vnode to another.
1949 * Used to assign file specific control information
1950 * (indirect blocks) to the vnode to which they belong.
1951 */
1952void
1953reassignbuf(struct buf *bp)
1954{
1955	struct vnode *vp;
1956	struct bufobj *bo;
1957	int delay;
1958#ifdef INVARIANTS
1959	struct bufv *bv;
1960#endif
1961
1962	vp = bp->b_vp;
1963	bo = bp->b_bufobj;
1964	++reassignbufcalls;
1965
1966	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
1967	    bp, bp->b_vp, bp->b_flags);
1968	/*
1969	 * B_PAGING flagged buffers cannot be reassigned because their vp
1970	 * is not fully linked in.
1971	 */
1972	if (bp->b_flags & B_PAGING)
1973		panic("cannot reassign paging buffer");
1974
1975	/*
1976	 * Delete from old vnode list, if on one.
1977	 */
1978	BO_LOCK(bo);
1979	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1980		buf_vlist_remove(bp);
1981	else
1982		panic("reassignbuf: Buffer %p not on queue.", bp);
1983	/*
1984	 * If dirty, put on list of dirty buffers; otherwise insert onto list
1985	 * of clean buffers.
1986	 */
1987	if (bp->b_flags & B_DELWRI) {
1988		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
1989			switch (vp->v_type) {
1990			case VDIR:
1991				delay = dirdelay;
1992				break;
1993			case VCHR:
1994				delay = metadelay;
1995				break;
1996			default:
1997				delay = filedelay;
1998			}
1999			vn_syncer_add_to_worklist(bo, delay);
2000		}
2001		buf_vlist_add(bp, bo, BX_VNDIRTY);
2002	} else {
2003		buf_vlist_add(bp, bo, BX_VNCLEAN);
2004
2005		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
2006			mtx_lock(&sync_mtx);
2007			LIST_REMOVE(bo, bo_synclist);
2008			syncer_worklist_len--;
2009			mtx_unlock(&sync_mtx);
2010			bo->bo_flag &= ~BO_ONWORKLST;
2011		}
2012	}
2013#ifdef INVARIANTS
2014	bv = &bo->bo_clean;
2015	bp = TAILQ_FIRST(&bv->bv_hd);
2016	KASSERT(bp == NULL || bp->b_bufobj == bo,
2017	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2018	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2019	KASSERT(bp == NULL || bp->b_bufobj == bo,
2020	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2021	bv = &bo->bo_dirty;
2022	bp = TAILQ_FIRST(&bv->bv_hd);
2023	KASSERT(bp == NULL || bp->b_bufobj == bo,
2024	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2025	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2026	KASSERT(bp == NULL || bp->b_bufobj == bo,
2027	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2028#endif
2029	BO_UNLOCK(bo);
2030}
2031
2032/*
2033 * Increment the use and hold counts on the vnode, taking care to reference
2034 * the driver's usecount if this is a chardev.  The vholdl() will remove
2035 * the vnode from the free list if it is presently free.  Requires the
2036 * vnode interlock and returns with it held.
2037 */
2038static void
2039v_incr_usecount(struct vnode *vp)
2040{
2041
2042	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2043	vp->v_usecount++;
2044	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2045		dev_lock();
2046		vp->v_rdev->si_usecount++;
2047		dev_unlock();
2048	}
2049	vholdl(vp);
2050}
2051
2052/*
2053 * Turn a holdcnt into a use+holdcnt such that only one call to
2054 * v_decr_usecount is needed.
2055 */
2056static void
2057v_upgrade_usecount(struct vnode *vp)
2058{
2059
2060	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2061	vp->v_usecount++;
2062	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2063		dev_lock();
2064		vp->v_rdev->si_usecount++;
2065		dev_unlock();
2066	}
2067}
2068
2069/*
2070 * Decrement the vnode use and hold count along with the driver's usecount
2071 * if this is a chardev.  The vdropl() below releases the vnode interlock
2072 * as it may free the vnode.
2073 */
2074static void
2075v_decr_usecount(struct vnode *vp)
2076{
2077
2078	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2079	VNASSERT(vp->v_usecount > 0, vp,
2080	    ("v_decr_usecount: negative usecount"));
2081	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2082	vp->v_usecount--;
2083	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2084		dev_lock();
2085		vp->v_rdev->si_usecount--;
2086		dev_unlock();
2087	}
2088	vdropl(vp);
2089}
2090
2091/*
2092 * Decrement only the use count and driver use count.  This is intended to
2093 * be paired with a follow-on vdropl() to release the remaining hold count.
2094 * In this way we may vgone() a vnode with a 0 usecount without risk of
2095 * having it end up on a free list because the hold count is kept above 0.
2096 */
2097static void
2098v_decr_useonly(struct vnode *vp)
2099{
2100
2101	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2102	VNASSERT(vp->v_usecount > 0, vp,
2103	    ("v_decr_useonly: negative usecount"));
2104	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2105	vp->v_usecount--;
2106	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2107		dev_lock();
2108		vp->v_rdev->si_usecount--;
2109		dev_unlock();
2110	}
2111}
2112
2113/*
2114 * Grab a particular vnode from the free list, increment its
2115 * reference count and lock it.  VI_DOOMED is set if the vnode
2116 * is being destroyed.  Only callers who specify LK_RETRY will
2117 * see doomed vnodes.  If inactive processing was delayed in
2118 * vput try to do it here.
2119 */
2120int
2121vget(struct vnode *vp, int flags, struct thread *td)
2122{
2123	int error;
2124
2125	error = 0;
2126	VFS_ASSERT_GIANT(vp->v_mount);
2127	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
2128	    ("vget: invalid lock operation"));
2129	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
2130
2131	if ((flags & LK_INTERLOCK) == 0)
2132		VI_LOCK(vp);
2133	vholdl(vp);
2134	if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) {
2135		vdrop(vp);
2136		CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
2137		    vp);
2138		return (error);
2139	}
2140	if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
2141		panic("vget: vn_lock failed to return ENOENT\n");
2142	VI_LOCK(vp);
2143	/* Upgrade our holdcnt to a usecount. */
2144	v_upgrade_usecount(vp);
2145	/*
2146	 * We don't guarantee that any particular close will
2147	 * trigger inactive processing, so just make a best effort
2148	 * here at preventing a reference to a removed file.  If
2149	 * we don't succeed, no harm is done.
2150	 */
2151	if (vp->v_iflag & VI_OWEINACT) {
2152		if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
2153		    (flags & LK_NOWAIT) == 0)
2154			vinactive(vp, td);
2155		vp->v_iflag &= ~VI_OWEINACT;
2156	}
2157	VI_UNLOCK(vp);
2158	return (0);
2159}
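
/*
 * A sketch of the usual calling pattern: a caller that already holds the
 * vnode interlock passes LK_INTERLOCK so that vget() consumes it, and pairs
 * the new reference with vput() once the work is done (callers that can
 * cope with doomed vnodes add LK_RETRY):
 *
 *	VI_LOCK(vp);
 *	error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread);
 *	if (error == 0) {
 *		...use the exclusively locked, referenced vnode...
 *		vput(vp);
 *	}
 */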
2160
2161/*
2162 * Increase the reference count of a vnode.
2163 */
2164void
2165vref(struct vnode *vp)
2166{
2167
2168	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2169	VI_LOCK(vp);
2170	v_incr_usecount(vp);
2171	VI_UNLOCK(vp);
2172}
2173
2174/*
2175 * Return reference count of a vnode.
2176 *
2177 * The results of this call are only guaranteed when some mechanism other
2178 * than the VI lock is used to stop other processes from gaining references
2179 * to the vnode.  This may be the case if the caller holds the only reference.
2180 * This is also useful when stale data is acceptable as race conditions may
2181 * be accounted for by some other means.
2182 */
2183int
2184vrefcnt(struct vnode *vp)
2185{
2186	int usecnt;
2187
2188	VI_LOCK(vp);
2189	usecnt = vp->v_usecount;
2190	VI_UNLOCK(vp);
2191
2192	return (usecnt);
2193}
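
/*
 * Illustrative use, per the caveat above: the result is only dependable
 * when the caller can rule out concurrent references, e.g.
 *
 *	if (vrefcnt(vp) == 1)
 *		...we hold the only use reference...
 */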
2194
2195#define	VPUTX_VRELE	1
2196#define	VPUTX_VPUT	2
2197#define	VPUTX_VUNREF	3
2198
2199static void
2200vputx(struct vnode *vp, int func)
2201{
2202	int error;
2203
2204	KASSERT(vp != NULL, ("vputx: null vp"));
2205	if (func == VPUTX_VUNREF)
2206		ASSERT_VOP_LOCKED(vp, "vunref");
2207	else if (func == VPUTX_VPUT)
2208		ASSERT_VOP_LOCKED(vp, "vput");
2209	else
2210		KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
2211	VFS_ASSERT_GIANT(vp->v_mount);
2212	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2213	VI_LOCK(vp);
2214
2215	/* Skip this v_writecount check if we're going to panic below. */
2216	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
2217	    ("vputx: missed vn_close"));
2218	error = 0;
2219
2220	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2221	    vp->v_usecount == 1)) {
2222		if (func == VPUTX_VPUT)
2223			VOP_UNLOCK(vp, 0);
2224		v_decr_usecount(vp);
2225		return;
2226	}
2227
2228	if (vp->v_usecount != 1) {
2229		vprint("vputx: negative ref count", vp);
2230		panic("vputx: negative ref cnt");
2231	}
2232	CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp);
2233	/*
2234	 * We want to hold the vnode until the inactive finishes to
2235	 * prevent vgone() races.  We drop the use count here and the
2236	 * hold count below when we're done.
2237	 */
2238	v_decr_useonly(vp);
2239	/*
2240	 * We must call VOP_INACTIVE with the node locked; vinactive()
2241	 * marks the vnode VI_DOINGINACT to avoid recursion.
2242	 */
2243	vp->v_iflag |= VI_OWEINACT;
2244	switch (func) {
2245	case VPUTX_VRELE:
2246		error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
2247		VI_LOCK(vp);
2248		break;
2249	case VPUTX_VPUT:
2250		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2251			error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
2252			    LK_NOWAIT);
2253			VI_LOCK(vp);
2254		}
2255		break;
2256	case VPUTX_VUNREF:
2257		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
2258			error = EBUSY;
2259		break;
2260	}
2261	if (vp->v_usecount > 0)
2262		vp->v_iflag &= ~VI_OWEINACT;
2263	if (error == 0) {
2264		if (vp->v_iflag & VI_OWEINACT)
2265			vinactive(vp, curthread);
2266		if (func != VPUTX_VUNREF)
2267			VOP_UNLOCK(vp, 0);
2268	}
2269	vdropl(vp);
2270}
2271
2272/*
2273 * Vnode put/release.
2274 * If count drops to zero, call inactive routine and return to freelist.
2275 */
2276void
2277vrele(struct vnode *vp)
2278{
2279
2280	vputx(vp, VPUTX_VRELE);
2281}
2282
2283/*
2284 * Release an already locked vnode.  This gives the same effect as
2285 * unlock+vrele(), but takes less time and avoids releasing and
2286 * re-acquiring the lock (as vrele() acquires the lock internally).
2287 */
2288void
2289vput(struct vnode *vp)
2290{
2291
2292	vputx(vp, VPUTX_VPUT);
2293}
2294
2295/*
2296 * Release an exclusively locked vnode. Do not unlock the vnode lock.
2297 */
2298void
2299vunref(struct vnode *vp)
2300{
2301
2302	vputx(vp, VPUTX_VUNREF);
2303}
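
/*
 * To summarize the three wrappers around vputx(): vrele() expects an
 * unlocked vnode, vput() drops the reference and the vnode lock in one
 * call, and vunref() drops the reference while leaving the caller's
 * exclusive lock in place.  The usual pairings, sketched:
 *
 *	vref(vp);	...			vrele(vp);
 *	vget(vp, LK_EXCLUSIVE, curthread); ...	vput(vp);
 */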
2304
2305/*
2306 * Somebody doesn't want the vnode recycled.
2307 */
2308void
2309vhold(struct vnode *vp)
2310{
2311
2312	VI_LOCK(vp);
2313	vholdl(vp);
2314	VI_UNLOCK(vp);
2315}
2316
2317void
2318vholdl(struct vnode *vp)
2319{
2320
2321	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2322	vp->v_holdcnt++;
2323	if (VSHOULDBUSY(vp))
2324		vbusy(vp);
2325}
2326
2327/*
2328 * Note that there is one fewer holder of this vnode.  vdrop() is the
2329 * opposite of vhold().
2330 */
2331void
2332vdrop(struct vnode *vp)
2333{
2334
2335	VI_LOCK(vp);
2336	vdropl(vp);
2337}
2338
2339/*
2340 * Drop the hold count of the vnode.  If this is the last reference to
2341 * the vnode, we free it if it has been vgone'd; otherwise it is
2342 * placed on the free list.
2343 */
2344void
2345vdropl(struct vnode *vp)
2346{
2347
2348	ASSERT_VI_LOCKED(vp, "vdropl");
2349	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2350	if (vp->v_holdcnt <= 0)
2351		panic("vdrop: holdcnt %d", vp->v_holdcnt);
2352	vp->v_holdcnt--;
2353	if (vp->v_holdcnt == 0) {
2354		if (vp->v_iflag & VI_DOOMED) {
2355			CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__,
2356			    vp);
2357			vdestroy(vp);
2358			return;
2359		} else
2360			vfree(vp);
2361	}
2362	VI_UNLOCK(vp);
2363}
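
/*
 * The hold count is what keeps the vnode structure itself from being
 * recycled while a thread blocks without a use reference.  A sketch of the
 * common pattern, as used by sync_vnode() above:
 *
 *	vholdl(vp);		(interlock held on entry)
 *	VI_UNLOCK(vp);
 *	...block, e.g. wait for another lock...
 *	vdrop(vp);
 */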
2364
2365/*
2366 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
2367 * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
2368 * OWEINACT tracks whether a vnode missed a call to inactive due to a
2369 * failed lock upgrade.
2370 */
2371static void
2372vinactive(struct vnode *vp, struct thread *td)
2373{
2374
2375	ASSERT_VOP_ELOCKED(vp, "vinactive");
2376	ASSERT_VI_LOCKED(vp, "vinactive");
2377	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
2378	    ("vinactive: recursed on VI_DOINGINACT"));
2379	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2380	vp->v_iflag |= VI_DOINGINACT;
2381	vp->v_iflag &= ~VI_OWEINACT;
2382	VI_UNLOCK(vp);
2383	VOP_INACTIVE(vp, td);
2384	VI_LOCK(vp);
2385	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
2386	    ("vinactive: lost VI_DOINGINACT"));
2387	vp->v_iflag &= ~VI_DOINGINACT;
2388}
2389
2390/*
2391 * Remove any vnodes in the vnode table belonging to mount point mp.
2392 *
2393 * If FORCECLOSE is not specified, there should not be any active ones,
2394 * return error if any are found (nb: this is a user error, not a
2395 * system error). If FORCECLOSE is specified, detach any active vnodes
2396 * that are found.
2397 *
2398 * If WRITECLOSE is set, only flush out regular file vnodes open for
2399 * writing.
2400 *
2401 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2402 *
2403 * `rootrefs' specifies the base reference count for the root vnode
2404 * of this filesystem. The root vnode is considered busy if its
2405 * v_usecount exceeds this value. On a successful return, vflush()
2406 * will call vrele() on the root vnode exactly rootrefs times.
2407 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2408 * be zero.
2409 */
2410#ifdef DIAGNOSTIC
2411static int busyprt = 0;		/* print out busy vnodes */
2412SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
2413#endif
2414
2415int
2416vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
2417{
2418	struct vnode *vp, *mvp, *rootvp = NULL;
2419	struct vattr vattr;
2420	int busy = 0, error;
2421
2422	CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
2423	    rootrefs, flags);
2424	if (rootrefs > 0) {
2425		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2426		    ("vflush: bad args"));
2427		/*
2428		 * Get the filesystem root vnode. We can vput() it
2429		 * immediately, since with rootrefs > 0, it won't go away.
2430		 */
2431		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
2432			CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
2433			    __func__, error);
2434			return (error);
2435		}
2436		vput(rootvp);
2437	}
2438	MNT_ILOCK(mp);
2439loop:
2440	MNT_VNODE_FOREACH(vp, mp, mvp) {
2441		VI_LOCK(vp);
2442		vholdl(vp);
2443		MNT_IUNLOCK(mp);
2444		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
2445		if (error) {
2446			vdrop(vp);
2447			MNT_ILOCK(mp);
2448			MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
2449			goto loop;
2450		}
2451		/*
2452		 * Skip over vnodes marked VV_SYSTEM.
2453		 */
2454		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2455			VOP_UNLOCK(vp, 0);
2456			vdrop(vp);
2457			MNT_ILOCK(mp);
2458			continue;
2459		}
2460		/*
2461		 * If WRITECLOSE is set, flush out unlinked but still open
2462		 * files (even if open only for reading) and regular file
2463		 * vnodes open for writing.
2464		 */
2465		if (flags & WRITECLOSE) {
2466			error = VOP_GETATTR(vp, &vattr, td->td_ucred);
2467			VI_LOCK(vp);
2468
2469			if ((vp->v_type == VNON ||
2470			    (error == 0 && vattr.va_nlink > 0)) &&
2471			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
2472				VOP_UNLOCK(vp, 0);
2473				vdropl(vp);
2474				MNT_ILOCK(mp);
2475				continue;
2476			}
2477		} else
2478			VI_LOCK(vp);
2479		/*
2480		 * With v_usecount == 0, all we need to do is clear out the
2481		 * vnode data structures and we are done.
2482		 *
2483		 * If FORCECLOSE is set, forcibly close the vnode.
2484		 */
2485		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
2486			VNASSERT(vp->v_usecount == 0 ||
2487			    (vp->v_type != VCHR && vp->v_type != VBLK), vp,
2488			    ("device VNODE %p is FORCECLOSED", vp));
2489			vgonel(vp);
2490		} else {
2491			busy++;
2492#ifdef DIAGNOSTIC
2493			if (busyprt)
2494				vprint("vflush: busy vnode", vp);
2495#endif
2496		}
2497		VOP_UNLOCK(vp, 0);
2498		vdropl(vp);
2499		MNT_ILOCK(mp);
2500	}
2501	MNT_IUNLOCK(mp);
2502	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2503		/*
2504		 * If just the root vnode is busy, and if its refcount
2505		 * is equal to `rootrefs', then go ahead and kill it.
2506		 */
2507		VI_LOCK(rootvp);
2508		KASSERT(busy > 0, ("vflush: not busy"));
2509		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
2510		    ("vflush: usecount %d < rootrefs %d",
2511		     rootvp->v_usecount, rootrefs));
2512		if (busy == 1 && rootvp->v_usecount == rootrefs) {
2513			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
2514			vgone(rootvp);
2515			VOP_UNLOCK(rootvp, 0);
2516			busy = 0;
2517		} else
2518			VI_UNLOCK(rootvp);
2519	}
2520	if (busy) {
2521		CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
2522		    busy);
2523		return (EBUSY);
2524	}
2525	for (; rootrefs > 0; rootrefs--)
2526		vrele(rootvp);
2527	return (0);
2528}
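
/*
 * A sketch of how a filesystem's unmount routine typically drives vflush()
 * (illustrative only; rootrefs and flags vary by filesystem):
 *
 *	flags = 0;
 *	if (mntflags & MNT_FORCE)
 *		flags |= FORCECLOSE;
 *	error = vflush(mp, 0, flags, td);
 *	if (error != 0)
 *		return (error);
 */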
2529
2530/*
2531 * Recycle an unused vnode to the front of the free list.
2532 */
2533int
2534vrecycle(struct vnode *vp, struct thread *td)
2535{
2536	int recycled;
2537
2538	ASSERT_VOP_ELOCKED(vp, "vrecycle");
2539	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2540	recycled = 0;
2541	VI_LOCK(vp);
2542	if (vp->v_usecount == 0) {
2543		recycled = 1;
2544		vgonel(vp);
2545	}
2546	VI_UNLOCK(vp);
2547	return (recycled);
2548}
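
/*
 * vrecycle() is normally called from a filesystem's VOP_INACTIVE() routine
 * once it knows the underlying object is gone, roughly (the "ip" and
 * "i_nlink" names below are illustrative of a UFS-like inode, not defined
 * here):
 *
 *	if (ip->i_nlink <= 0)
 *		(void)vrecycle(vp, td);
 */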
2549
2550/*
2551 * Eliminate all activity associated with a vnode
2552 * in preparation for reuse.
2553 */
2554void
2555vgone(struct vnode *vp)
2556{
2557	VI_LOCK(vp);
2558	vgonel(vp);
2559	VI_UNLOCK(vp);
2560}
2561
2562/*
2563 * vgone, with the vp interlock held.
2564 */
2565void
2566vgonel(struct vnode *vp)
2567{
2568	struct thread *td;
2569	int oweinact;
2570	int active;
2571	struct mount *mp;
2572
2573	ASSERT_VOP_ELOCKED(vp, "vgonel");
2574	ASSERT_VI_LOCKED(vp, "vgonel");
2575	VNASSERT(vp->v_holdcnt, vp,
2576	    ("vgonel: vp %p has no reference.", vp));
2577	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2578	td = curthread;
2579
2580	/*
2581	 * Don't vgonel if we're already doomed.
2582	 */
2583	if (vp->v_iflag & VI_DOOMED)
2584		return;
2585	vp->v_iflag |= VI_DOOMED;
2586	/*
2587	 * Check to see if the vnode is in use.  If so, we have to call
2588	 * VOP_CLOSE() and VOP_INACTIVE().
2589	 */
2590	active = vp->v_usecount;
2591	oweinact = (vp->v_iflag & VI_OWEINACT);
2592	VI_UNLOCK(vp);
2593	/*
2594	 * Clean out any buffers associated with the vnode.
2595	 * If the flush fails, just toss the buffers.
2596	 */
2597	mp = NULL;
2598	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
2599		(void) vn_start_secondary_write(vp, &mp, V_WAIT);
2600	if (vinvalbuf(vp, V_SAVE, 0, 0) != 0)
2601		vinvalbuf(vp, 0, 0, 0);
2602
2603	/*
2604	 * If purging an active vnode, it must be closed and
2605	 * deactivated before being reclaimed.
2606	 */
2607	if (active)
2608		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2609	if (oweinact || active) {
2610		VI_LOCK(vp);
2611		if ((vp->v_iflag & VI_DOINGINACT) == 0)
2612			vinactive(vp, td);
2613		VI_UNLOCK(vp);
2614	}
2615	/*
2616	 * Reclaim the vnode.
2617	 */
2618	if (VOP_RECLAIM(vp, td))
2619		panic("vgone: cannot reclaim");
2620	if (mp != NULL)
2621		vn_finished_secondary_write(mp);
2622	VNASSERT(vp->v_object == NULL, vp,
2623	    ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
2624	/*
2625	 * Clear the advisory locks and wake up waiting threads.
2626	 */
2627	(void)VOP_ADVLOCKPURGE(vp);
2628	/*
2629	 * Delete from old mount point vnode list.
2630	 */
2631	delmntque(vp);
2632	cache_purge(vp);
2633	/*
2634	 * Done with purge, reset to the standard lock and invalidate
2635	 * the vnode.
2636	 */
2637	VI_LOCK(vp);
2638	vp->v_vnlock = &vp->v_lock;
2639	vp->v_op = &dead_vnodeops;
2640	vp->v_tag = "none";
2641	vp->v_type = VBAD;
2642}
2643
2644/*
2645 * Calculate the total number of references to a special device.
2646 */
2647int
2648vcount(struct vnode *vp)
2649{
2650	int count;
2651
2652	dev_lock();
2653	count = vp->v_rdev->si_usecount;
2654	dev_unlock();
2655	return (count);
2656}
2657
2658/*
2659 * Same as above, but using the struct cdev * as the argument.
2660 */
2661int
2662count_dev(struct cdev *dev)
2663{
2664	int count;
2665
2666	dev_lock();
2667	count = dev->si_usecount;
2668	dev_unlock();
2669	return(count);
2670}
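
/*
 * Illustrative use: a driver or devfs can refuse to detach while any vnode
 * still counts against the device, e.g.
 *
 *	if (count_dev(dev) > 0)
 *		return (EBUSY);
 */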
2671
2672/*
2673 * Print out a description of a vnode.
2674 */
2675static char *typename[] =
2676{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
2677 "VMARKER"};
2678
2679void
2680vn_printf(struct vnode *vp, const char *fmt, ...)
2681{
2682	va_list ap;
2683	char buf[256], buf2[16];
2684	u_long flags;
2685
2686	va_start(ap, fmt);
2687	vprintf(fmt, ap);
2688	va_end(ap);
2689	printf("%p: ", (void *)vp);
2690	printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
2691	printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
2692	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
2693	buf[0] = '\0';
2694	buf[1] = '\0';
2695	if (vp->v_vflag & VV_ROOT)
2696		strlcat(buf, "|VV_ROOT", sizeof(buf));
2697	if (vp->v_vflag & VV_ISTTY)
2698		strlcat(buf, "|VV_ISTTY", sizeof(buf));
2699	if (vp->v_vflag & VV_NOSYNC)
2700		strlcat(buf, "|VV_NOSYNC", sizeof(buf));
2701	if (vp->v_vflag & VV_CACHEDLABEL)
2702		strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
2703	if (vp->v_vflag & VV_TEXT)
2704		strlcat(buf, "|VV_TEXT", sizeof(buf));
2705	if (vp->v_vflag & VV_COPYONWRITE)
2706		strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
2707	if (vp->v_vflag & VV_SYSTEM)
2708		strlcat(buf, "|VV_SYSTEM", sizeof(buf));
2709	if (vp->v_vflag & VV_PROCDEP)
2710		strlcat(buf, "|VV_PROCDEP", sizeof(buf));
2711	if (vp->v_vflag & VV_NOKNOTE)
2712		strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
2713	if (vp->v_vflag & VV_DELETED)
2714		strlcat(buf, "|VV_DELETED", sizeof(buf));
2715	if (vp->v_vflag & VV_MD)
2716		strlcat(buf, "|VV_MD", sizeof(buf));
2717	flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC |
2718	    VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
2719	    VV_NOKNOTE | VV_DELETED | VV_MD);
2720	if (flags != 0) {
2721		snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
2722		strlcat(buf, buf2, sizeof(buf));
2723	}
2724	if (vp->v_iflag & VI_MOUNT)
2725		strlcat(buf, "|VI_MOUNT", sizeof(buf));
2726	if (vp->v_iflag & VI_AGE)
2727		strlcat(buf, "|VI_AGE", sizeof(buf));
2728	if (vp->v_iflag & VI_DOOMED)
2729		strlcat(buf, "|VI_DOOMED", sizeof(buf));
2730	if (vp->v_iflag & VI_FREE)
2731		strlcat(buf, "|VI_FREE", sizeof(buf));
2732	if (vp->v_iflag & VI_DOINGINACT)
2733		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
2734	if (vp->v_iflag & VI_OWEINACT)
2735		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
2736	flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
2737	    VI_DOINGINACT | VI_OWEINACT);
2738	if (flags != 0) {
2739		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
2740		strlcat(buf, buf2, sizeof(buf));
2741	}
2742	printf("    flags (%s)\n", buf + 1);
2743	if (mtx_owned(VI_MTX(vp)))
2744		printf(" VI_LOCKed");
2745	if (vp->v_object != NULL)
2746		printf("    v_object %p ref %d pages %d\n",
2747		    vp->v_object, vp->v_object->ref_count,
2748		    vp->v_object->resident_page_count);
2749	printf("    ");
2750	lockmgr_printinfo(vp->v_vnlock);
2751	if (vp->v_data != NULL)
2752		VOP_PRINT(vp);
2753}
2754
2755#ifdef DDB
2756/*
2757 * List all of the locked vnodes in the system.
2758 * Called when debugging the kernel.
2759 */
2760DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
2761{
2762	struct mount *mp, *nmp;
2763	struct vnode *vp;
2764
2765	/*
2766	 * Note: because this is DDB, we can't obey the locking semantics
2767	 * for these structures, which means we could catch an inconsistent
2768	 * state and dereference a nasty pointer.  Not much to be done
2769	 * about that.
2770	 */
2771	db_printf("Locked vnodes\n");
2772	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2773		nmp = TAILQ_NEXT(mp, mnt_list);
2774		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2775			if (vp->v_type != VMARKER &&
2776			    VOP_ISLOCKED(vp))
2777				vprint("", vp);
2778		}
2780	}
2781}
2782
2783/*
2784 * Show details about the given vnode.
2785 */
2786DB_SHOW_COMMAND(vnode, db_show_vnode)
2787{
2788	struct vnode *vp;
2789
2790	if (!have_addr)
2791		return;
2792	vp = (struct vnode *)addr;
2793	vn_printf(vp, "vnode ");
2794}
2795
2796/*
2797 * Show details about the given mount point.
2798 */
2799DB_SHOW_COMMAND(mount, db_show_mount)
2800{
2801	struct mount *mp;
2802	struct vfsopt *opt;
2803	struct statfs *sp;
2804	struct vnode *vp;
2805	char buf[512];
2806	u_int flags;
2807
2808	if (!have_addr) {
2809		/* No address given, print short info about all mount points. */
2810		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2811			db_printf("%p %s on %s (%s)\n", mp,
2812			    mp->mnt_stat.f_mntfromname,
2813			    mp->mnt_stat.f_mntonname,
2814			    mp->mnt_stat.f_fstypename);
2815			if (db_pager_quit)
2816				break;
2817		}
2818		db_printf("\nMore info: show mount <addr>\n");
2819		return;
2820	}
2821
2822	mp = (struct mount *)addr;
2823	db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
2824	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
2825
2826	buf[0] = '\0';
2827	flags = mp->mnt_flag;
2828#define	MNT_FLAG(flag)	do {						\
2829	if (flags & (flag)) {						\
2830		if (buf[0] != '\0')					\
2831			strlcat(buf, ", ", sizeof(buf));		\
2832		strlcat(buf, (#flag) + 4, sizeof(buf));			\
2833		flags &= ~(flag);					\
2834	}								\
2835} while (0)
2836	MNT_FLAG(MNT_RDONLY);
2837	MNT_FLAG(MNT_SYNCHRONOUS);
2838	MNT_FLAG(MNT_NOEXEC);
2839	MNT_FLAG(MNT_NOSUID);
2840	MNT_FLAG(MNT_UNION);
2841	MNT_FLAG(MNT_ASYNC);
2842	MNT_FLAG(MNT_SUIDDIR);
2843	MNT_FLAG(MNT_SOFTDEP);
2844	MNT_FLAG(MNT_NOSYMFOLLOW);
2845	MNT_FLAG(MNT_GJOURNAL);
2846	MNT_FLAG(MNT_MULTILABEL);
2847	MNT_FLAG(MNT_ACLS);
2848	MNT_FLAG(MNT_NOATIME);
2849	MNT_FLAG(MNT_NOCLUSTERR);
2850	MNT_FLAG(MNT_NOCLUSTERW);
2851	MNT_FLAG(MNT_NFS4ACLS);
2852	MNT_FLAG(MNT_EXRDONLY);
2853	MNT_FLAG(MNT_EXPORTED);
2854	MNT_FLAG(MNT_DEFEXPORTED);
2855	MNT_FLAG(MNT_EXPORTANON);
2856	MNT_FLAG(MNT_EXKERB);
2857	MNT_FLAG(MNT_EXPUBLIC);
2858	MNT_FLAG(MNT_LOCAL);
2859	MNT_FLAG(MNT_QUOTA);
2860	MNT_FLAG(MNT_ROOTFS);
2861	MNT_FLAG(MNT_USER);
2862	MNT_FLAG(MNT_IGNORE);
2863	MNT_FLAG(MNT_UPDATE);
2864	MNT_FLAG(MNT_DELEXPORT);
2865	MNT_FLAG(MNT_RELOAD);
2866	MNT_FLAG(MNT_FORCE);
2867	MNT_FLAG(MNT_SNAPSHOT);
2868	MNT_FLAG(MNT_BYFSID);
2870#undef MNT_FLAG
2871	if (flags != 0) {
2872		if (buf[0] != '\0')
2873			strlcat(buf, ", ", sizeof(buf));
2874		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
2875		    "0x%08x", flags);
2876	}
2877	db_printf("    mnt_flag = %s\n", buf);
2878
2879	buf[0] = '\0';
2880	flags = mp->mnt_kern_flag;
2881#define	MNT_KERN_FLAG(flag)	do {					\
2882	if (flags & (flag)) {						\
2883		if (buf[0] != '\0')					\
2884			strlcat(buf, ", ", sizeof(buf));		\
2885		strlcat(buf, (#flag) + 5, sizeof(buf));			\
2886		flags &= ~(flag);					\
2887	}								\
2888} while (0)
2889	MNT_KERN_FLAG(MNTK_UNMOUNTF);
2890	MNT_KERN_FLAG(MNTK_ASYNC);
2891	MNT_KERN_FLAG(MNTK_SOFTDEP);
2892	MNT_KERN_FLAG(MNTK_NOINSMNTQ);
2893	MNT_KERN_FLAG(MNTK_DRAINING);
2894	MNT_KERN_FLAG(MNTK_REFEXPIRE);
2895	MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
2896	MNT_KERN_FLAG(MNTK_SHARED_WRITES);
2897	MNT_KERN_FLAG(MNTK_SUJ);
2898	MNT_KERN_FLAG(MNTK_UNMOUNT);
2899	MNT_KERN_FLAG(MNTK_MWAIT);
2900	MNT_KERN_FLAG(MNTK_SUSPEND);
2901	MNT_KERN_FLAG(MNTK_SUSPEND2);
2902	MNT_KERN_FLAG(MNTK_SUSPENDED);
2903	MNT_KERN_FLAG(MNTK_MPSAFE);
2904	MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
2905	MNT_KERN_FLAG(MNTK_NOKNOTE);
2906#undef MNT_KERN_FLAG
2907	if (flags != 0) {
2908		if (buf[0] != '\0')
2909			strlcat(buf, ", ", sizeof(buf));
2910		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
2911		    "0x%08x", flags);
2912	}
2913	db_printf("    mnt_kern_flag = %s\n", buf);
2914
2915	db_printf("    mnt_opt = ");
2916	opt = TAILQ_FIRST(mp->mnt_opt);
2917	if (opt != NULL) {
2918		db_printf("%s", opt->name);
2919		opt = TAILQ_NEXT(opt, link);
2920		while (opt != NULL) {
2921			db_printf(", %s", opt->name);
2922			opt = TAILQ_NEXT(opt, link);
2923		}
2924	}
2925	db_printf("\n");
2926
2927	sp = &mp->mnt_stat;
2928	db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
2929	    "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
2930	    "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
2931	    "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
2932	    (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
2933	    (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
2934	    (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
2935	    (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
2936	    (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
2937	    (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
2938	    (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
2939	    (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
2940
2941	db_printf("    mnt_cred = { uid=%u ruid=%u",
2942	    (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
2943	if (jailed(mp->mnt_cred))
2944		db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
2945	db_printf(" }\n");
2946	db_printf("    mnt_ref = %d\n", mp->mnt_ref);
2947	db_printf("    mnt_gen = %d\n", mp->mnt_gen);
2948	db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
2949	db_printf("    mnt_writeopcount = %d\n", mp->mnt_writeopcount);
2950	db_printf("    mnt_noasync = %u\n", mp->mnt_noasync);
2951	db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
2952	db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
2953	db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
2954	db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
2955	db_printf("    mnt_secondary_accwrites = %d\n",
2956	    mp->mnt_secondary_accwrites);
2957	db_printf("    mnt_gjprovider = %s\n",
2958	    mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
2959	db_printf("\n");
2960
2961	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2962		if (vp->v_type != VMARKER) {
2963			vn_printf(vp, "vnode ");
2964			if (db_pager_quit)
2965				break;
2966		}
2967	}
2968}
2969#endif	/* DDB */
2970
2971/*
2972 * Fill in a struct xvfsconf based on a struct vfsconf.
2973 */
2974static void
2975vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
2976{
2977
2978	strcpy(xvfsp->vfc_name, vfsp->vfc_name);
2979	xvfsp->vfc_typenum = vfsp->vfc_typenum;
2980	xvfsp->vfc_refcount = vfsp->vfc_refcount;
2981	xvfsp->vfc_flags = vfsp->vfc_flags;
2982	/*
2983	 * These are unused in userland; we keep them
2984	 * to avoid breaking binary compatibility.
2985	 */
2986	xvfsp->vfc_vfsops = NULL;
2987	xvfsp->vfc_next = NULL;
2988}
2989
2990/*
2991 * Top level filesystem related information gathering.
2992 */
2993static int
2994sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
2995{
2996	struct vfsconf *vfsp;
2997	struct xvfsconf xvfsp;
2998	int error;
2999
3000	error = 0;
3001	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3002		bzero(&xvfsp, sizeof(xvfsp));
3003		vfsconf2x(vfsp, &xvfsp);
3004		error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp);
3005		if (error)
3006			break;
3007	}
3008	return (error);
3009}
3010
3011SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD,
3012    NULL, 0, sysctl_vfs_conflist,
3013    "S,xvfsconf", "List of all configured filesystems");
3014
3015#ifndef BURN_BRIDGES
3016static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
3017
3018static int
3019vfs_sysctl(SYSCTL_HANDLER_ARGS)
3020{
3021	int *name = (int *)arg1 - 1;	/* XXX */
3022	u_int namelen = arg2 + 1;	/* XXX */
3023	struct vfsconf *vfsp;
3024	struct xvfsconf xvfsp;
3025
3026	printf("WARNING: userland calling deprecated sysctl, "
3027	    "please rebuild world\n");
3028
3029#if 1 || defined(COMPAT_PRELITE2)
3030	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
3031	if (namelen == 1)
3032		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
3033#endif
3034
3035	switch (name[1]) {
3036	case VFS_MAXTYPENUM:
3037		if (namelen != 2)
3038			return (ENOTDIR);
3039		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
3040	case VFS_CONF:
3041		if (namelen != 3)
3042			return (ENOTDIR);	/* overloaded */
3043		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
3044			if (vfsp->vfc_typenum == name[2])
3045				break;
3046		if (vfsp == NULL)
3047			return (EOPNOTSUPP);
3048		bzero(&xvfsp, sizeof(xvfsp));
3049		vfsconf2x(vfsp, &xvfsp);
3050		return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3051	}
3052	return (EOPNOTSUPP);
3053}
3054
3055static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP,
3056    vfs_sysctl, "Generic filesystem");
3057
3058#if 1 || defined(COMPAT_PRELITE2)
3059
3060static int
3061sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
3062{
3063	int error;
3064	struct vfsconf *vfsp;
3065	struct ovfsconf ovfs;
3066
3067	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3068		bzero(&ovfs, sizeof(ovfs));
3069		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
3070		strcpy(ovfs.vfc_name, vfsp->vfc_name);
3071		ovfs.vfc_index = vfsp->vfc_typenum;
3072		ovfs.vfc_refcount = vfsp->vfc_refcount;
3073		ovfs.vfc_flags = vfsp->vfc_flags;
3074		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
3075		if (error)
3076			return error;
3077	}
3078	return 0;
3079}
3080
3081#endif /* 1 || COMPAT_PRELITE2 */
3082#endif /* !BURN_BRIDGES */
3083
3084#define KINFO_VNODESLOP		10
3085#ifdef notyet
3086/*
3087 * Dump vnode list (via sysctl).
3088 */
3089/* ARGSUSED */
3090static int
3091sysctl_vnode(SYSCTL_HANDLER_ARGS)
3092{
3093	struct xvnode *xvn;
3094	struct mount *mp;
3095	struct vnode *vp;
3096	int error, len, n;
3097
3098	/*
3099	 * Stale numvnodes access is not fatal here.
3100	 */
3101	req->lock = 0;
3102	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
3103	if (!req->oldptr)
3104		/* Make an estimate */
3105		return (SYSCTL_OUT(req, 0, len));
3106
3107	error = sysctl_wire_old_buffer(req, 0);
3108	if (error != 0)
3109		return (error);
3110	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
3111	n = 0;
3112	mtx_lock(&mountlist_mtx);
3113	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3114		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
3115			continue;
3116		MNT_ILOCK(mp);
3117		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3118			if (n == len)
3119				break;
3120			vref(vp);
3121			xvn[n].xv_size = sizeof *xvn;
3122			xvn[n].xv_vnode = vp;
3123			xvn[n].xv_id = 0;	/* XXX compat */
3124#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
3125			XV_COPY(usecount);
3126			XV_COPY(writecount);
3127			XV_COPY(holdcnt);
3128			XV_COPY(mount);
3129			XV_COPY(numoutput);
3130			XV_COPY(type);
3131#undef XV_COPY
3132			xvn[n].xv_flag = vp->v_vflag;
3133
3134			switch (vp->v_type) {
3135			case VREG:
3136			case VDIR:
3137			case VLNK:
3138				break;
3139			case VBLK:
3140			case VCHR:
3141				if (vp->v_rdev == NULL) {
3142					vrele(vp);
3143					continue;
3144				}
3145				xvn[n].xv_dev = dev2udev(vp->v_rdev);
3146				break;
3147			case VSOCK:
3148				xvn[n].xv_socket = vp->v_socket;
3149				break;
3150			case VFIFO:
3151				xvn[n].xv_fifo = vp->v_fifoinfo;
3152				break;
3153			case VNON:
3154			case VBAD:
3155			default:
3156				/* shouldn't happen? */
3157				vrele(vp);
3158				continue;
3159			}
3160			vrele(vp);
3161			++n;
3162		}
3163		MNT_IUNLOCK(mp);
3164		mtx_lock(&mountlist_mtx);
3165		vfs_unbusy(mp);
3166		if (n == len)
3167			break;
3168	}
3169	mtx_unlock(&mountlist_mtx);
3170
3171	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
3172	free(xvn, M_TEMP);
3173	return (error);
3174}
3175
3176SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
3177    0, 0, sysctl_vnode, "S,xvnode", "");
3178#endif
3179
3180/*
3181 * Unmount all filesystems. The list is traversed in reverse order
3182 * of mounting to avoid dependencies.
3183 */
3184void
3185vfs_unmountall(void)
3186{
3187	struct mount *mp;
3188	struct thread *td;
3189	int error;
3190
3191	KASSERT(curthread != NULL, ("vfs_unmountall: NULL curthread"));
3192	CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
3193	td = curthread;
3194
3195	/*
3196	 * Since this only runs when rebooting, it is not interlocked.
3197	 */
3198	while(!TAILQ_EMPTY(&mountlist)) {
3199		mp = TAILQ_LAST(&mountlist, mntlist);
3200		error = dounmount(mp, MNT_FORCE, td);
3201		if (error) {
3202			TAILQ_REMOVE(&mountlist, mp, mnt_list);
3203			/*
3204			 * XXX: Due to the way in which we mount the root
3205			 * file system off of devfs, devfs will generate a
3206			 * "busy" warning when we try to unmount it before
3207			 * the root.  Don't print a warning as a result in
3208			 * order to avoid false positive errors that may
3209			 * cause needless upset.
3210			 */
3211			if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) {
3212				printf("unmount of %s failed (",
3213				    mp->mnt_stat.f_mntonname);
3214				if (error == EBUSY)
3215					printf("BUSY)\n");
3216				else
3217					printf("%d)\n", error);
3218			}
3219		} else {
3220			/* The unmount has removed mp from the mountlist */
3221		}
3222	}
3223}
3224
3225/*
3226 * perform msync on all vnodes under a mount point
3227 * the mount point must be locked.
3228 */
3229void
3230vfs_msync(struct mount *mp, int flags)
3231{
3232	struct vnode *vp, *mvp;
3233	struct vm_object *obj;
3234
3235	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
3236	MNT_ILOCK(mp);
3237	MNT_VNODE_FOREACH(vp, mp, mvp) {
3238		VI_LOCK(vp);
3239		obj = vp->v_object;
3240		if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 &&
3241		    (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
3242			MNT_IUNLOCK(mp);
3243			if (!vget(vp,
3244			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3245			    curthread)) {
3246				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
3247					vput(vp);
3248					MNT_ILOCK(mp);
3249					continue;
3250				}
3251
3252				obj = vp->v_object;
3253				if (obj != NULL) {
3254					VM_OBJECT_LOCK(obj);
3255					vm_object_page_clean(obj, 0, 0,
3256					    flags == MNT_WAIT ?
3257					    OBJPC_SYNC : OBJPC_NOSYNC);
3258					VM_OBJECT_UNLOCK(obj);
3259				}
3260				vput(vp);
3261			}
3262			MNT_ILOCK(mp);
3263		} else
3264			VI_UNLOCK(vp);
3265	}
3266	MNT_IUNLOCK(mp);
3267}
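
/*
 * vfs_msync() is typically driven from two places: the syncer (sync_fsync()
 * below passes MNT_NOWAIT for the periodic lazy sync) and the unmount path,
 * which wants everything written out and so passes MNT_WAIT.
 */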
3268
3269/*
3270 * Mark a vnode as free, putting it up for recycling.
3271 */
3272static void
3273vfree(struct vnode *vp)
3274{
3275
3276	ASSERT_VI_LOCKED(vp, "vfree");
3277	mtx_lock(&vnode_free_list_mtx);
3278	VNASSERT(vp->v_op != NULL, vp, ("vfree: vnode already reclaimed."));
3279	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free"));
3280	VNASSERT(VSHOULDFREE(vp), vp, ("vfree: freeing when we shouldn't"));
3281	VNASSERT((vp->v_iflag & VI_DOOMED) == 0, vp,
3282	    ("vfree: Freeing doomed vnode"));
3283	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3284	if (vp->v_iflag & VI_AGE) {
3285		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
3286	} else {
3287		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
3288	}
3289	freevnodes++;
3290	vp->v_iflag &= ~VI_AGE;
3291	vp->v_iflag |= VI_FREE;
3292	mtx_unlock(&vnode_free_list_mtx);
3293}
3294
3295/*
3296 * Opposite of vfree() - mark a vnode as in use.
3297 */
3298static void
3299vbusy(struct vnode *vp)
3300{
3301	ASSERT_VI_LOCKED(vp, "vbusy");
3302	VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free"));
3303	VNASSERT(vp->v_op != NULL, vp, ("vbusy: vnode already reclaimed."));
3304	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3305
3306	mtx_lock(&vnode_free_list_mtx);
3307	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
3308	freevnodes--;
3309	vp->v_iflag &= ~(VI_FREE|VI_AGE);
3310	mtx_unlock(&vnode_free_list_mtx);
3311}
3312
3313static void
3314destroy_vpollinfo(struct vpollinfo *vi)
3315{
3316	knlist_destroy(&vi->vpi_selinfo.si_note);
3317	mtx_destroy(&vi->vpi_lock);
3318	uma_zfree(vnodepoll_zone, vi);
3319}
3320
3321/*
3322 * Initialize the per-vnode helper structure that holds poll-related state.
3323 */
3324void
3325v_addpollinfo(struct vnode *vp)
3326{
3327	struct vpollinfo *vi;
3328
3329	if (vp->v_pollinfo != NULL)
3330		return;
3331	vi = uma_zalloc(vnodepoll_zone, M_WAITOK);
3332	mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
3333	knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
3334	    vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
3335	VI_LOCK(vp);
3336	if (vp->v_pollinfo != NULL) {
3337		VI_UNLOCK(vp);
3338		destroy_vpollinfo(vi);
3339		return;
3340	}
3341	vp->v_pollinfo = vi;
3342	VI_UNLOCK(vp);
3343}
3344
3345/*
3346 * Record a process's interest in events which might happen to
3347 * a vnode.  Because poll uses the historic select-style interface
3348 * internally, this routine serves as both the ``check for any
3349 * pending events'' and the ``record my interest in future events''
3350 * functions.  (These are done together, while the lock is held,
3351 * to avoid race conditions.)
3352 */
3353int
3354vn_pollrecord(struct vnode *vp, struct thread *td, int events)
3355{
3356
3357	v_addpollinfo(vp);
3358	mtx_lock(&vp->v_pollinfo->vpi_lock);
3359	if (vp->v_pollinfo->vpi_revents & events) {
3360		/*
3361		 * This leaves events we are not interested
3362		 * in available for the other process, which
3363		 * presumably had requested them
3364		 * (otherwise they would never have been
3365		 * recorded).
3366		 */
3367		events &= vp->v_pollinfo->vpi_revents;
3368		vp->v_pollinfo->vpi_revents &= ~events;
3369
3370		mtx_unlock(&vp->v_pollinfo->vpi_lock);
3371		return (events);
3372	}
3373	vp->v_pollinfo->vpi_events |= events;
3374	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3375	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3376	return (0);
3377}
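
/*
 * A filesystem poll routine with no event sources of its own might, roughly,
 * report ordinary readiness immediately and fall back on vn_pollrecord()
 * for everything else (a sketch, not the stock implementation):
 *
 *	if (events & ~POLLSTANDARD)
 *		return (vn_pollrecord(vp, td, events));
 *	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
 */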
3378
3379/*
3380 * Routine to create and manage a filesystem syncer vnode.
3381 */
3382#define sync_close ((int (*)(struct  vop_close_args *))nullop)
3383static int	sync_fsync(struct  vop_fsync_args *);
3384static int	sync_inactive(struct  vop_inactive_args *);
3385static int	sync_reclaim(struct  vop_reclaim_args *);
3386
3387static struct vop_vector sync_vnodeops = {
3388	.vop_bypass =	VOP_EOPNOTSUPP,
3389	.vop_close =	sync_close,		/* close */
3390	.vop_fsync =	sync_fsync,		/* fsync */
3391	.vop_inactive =	sync_inactive,	/* inactive */
3392	.vop_reclaim =	sync_reclaim,	/* reclaim */
3393	.vop_lock1 =	vop_stdlock,	/* lock */
3394	.vop_unlock =	vop_stdunlock,	/* unlock */
3395	.vop_islocked =	vop_stdislocked,	/* islocked */
3396};
3397
3398/*
3399 * Create a new filesystem syncer vnode for the specified mount point.
3400 */
3401void
3402vfs_allocate_syncvnode(struct mount *mp)
3403{
3404	struct vnode *vp;
3405	struct bufobj *bo;
3406	static long start, incr, next;
3407	int error;
3408
3409	/* Allocate a new vnode */
3410	error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
3411	if (error != 0)
3412		panic("vfs_allocate_syncvnode: getnewvnode() failed");
3413	vp->v_type = VNON;
3414	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3415	vp->v_vflag |= VV_FORCEINSMQ;
3416	error = insmntque(vp, mp);
3417	if (error != 0)
3418		panic("vfs_allocate_syncvnode: insmntque() failed");
3419	vp->v_vflag &= ~VV_FORCEINSMQ;
3420	VOP_UNLOCK(vp, 0);
3421	/*
3422	 * Place the vnode onto the syncer worklist. We attempt to
3423	 * scatter them about on the list so that they will go off
3424	 * at evenly distributed times even if all the filesystems
3425	 * are mounted at once.
3426	 */
3427	next += incr;
3428	if (next == 0 || next > syncer_maxdelay) {
3429		start /= 2;
3430		incr /= 2;
3431		if (start == 0) {
3432			start = syncer_maxdelay / 2;
3433			incr = syncer_maxdelay;
3434		}
3435		next = start;
3436	}
3437	bo = &vp->v_bufobj;
3438	BO_LOCK(bo);
3439	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
3440	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
3441	mtx_lock(&sync_mtx);
3442	sync_vnode_count++;
3443	if (mp->mnt_syncer == NULL) {
3444		mp->mnt_syncer = vp;
3445		vp = NULL;
3446	}
3447	mtx_unlock(&sync_mtx);
3448	BO_UNLOCK(bo);
3449	if (vp != NULL) {
3450		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3451		vgone(vp);
3452		vput(vp);
3453	}
3454}
3455
3456void
3457vfs_deallocate_syncvnode(struct mount *mp)
3458{
3459	struct vnode *vp;
3460
3461	mtx_lock(&sync_mtx);
3462	vp = mp->mnt_syncer;
3463	if (vp != NULL)
3464		mp->mnt_syncer = NULL;
3465	mtx_unlock(&sync_mtx);
3466	if (vp != NULL)
3467		vrele(vp);
3468}
3469
3470/*
3471 * Do a lazy sync of the filesystem.
3472 */
3473static int
3474sync_fsync(struct vop_fsync_args *ap)
3475{
3476	struct vnode *syncvp = ap->a_vp;
3477	struct mount *mp = syncvp->v_mount;
3478	int error;
3479	struct bufobj *bo;
3480
3481	/*
3482	 * We only need to do something if this is a lazy evaluation.
3483	 */
3484	if (ap->a_waitfor != MNT_LAZY)
3485		return (0);
3486
3487	/*
3488	 * Move ourselves to the back of the sync list.
3489	 */
3490	bo = &syncvp->v_bufobj;
3491	BO_LOCK(bo);
3492	vn_syncer_add_to_worklist(bo, syncdelay);
3493	BO_UNLOCK(bo);
3494
3495	/*
3496	 * Walk the list of vnodes pushing all that are dirty and
3497	 * not already on the sync list.
3498	 */
3499	mtx_lock(&mountlist_mtx);
3500	if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
3501		mtx_unlock(&mountlist_mtx);
3502		return (0);
3503	}
3504	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3505		vfs_unbusy(mp);
3506		return (0);
3507	}
3508	MNT_ILOCK(mp);
3509	mp->mnt_noasync++;
3510	mp->mnt_kern_flag &= ~MNTK_ASYNC;
3511	MNT_IUNLOCK(mp);
3512	vfs_msync(mp, MNT_NOWAIT);
3513	error = VFS_SYNC(mp, MNT_LAZY);
3514	MNT_ILOCK(mp);
3515	mp->mnt_noasync--;
3516	if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
3517		mp->mnt_kern_flag |= MNTK_ASYNC;
3518	MNT_IUNLOCK(mp);
3519	vn_finished_write(mp);
3520	vfs_unbusy(mp);
3521	return (error);
3522}
3523
3524/*
3525 * The syncer vnode is no longer referenced.
3526 */
3527static int
3528sync_inactive(struct vop_inactive_args *ap)
3529{
3530
3531	vgone(ap->a_vp);
3532	return (0);
3533}
3534
3535/*
3536 * The syncer vnode is no longer needed and is being decommissioned.
3537 *
3538 * Modifications to the worklist must be protected by sync_mtx.
3539 */
3540static int
3541sync_reclaim(struct vop_reclaim_args *ap)
3542{
3543	struct vnode *vp = ap->a_vp;
3544	struct bufobj *bo;
3545
3546	bo = &vp->v_bufobj;
3547	BO_LOCK(bo);
3548	mtx_lock(&sync_mtx);
3549	if (vp->v_mount->mnt_syncer == vp)
3550		vp->v_mount->mnt_syncer = NULL;
3551	if (bo->bo_flag & BO_ONWORKLST) {
3552		LIST_REMOVE(bo, bo_synclist);
3553		syncer_worklist_len--;
3554		sync_vnode_count--;
3555		bo->bo_flag &= ~BO_ONWORKLST;
3556	}
3557	mtx_unlock(&sync_mtx);
3558	BO_UNLOCK(bo);
3559
3560	return (0);
3561}
3562
3563/*
3564 * Check if vnode represents a disk device
3565 */
3566int
3567vn_isdisk(struct vnode *vp, int *errp)
3568{
3569	int error;
3570
3571	error = 0;
3572	dev_lock();
3573	if (vp->v_type != VCHR)
3574		error = ENOTBLK;
3575	else if (vp->v_rdev == NULL)
3576		error = ENXIO;
3577	else if (vp->v_rdev->si_devsw == NULL)
3578		error = ENXIO;
3579	else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
3580		error = ENOTBLK;
3581	dev_unlock();
3582	if (errp != NULL)
3583		*errp = error;
3584	return (error == 0);
3585}
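
/*
 * Typical use, sketched ("devvp" being the caller's device vnode): callers
 * that require a disk device check the boolean result and propagate the
 * detailed errno through the second argument:
 *
 *	if (!vn_isdisk(devvp, &error))
 *		return (error);
 */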
3586
3587/*
3588 * Common filesystem object access control check routine.  Accepts a
3589 * vnode's type, "mode", uid and gid, requested access mode, credentials,
3590 * and optional call-by-reference privused argument allowing vaccess()
3591 * to indicate to the caller whether privilege was used to satisfy the
3592 * request (obsoleted).  Returns 0 on success, or an errno on failure.
3593 */
3594int
3595vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
3596    accmode_t accmode, struct ucred *cred, int *privused)
3597{
3598	accmode_t dac_granted;
3599	accmode_t priv_granted;
3600
3601	KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
3602	    ("invalid bit in accmode"));
3603	KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
3604	    ("VAPPEND without VWRITE"));
3605
3606	/*
3607	 * Look for a normal, non-privileged way to access the file/directory
3608	 * as requested.  If it exists, go with that.
3609	 */
3610
3611	if (privused != NULL)
3612		*privused = 0;
3613
3614	dac_granted = 0;
3615
3616	/* Check the owner. */
3617	if (cred->cr_uid == file_uid) {
3618		dac_granted |= VADMIN;
3619		if (file_mode & S_IXUSR)
3620			dac_granted |= VEXEC;
3621		if (file_mode & S_IRUSR)
3622			dac_granted |= VREAD;
3623		if (file_mode & S_IWUSR)
3624			dac_granted |= (VWRITE | VAPPEND);
3625
3626		if ((accmode & dac_granted) == accmode)
3627			return (0);
3628
3629		goto privcheck;
3630	}
3631
3632	/* Otherwise, check the groups (first match) */
3633	if (groupmember(file_gid, cred)) {
3634		if (file_mode & S_IXGRP)
3635			dac_granted |= VEXEC;
3636		if (file_mode & S_IRGRP)
3637			dac_granted |= VREAD;
3638		if (file_mode & S_IWGRP)
3639			dac_granted |= (VWRITE | VAPPEND);
3640
3641		if ((accmode & dac_granted) == accmode)
3642			return (0);
3643
3644		goto privcheck;
3645	}
3646
3647	/* Otherwise, check everyone else. */
3648	if (file_mode & S_IXOTH)
3649		dac_granted |= VEXEC;
3650	if (file_mode & S_IROTH)
3651		dac_granted |= VREAD;
3652	if (file_mode & S_IWOTH)
3653		dac_granted |= (VWRITE | VAPPEND);
3654	if ((accmode & dac_granted) == accmode)
3655		return (0);
3656
3657privcheck:
3658	/*
3659	 * Build a privilege mask to determine if the set of privileges
3660	 * satisfies the requirements when combined with the granted mask
3661	 * from above.  For each privilege, if the privilege is required,
3662	 * bitwise or the request type onto the priv_granted mask.
3663	 */
3664	priv_granted = 0;
3665
3666	if (type == VDIR) {
3667		/*
3668		 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
3669		 * requests, instead of PRIV_VFS_EXEC.
3670		 */
3671		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3672		    !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
3673			priv_granted |= VEXEC;
3674	} else {
3675		/*
3676		 * Ensure that at least one execute bit is on. Otherwise,
3677		 * a privileged user will always succeed, and we don't want
3678		 * this to happen unless the file really is executable.
3679		 */
3680		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3681		    (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
3682		    !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
3683			priv_granted |= VEXEC;
3684	}
3685
3686	if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
3687	    !priv_check_cred(cred, PRIV_VFS_READ, 0))
3688		priv_granted |= VREAD;
3689
3690	if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3691	    !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
3692		priv_granted |= (VWRITE | VAPPEND);
3693
3694	if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3695	    !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
3696		priv_granted |= VADMIN;
3697
3698	if ((accmode & (priv_granted | dac_granted)) == accmode) {
3699		/* XXX audit: privilege used */
3700		if (privused != NULL)
3701			*privused = 1;
3702		return (0);
3703	}
3704
3705	return ((accmode & VADMIN) ? EPERM : EACCES);
3706}
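
/*
 * A filesystem's VOP_ACCESS() implementation typically ends by handing the
 * decision to vaccess() with the attributes it keeps for the node; roughly
 * (the "ip" fields below are illustrative of a UFS-like inode):
 *
 *	return (vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
 *	    ap->a_accmode, ap->a_cred, NULL));
 */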
3707
3708/*
3709 * Credential check based on process requesting service, and per-attribute
3710 * permissions.
3711 */
3712int
3713extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
3714    struct thread *td, accmode_t accmode)
3715{
3716
3717	/*
3718	 * Kernel-invoked requests always succeed.
3719	 */
3720	if (cred == NOCRED)
3721		return (0);
3722
3723	/*
3724	 * Do not allow privileged processes in jail to directly manipulate
3725	 * system attributes.
3726	 */
3727	switch (attrnamespace) {
3728	case EXTATTR_NAMESPACE_SYSTEM:
3729		/* Potentially should be: return (EPERM); */
3730		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
3731	case EXTATTR_NAMESPACE_USER:
3732		return (VOP_ACCESS(vp, accmode, cred, td));
3733	default:
3734		return (EPERM);
3735	}
3736}
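
/*
 * Filesystems that implement extended attributes call this before touching
 * the backing store, e.g. (sketch; VREAD for retrieval, VWRITE for set or
 * delete):
 *
 *	error = extattr_check_cred(vp, attrnamespace, cred, td, VREAD);
 *	if (error != 0)
 *		return (error);
 */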
3737
3738#ifdef DEBUG_VFS_LOCKS
3739/*
3740 * This only exists to suppress warnings from unlocked specfs accesses.  It is
3741 * no longer ok to have an unlocked VFS.
3742 */
3743#define	IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL ||		\
3744	(vp)->v_type == VCHR ||	(vp)->v_type == VBAD)
3745
3746int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
3747SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
3748    "Drop into debugger on lock violation");
3749
3750int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
3751SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
3752    0, "Check for interlock across VOPs");
3753
3754int vfs_badlock_print = 1;	/* Print lock violations. */
3755SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
3756    0, "Print lock violations");
3757
3758#ifdef KDB
3759int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
3760SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
3761    &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
3762#endif
3763
3764static void
3765vfs_badlock(const char *msg, const char *str, struct vnode *vp)
3766{
3767
3768#ifdef KDB
3769	if (vfs_badlock_backtrace)
3770		kdb_backtrace();
3771#endif
3772	if (vfs_badlock_print)
3773		printf("%s: %p %s\n", str, (void *)vp, msg);
3774	if (vfs_badlock_ddb)
3775		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
3776}
3777
3778void
3779assert_vi_locked(struct vnode *vp, const char *str)
3780{
3781
3782	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
3783		vfs_badlock("interlock is not locked but should be", str, vp);
3784}
3785
3786void
3787assert_vi_unlocked(struct vnode *vp, const char *str)
3788{
3789
3790	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
3791		vfs_badlock("interlock is locked but should not be", str, vp);
3792}
3793
3794void
3795assert_vop_locked(struct vnode *vp, const char *str)
3796{
3797
3798	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == 0)
3799		vfs_badlock("is not locked but should be", str, vp);
3800}
3801
3802void
3803assert_vop_unlocked(struct vnode *vp, const char *str)
3804{
3805
3806	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
3807		vfs_badlock("is locked but should not be", str, vp);
3808}
3809
3810void
3811assert_vop_elocked(struct vnode *vp, const char *str)
3812{
3813
3814	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
3815		vfs_badlock("is not exclusively locked but should be", str, vp);
3816}
3817
3818#if 0
3819void
3820assert_vop_elocked_other(struct vnode *vp, const char *str)
3821{
3822
3823	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER)
3824		vfs_badlock("is not exclusively locked by another thread",
3825		    str, vp);
3826}
3827
3828void
3829assert_vop_slocked(struct vnode *vp, const char *str)
3830{
3831
3832	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED)
3833		vfs_badlock("is not locked shared but should be", str, vp);
3834}
3835#endif /* 0 */
3836#endif /* DEBUG_VFS_LOCKS */
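/*
 * Illustrative sketch (not compiled): filesystem code states its locking
 * expectations through the ASSERT_* macros backed by the helpers above;
 * with DEBUG_VFS_LOCKS disabled they compile away.  The function shown is
 * hypothetical.
 */
#if 0
static void
examplefs_do_locked_work(struct vnode *vp)
{

	ASSERT_VOP_ELOCKED(vp, "examplefs_do_locked_work");
	ASSERT_VI_UNLOCKED(vp, "examplefs_do_locked_work");
	/* ... operate on the vnode while the exclusive lock is held ... */
}
#endif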
3837
3838void
3839vop_rename_fail(struct vop_rename_args *ap)
3840{
3841
3842	if (ap->a_tvp != NULL)
3843		vput(ap->a_tvp);
3844	if (ap->a_tdvp == ap->a_tvp)
3845		vrele(ap->a_tdvp);
3846	else
3847		vput(ap->a_tdvp);
3848	vrele(ap->a_fdvp);
3849	vrele(ap->a_fvp);
3850}
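/*
 * Illustrative sketch (not compiled): a VOP_RENAME() implementation that
 * bails out early must drop the references and locks it was handed, which
 * is exactly what vop_rename_fail() does.  The cross-mount check is just
 * one example of such an early exit.
 */
#if 0
static int
examplefs_rename(struct vop_rename_args *ap)
{

	if (ap->a_fvp->v_mount != ap->a_tdvp->v_mount) {
		vop_rename_fail(ap);
		return (EXDEV);
	}
	/* ... perform the actual rename ... */
	return (0);
}
#endif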
3851
3852void
3853vop_rename_pre(void *ap)
3854{
3855	struct vop_rename_args *a = ap;
3856
3857#ifdef DEBUG_VFS_LOCKS
3858	if (a->a_tvp)
3859		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
3860	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
3861	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
3862	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
3863
3864	/* Check the source (from). */
3865	if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
3866	    (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
3867		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
3868	if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
3869		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
3870
3871	/* Check the target. */
3872	if (a->a_tvp)
3873		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
3874	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
3875#endif
3876	if (a->a_tdvp != a->a_fdvp)
3877		vhold(a->a_fdvp);
3878	if (a->a_tvp != a->a_fvp)
3879		vhold(a->a_fvp);
3880	vhold(a->a_tdvp);
3881	if (a->a_tvp)
3882		vhold(a->a_tvp);
3883}
3884
3885void
3886vop_strategy_pre(void *ap)
3887{
3888#ifdef DEBUG_VFS_LOCKS
3889	struct vop_strategy_args *a;
3890	struct buf *bp;
3891
3892	a = ap;
3893	bp = a->a_bp;
3894
3895	/*
3896	 * Cluster ops lock their component buffers but not the I/O container.
3897	 */
3898	if ((bp->b_flags & B_CLUSTER) != 0)
3899		return;
3900
3901	if (panicstr == NULL && !BUF_ISLOCKED(bp)) {
3902		if (vfs_badlock_print)
3903			printf(
3904			    "VOP_STRATEGY: bp is not locked but should be\n");
3905		if (vfs_badlock_ddb)
3906			kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
3907	}
3908#endif
3909}
3910
3911void
3912vop_lookup_pre(void *ap)
3913{
3914#ifdef DEBUG_VFS_LOCKS
3915	struct vop_lookup_args *a;
3916	struct vnode *dvp;
3917
3918	a = ap;
3919	dvp = a->a_dvp;
3920	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
3921	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
3922#endif
3923}
3924
3925void
3926vop_lookup_post(void *ap, int rc)
3927{
3928#ifdef DEBUG_VFS_LOCKS
3929	struct vop_lookup_args *a;
3930	struct vnode *dvp;
3931	struct vnode *vp;
3932
3933	a = ap;
3934	dvp = a->a_dvp;
3935	vp = *(a->a_vpp);
3936
3937	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
3938	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
3939
3940	if (!rc)
3941		ASSERT_VOP_LOCKED(vp, "VOP_LOOKUP (child)");
3942#endif
3943}
3944
3945void
3946vop_lock_pre(void *ap)
3947{
3948#ifdef DEBUG_VFS_LOCKS
3949	struct vop_lock1_args *a = ap;
3950
3951	if ((a->a_flags & LK_INTERLOCK) == 0)
3952		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
3953	else
3954		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
3955#endif
3956}
3957
3958void
3959vop_lock_post(void *ap, int rc)
3960{
3961#ifdef DEBUG_VFS_LOCKS
3962	struct vop_lock1_args *a = ap;
3963
3964	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
3965	if (rc == 0)
3966		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
3967#endif
3968}
3969
3970void
3971vop_unlock_pre(void *ap)
3972{
3973#ifdef DEBUG_VFS_LOCKS
3974	struct vop_unlock_args *a = ap;
3975
3976	if (a->a_flags & LK_INTERLOCK)
3977		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
3978	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
3979#endif
3980}
3981
3982void
3983vop_unlock_post(void *ap, int rc)
3984{
3985#ifdef DEBUG_VFS_LOCKS
3986	struct vop_unlock_args *a = ap;
3987
3988	if (a->a_flags & LK_INTERLOCK)
3989		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
3990#endif
3991}
3992
3993void
3994vop_create_post(void *ap, int rc)
3995{
3996	struct vop_create_args *a = ap;
3997
3998	if (!rc)
3999		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4000}
4001
4002void
4003vop_link_post(void *ap, int rc)
4004{
4005	struct vop_link_args *a = ap;
4006
4007	if (!rc) {
4008		VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
4009		VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
4010	}
4011}
4012
4013void
4014vop_mkdir_post(void *ap, int rc)
4015{
4016	struct vop_mkdir_args *a = ap;
4017
4018	if (!rc)
4019		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4020}
4021
4022void
4023vop_mknod_post(void *ap, int rc)
4024{
4025	struct vop_mknod_args *a = ap;
4026
4027	if (!rc)
4028		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4029}
4030
4031void
4032vop_remove_post(void *ap, int rc)
4033{
4034	struct vop_remove_args *a = ap;
4035
4036	if (!rc) {
4037		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4038		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4039	}
4040}
4041
4042void
4043vop_rename_post(void *ap, int rc)
4044{
4045	struct vop_rename_args *a = ap;
4046
4047	if (!rc) {
4048		VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
4049		VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
4050		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
4051		if (a->a_tvp)
4052			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
4053	}
4054	if (a->a_tdvp != a->a_fdvp)
4055		vdrop(a->a_fdvp);
4056	if (a->a_tvp != a->a_fvp)
4057		vdrop(a->a_fvp);
4058	vdrop(a->a_tdvp);
4059	if (a->a_tvp)
4060		vdrop(a->a_tvp);
4061}
4062
4063void
4064vop_rmdir_post(void *ap, int rc)
4065{
4066	struct vop_rmdir_args *a = ap;
4067
4068	if (!rc) {
4069		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4070		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4071	}
4072}
4073
4074void
4075vop_setattr_post(void *ap, int rc)
4076{
4077	struct vop_setattr_args *a = ap;
4078
4079	if (!rc)
4080		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4081}
4082
4083void
4084vop_symlink_post(void *ap, int rc)
4085{
4086	struct vop_symlink_args *a = ap;
4087
4088	if (!rc)
4089		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4090}
4091
4092static struct knlist fs_knlist;
4093
4094static void
4095vfs_event_init(void *arg)
4096{
4097	knlist_init_mtx(&fs_knlist, NULL);
4098}
4099/* XXX - correct order? */
4100SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
4101
4102void
4103vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
4104{
4105
4106	KNOTE_UNLOCKED(&fs_knlist, event);
4107}
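/*
 * Illustrative sketch (not compiled): kernel code reports filesystem-wide
 * events, which userland consumes via EVFILT_FS, by calling
 * vfs_event_signal() with a VQ_* flag from <sys/mount.h>; "mp" is assumed
 * to be a struct mount pointer already in scope.
 */
#if 0
	/* Tell listeners that the filesystem behind "mp" stopped responding. */
	vfs_event_signal(&mp->mnt_stat.f_fsid, VQ_NOTRESP, 0);
#endif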
4108
4109static int	filt_fsattach(struct knote *kn);
4110static void	filt_fsdetach(struct knote *kn);
4111static int	filt_fsevent(struct knote *kn, long hint);
4112
4113struct filterops fs_filtops = {
4114	.f_isfd = 0,
4115	.f_attach = filt_fsattach,
4116	.f_detach = filt_fsdetach,
4117	.f_event = filt_fsevent
4118};
4119
4120static int
4121filt_fsattach(struct knote *kn)
4122{
4123
4124	kn->kn_flags |= EV_CLEAR;
4125	knlist_add(&fs_knlist, kn, 0);
4126	return (0);
4127}
4128
4129static void
4130filt_fsdetach(struct knote *kn)
4131{
4132
4133	knlist_remove(&fs_knlist, kn, 0);
4134}
4135
4136static int
4137filt_fsevent(struct knote *kn, long hint)
4138{
4139
4140	kn->kn_fflags |= hint;
4141	return (kn->kn_fflags != 0);
4142}
4143
4144static int
4145sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
4146{
4147	struct vfsidctl vc;
4148	int error;
4149	struct mount *mp;
4150
4151	error = SYSCTL_IN(req, &vc, sizeof(vc));
4152	if (error)
4153		return (error);
4154	if (vc.vc_vers != VFS_CTL_VERS1)
4155		return (EINVAL);
4156	mp = vfs_getvfs(&vc.vc_fsid);
4157	if (mp == NULL)
4158		return (ENOENT);
4159	/* Ensure that a specific sysctl goes to the right filesystem. */
4160	if (strcmp(vc.vc_fstypename, "*") != 0 &&
4161	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
4162		vfs_rel(mp);
4163		return (EINVAL);
4164	}
4165	VCTLTOREQ(&vc, req);
4166	error = VFS_SYSCTL(mp, vc.vc_op, req);
4167	vfs_rel(mp);
4168	return (error);
4169}
4170
4171SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
4172    NULL, 0, sysctl_vfs_ctl, "",
4173    "Sysctl by fsid");
4174
4175/*
4176 * Function to initialize a va_filerev field sensibly.
4177 * XXX: Wouldn't a random number make a lot more sense ??
4178 */
4179u_quad_t
4180init_va_filerev(void)
4181{
4182	struct bintime bt;
4183
4184	getbinuptime(&bt);
4185	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
4186}
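/*
 * Illustrative sketch (not compiled): a filesystem typically stamps a newly
 * created node with init_va_filerev() and bumps the value on each later
 * modification, so readers of va_filerev can tell that the file changed.
 * The structure and field names are hypothetical.
 */
#if 0
static void
examplefs_node_create(struct examplefs_node *ip)
{

	ip->i_filerev = init_va_filerev();	/* initial file revision */
}
#endif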
4187
4188static int	filt_vfsread(struct knote *kn, long hint);
4189static int	filt_vfswrite(struct knote *kn, long hint);
4190static int	filt_vfsvnode(struct knote *kn, long hint);
4191static void	filt_vfsdetach(struct knote *kn);
4192static struct filterops vfsread_filtops = {
4193	.f_isfd = 1,
4194	.f_detach = filt_vfsdetach,
4195	.f_event = filt_vfsread
4196};
4197static struct filterops vfswrite_filtops = {
4198	.f_isfd = 1,
4199	.f_detach = filt_vfsdetach,
4200	.f_event = filt_vfswrite
4201};
4202static struct filterops vfsvnode_filtops = {
4203	.f_isfd = 1,
4204	.f_detach = filt_vfsdetach,
4205	.f_event = filt_vfsvnode
4206};
4207
4208static void
4209vfs_knllock(void *arg)
4210{
4211	struct vnode *vp = arg;
4212
4213	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4214}
4215
4216static void
4217vfs_knlunlock(void *arg)
4218{
4219	struct vnode *vp = arg;
4220
4221	VOP_UNLOCK(vp, 0);
4222}
4223
4224static void
4225vfs_knl_assert_locked(void *arg)
4226{
4227#ifdef DEBUG_VFS_LOCKS
4228	struct vnode *vp = arg;
4229
4230	ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
4231#endif
4232}
4233
4234static void
4235vfs_knl_assert_unlocked(void *arg)
4236{
4237#ifdef DEBUG_VFS_LOCKS
4238	struct vnode *vp = arg;
4239
4240	ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
4241#endif
4242}
4243
4244int
4245vfs_kqfilter(struct vop_kqfilter_args *ap)
4246{
4247	struct vnode *vp = ap->a_vp;
4248	struct knote *kn = ap->a_kn;
4249	struct knlist *knl;
4250
4251	switch (kn->kn_filter) {
4252	case EVFILT_READ:
4253		kn->kn_fop = &vfsread_filtops;
4254		break;
4255	case EVFILT_WRITE:
4256		kn->kn_fop = &vfswrite_filtops;
4257		break;
4258	case EVFILT_VNODE:
4259		kn->kn_fop = &vfsvnode_filtops;
4260		break;
4261	default:
4262		return (EINVAL);
4263	}
4264
4265	kn->kn_hook = (caddr_t)vp;
4266
4267	v_addpollinfo(vp);
4268	if (vp->v_pollinfo == NULL)
4269		return (ENOMEM);
4270	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
4271	knlist_add(knl, kn, 0);
4272
4273	return (0);
4274}
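/*
 * Illustrative sketch (not compiled): a filesystem that is content with the
 * generic knote handling above can point its vop_vector entry directly at
 * vfs_kqfilter().  The vector shown is hypothetical and heavily trimmed.
 */
#if 0
static struct vop_vector examplefs_vnodeops = {
	.vop_default =	&default_vnodeops,
	.vop_kqfilter =	vfs_kqfilter,
};
#endif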
4275
4276/*
4277 * Detach the knote from the vnode.
4278 */
4279static void
4280filt_vfsdetach(struct knote *kn)
4281{
4282	struct vnode *vp = (struct vnode *)kn->kn_hook;
4283
4284	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
4285	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
4286}
4287
4288/*ARGSUSED*/
4289static int
4290filt_vfsread(struct knote *kn, long hint)
4291{
4292	struct vnode *vp = (struct vnode *)kn->kn_hook;
4293	struct vattr va;
4294	int res;
4295
4296	/*
4297	 * The filesystem is gone, so set the EOF flag and schedule
4298	 * the knote for deletion.
4299	 */
4300	if (hint == NOTE_REVOKE) {
4301		VI_LOCK(vp);
4302		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4303		VI_UNLOCK(vp);
4304		return (1);
4305	}
4306
4307	if (VOP_GETATTR(vp, &va, curthread->td_ucred))
4308		return (0);
4309
4310	VI_LOCK(vp);
4311	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
4312	res = (kn->kn_data != 0);
4313	VI_UNLOCK(vp);
4314	return (res);
4315}
4316
4317/*ARGSUSED*/
4318static int
4319filt_vfswrite(struct knote *kn, long hint)
4320{
4321	struct vnode *vp = (struct vnode *)kn->kn_hook;
4322
4323	VI_LOCK(vp);
4324
4325	/*
4326	 * The filesystem is gone, so set the EOF flag and schedule
4327	 * the knote for deletion.
4328	 */
4329	if (hint == NOTE_REVOKE)
4330		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4331
4332	kn->kn_data = 0;
4333	VI_UNLOCK(vp);
4334	return (1);
4335}
4336
4337static int
4338filt_vfsvnode(struct knote *kn, long hint)
4339{
4340	struct vnode *vp = (struct vnode *)kn->kn_hook;
4341	int res;
4342
4343	VI_LOCK(vp);
4344	if (kn->kn_sfflags & hint)
4345		kn->kn_fflags |= hint;
4346	if (hint == NOTE_REVOKE) {
4347		kn->kn_flags |= EV_EOF;
4348		VI_UNLOCK(vp);
4349		return (1);
4350	}
4351	res = (kn->kn_fflags != 0);
4352	VI_UNLOCK(vp);
4353	return (res);
4354}
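/*
 * Illustrative sketch (not compiled, userland): the three filters above back
 * the EVFILT_READ/EVFILT_WRITE/EVFILT_VNODE kevent filters on vnodes.  A
 * monitoring program would register interest roughly like this (headers and
 * error handling omitted for brevity).
 */
#if 0
static void
watch_file_example(const char *path)
{
	struct kevent kev;
	int fd, kq;

	kq = kqueue();
	fd = open(path, O_RDONLY);
	EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
	    NOTE_WRITE | NOTE_DELETE | NOTE_RENAME, 0, NULL);
	(void)kevent(kq, &kev, 1, NULL, 0, NULL);
}
#endif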
4355
4356int
4357vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
4358{
4359	int error;
4360
4361	if (dp->d_reclen > ap->a_uio->uio_resid)
4362		return (ENAMETOOLONG);
4363	error = uiomove(dp, dp->d_reclen, ap->a_uio);
4364	if (error) {
4365		if (ap->a_ncookies != NULL) {
4366			if (ap->a_cookies != NULL)
4367				free(ap->a_cookies, M_TEMP);
4368			ap->a_cookies = NULL;
4369			*ap->a_ncookies = 0;
4370		}
4371		return (error);
4372	}
4373	if (ap->a_ncookies == NULL)
4374		return (0);
4375
4376	KASSERT(ap->a_cookies,
4377	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
4378
4379	*ap->a_cookies = realloc(*ap->a_cookies,
4380	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
4381	(*ap->a_cookies)[*ap->a_ncookies] = off;
4382	return (0);
4383}
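/*
 * Illustrative sketch (not compiled): a synthetic filesystem's VOP_READDIR()
 * feeds each entry it generates through vfs_read_dirent(), which copies it
 * to userland and records the seek cookie.  Only a single "." entry is shown
 * and the function name is hypothetical.
 */
#if 0
static int
examplefs_readdir(struct vop_readdir_args *ap)
{
	struct dirent de;
	int error;

	bzero(&de, sizeof(de));
	de.d_fileno = 2;
	de.d_type = DT_DIR;
	de.d_namlen = 1;
	strcpy(de.d_name, ".");
	de.d_reclen = GENERIC_DIRSIZ(&de);
	error = vfs_read_dirent(ap, &de, ap->a_uio->uio_offset);
	if (error == ENAMETOOLONG)
		error = 0;	/* The caller's buffer is full; stop here. */
	return (error);
}
#endif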
4384
4385/*
4386 * Mark the access time of the file for update if the filesystem
4387 * supports VOP_MARKATIME.  This functionality is used by execve and
4388 * mmap, so for efficiency we want to avoid the I/O implied by
4389 * directly setting va_atime.
4390 */
4391void
4392vfs_mark_atime(struct vnode *vp, struct ucred *cred)
4393{
4394	struct mount *mp;
4395
4396	mp = vp->v_mount;
4397	VFS_ASSERT_GIANT(mp);
4398	ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
4399	if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
4400		(void)VOP_MARKATIME(vp);
4401}
4402
4403/*
4404 * The purpose of this routine is to remove granularity from accmode_t,
4405 * reducing it to the standard UNIX access bits - VEXEC, VREAD, VWRITE,
4406 * VADMIN and VAPPEND.
4407 *
4408 * If it returns 0, the caller is supposed to continue with the usual
4409 * access checks using 'accmode' as modified by this routine.  If it
4410 * returns a nonzero value, the caller is supposed to return that value
4411 * as errno.
4412 *
4413 * Note that after this routine runs, accmode may be zero.
4414 */
4415int
4416vfs_unixify_accmode(accmode_t *accmode)
4417{
4418	/*
4419	 * There is no way to specify an explicit "deny" rule using
4420	 * file mode or POSIX.1e ACLs.
4421	 */
4422	if (*accmode & VEXPLICIT_DENY) {
4423		*accmode = 0;
4424		return (0);
4425	}
4426
4427	/*
4428	 * None of these can be translated into the usual access bits.
4429	 * Also, the common case for NFSv4 ACLs is to not contain
4430	 * either of these bits.  The caller should check for VWRITE
4431	 * on the containing directory instead.
4432	 */
4433	if (*accmode & (VDELETE_CHILD | VDELETE))
4434		return (EPERM);
4435
4436	if (*accmode & VADMIN_PERMS) {
4437		*accmode &= ~VADMIN_PERMS;
4438		*accmode |= VADMIN;
4439	}
4440
4441	/*
4442	 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
4443	 * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
4444	 */
4445	*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
4446
4447	return (0);
4448}
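/*
 * Illustrative sketch (not compiled): a filesystem that only implements
 * classic UNIX permissions but may be handed NFSv4-style accmode_t bits
 * calls vfs_unixify_accmode() first and then runs its ordinary permission
 * checks on whatever remains.  The helper called at the end is hypothetical.
 */
#if 0
static int
examplefs_nfs4_access(struct vop_access_args *ap)
{
	accmode_t accmode = ap->a_accmode;
	int error;

	error = vfs_unixify_accmode(&accmode);
	if (error != 0)
		return (error);
	if (accmode == 0)
		return (0);
	return (examplefs_check_unix_bits(ap->a_vp, accmode, ap->a_cred));
}
#endif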
4449