vfs_subr.c revision 154152
1/*-
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
35 */
36
37/*
38 * External virtual filesystem routines
39 */
40
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD: head/sys/kern/vfs_subr.c 154152 2006-01-09 20:42:19Z tegge $");
43
44#include "opt_ddb.h"
45#include "opt_mac.h"
46
47#include <sys/param.h>
48#include <sys/systm.h>
49#include <sys/bio.h>
50#include <sys/buf.h>
51#include <sys/conf.h>
52#include <sys/dirent.h>
53#include <sys/event.h>
54#include <sys/eventhandler.h>
55#include <sys/extattr.h>
56#include <sys/file.h>
57#include <sys/fcntl.h>
58#include <sys/kdb.h>
59#include <sys/kernel.h>
60#include <sys/kthread.h>
61#include <sys/mac.h>
62#include <sys/malloc.h>
63#include <sys/mount.h>
64#include <sys/namei.h>
65#include <sys/reboot.h>
66#include <sys/sleepqueue.h>
67#include <sys/stat.h>
68#include <sys/sysctl.h>
69#include <sys/syslog.h>
70#include <sys/vmmeter.h>
71#include <sys/vnode.h>
72
73#include <machine/stdarg.h>
74
75#include <vm/vm.h>
76#include <vm/vm_object.h>
77#include <vm/vm_extern.h>
78#include <vm/pmap.h>
79#include <vm/vm_map.h>
80#include <vm/vm_page.h>
81#include <vm/vm_kern.h>
82#include <vm/uma.h>
83
84static MALLOC_DEFINE(M_NETADDR, "subr_export_host", "Export host address structure");
85
86static void	delmntque(struct vnode *vp);
87static void	insmntque(struct vnode *vp, struct mount *mp);
88static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
89		    int slpflag, int slptimeo);
90static void	syncer_shutdown(void *arg, int howto);
91static int	vtryrecycle(struct vnode *vp);
92static void	vbusy(struct vnode *vp);
93static void	vdropl(struct vnode *vp);
94static void	vinactive(struct vnode *, struct thread *);
95static void	v_incr_usecount(struct vnode *);
96static void	v_decr_usecount(struct vnode *);
97static void	v_decr_useonly(struct vnode *);
98static void	vfree(struct vnode *);
99static void	vnlru_free(int);
100static void	vdestroy(struct vnode *);
101static void	vgonel(struct vnode *);
102static void	vfs_knllock(void *arg);
103static void	vfs_knlunlock(void *arg);
104static int	vfs_knllocked(void *arg);
105
106
107/*
108 * Enable Giant pushdown based on whether or not the vm is mpsafe in this
109 * build.  Without mpsafevm the buffer cache cannot run Giant-free.
110 */
111#if defined(__alpha__) || defined(__amd64__) || defined(__i386__) || \
112	defined(__ia64__) || defined(__sparc64__)
113int mpsafe_vfs = 1;
114#else
115int mpsafe_vfs;
116#endif
117TUNABLE_INT("debug.mpsafevfs", &mpsafe_vfs);
118SYSCTL_INT(_debug, OID_AUTO, mpsafevfs, CTLFLAG_RD, &mpsafe_vfs, 0,
119    "MPSAFE VFS");
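/*
 * Usage sketch (not part of this file): because the sysctl above is
 * read-only, the knob can only be changed at boot time via the loader
 * tunable, e.g. in /boot/loader.conf:
 *
 *	debug.mpsafevfs="0"
 */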
120
121/*
122 * Number of vnodes in existence.  Increased whenever getnewvnode()
123 * allocates a new vnode and decreased whenever vdestroy() frees one.
124 */
125static unsigned long	numvnodes;
126
127SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
128
129/*
130 * Conversion tables for conversion from vnode types to inode formats
131 * and back.
132 */
133enum vtype iftovt_tab[16] = {
134	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
135	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
136};
137int vttoif_tab[10] = {
138	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
139	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
140};
141
142/*
143 * List of vnodes that are ready for recycling.
144 */
145static TAILQ_HEAD(freelst, vnode) vnode_free_list;
146
147/*
148 * Free vnode target.  Free vnodes may simply be files which have been stat'd
149 * but not read.  This is somewhat common, and a small cache of such files
150 * should be kept to avoid recreation costs.
151 */
152static u_long wantfreevnodes;
153SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
154/* Number of vnodes in the free list. */
155static u_long freevnodes;
156SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
157
158/*
159 * Various variables used for debugging the new implementation of
160 * reassignbuf().
161 * XXX these are probably of (very) limited utility now.
162 */
163static int reassignbufcalls;
164SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
165
166/*
167 * Cache for the mount type id assigned to NFS.  This is used for
168 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
169 */
170int	nfs_mount_type = -1;
171
172/* To keep more than one thread at a time from running vfs_getnewfsid */
173static struct mtx mntid_mtx;
174
175/*
176 * Lock for any access to the following:
177 *	vnode_free_list
178 *	numvnodes
179 *	freevnodes
180 */
181static struct mtx vnode_free_list_mtx;
182
183/* Publicly exported FS */
184struct nfs_public nfs_pub;
185
186/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
187static uma_zone_t vnode_zone;
188static uma_zone_t vnodepoll_zone;
189
190/* Set to 1 to print out reclaim of active vnodes */
191int	prtactive;
192
193/*
194 * The workitem queue.
195 *
196 * It is useful to delay writes of file data and filesystem metadata
197 * for tens of seconds so that quickly created and deleted files need
198 * not waste disk bandwidth being created and removed. To realize this,
199 * we append vnodes to a "workitem" queue. When running with a soft
200 * updates implementation, most pending metadata dependencies should
201 * not wait for more than a few seconds. Thus, metadata written to block
202 * devices is delayed only about half the time that file data is delayed.
203 * Similarly, directory updates are more critical, so they are delayed
204 * only about a third of the time that file data is delayed. Thus, there are
205 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
206 * one each second (driven off the filesystem syncer process). The
207 * syncer_delayno variable indicates the next queue that is to be processed.
208 * Items that need to be processed soon are placed in this queue:
209 *
210 *	syncer_workitem_pending[syncer_delayno]
211 *
212 * A delay of fifteen seconds is done by placing the request fifteen
213 * entries later in the queue:
214 *
215 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
216 *
217 */
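/*
 * Worked example (values illustrative): hashinit(syncer_maxdelay, ...) in
 * vntblinit() yields syncer_mask == 31 when SYNCER_MAXDELAY is 32.  If
 * syncer_delayno is currently 20, a request with a delay of 15 lands in
 * slot (20 + 15) & 31 == 3, and since the syncer advances one slot per
 * second it is serviced roughly 15 seconds later.
 */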
218static int syncer_delayno;
219static long syncer_mask;
220LIST_HEAD(synclist, bufobj);
221static struct synclist *syncer_workitem_pending;
222/*
223 * The sync_mtx protects:
224 *	bo->bo_synclist
225 *	sync_vnode_count
226 *	syncer_delayno
227 *	syncer_state
228 *	syncer_workitem_pending
229 *	syncer_worklist_len
230 *	rushjob
231 */
232static struct mtx sync_mtx;
233
234#define SYNCER_MAXDELAY		32
235static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
236static int syncdelay = 30;		/* max time to delay syncing data */
237static int filedelay = 30;		/* time to delay syncing files */
238SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
239static int dirdelay = 29;		/* time to delay syncing directories */
240SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
241static int metadelay = 28;		/* time to delay syncing metadata */
242SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
243static int rushjob;		/* number of slots to run ASAP */
244static int stat_rush_requests;	/* number of times I/O speeded up */
245SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
246
247/*
248 * When shutting down the syncer, run it at four times normal speed.
249 */
250#define SYNCER_SHUTDOWN_SPEEDUP		4
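/*
 * (This is realized in sched_sync() by sleeping only hz /
 * SYNCER_SHUTDOWN_SPEEDUP ticks between passes once shutdown has begun.)
 */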
251static int sync_vnode_count;
252static int syncer_worklist_len;
253static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
254    syncer_state;
255
256/*
257 * Number of vnodes we want to exist at any one time.  This is mostly used
258 * to size hash tables in vnode-related code.  It is normally not used in
259 * getnewvnode(), as wantfreevnodes is normally nonzero.
260 *
261 * XXX desiredvnodes is historical cruft and should not exist.
262 */
263int desiredvnodes;
264SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
265    &desiredvnodes, 0, "Maximum number of vnodes");
266SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
267    &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
268static int vnlru_nowhere;
269SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
270    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
271
272/* Hook for calling soft updates. */
273int (*softdep_process_worklist_hook)(struct mount *);
274
275/*
276 * Macros to control when a vnode is freed and recycled.  All require
277 * the vnode interlock.
278 */
279#define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
280#define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
281#define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)
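/*
 * In other words: a vnode on the free list with no holds may be recycled
 * (VCANRECYCLE), a vnode off the free list with no holds should be placed
 * on it (VSHOULDFREE), and a vnode on the free list that has gained a hold
 * should be taken off it (VSHOULDBUSY).
 */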
282
283
284/*
285 * Initialize the vnode management data structures.
286 */
287#ifndef	MAXVNODES_MAX
288#define	MAXVNODES_MAX	100000
289#endif
290static void
291vntblinit(void *dummy __unused)
292{
293
294	/*
295	 * Desiredvnodes is a function of the physical memory size and
296	 * the kernel's heap size.  Specifically, desiredvnodes scales
297	 * in proportion to the physical memory size until two fifths
298	 * of the kernel's heap size is consumed by vnodes and vm
299	 * objects.
300	 */
301	desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
302	    (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
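	/*
	 * Purely illustrative numbers: with 1 GB of RAM (~262144 4 KB pages)
	 * and maxproc in the hundreds, the first term is roughly 66000, so
	 * the result typically lands in the tens of thousands unless the
	 * kmem-based term or MAXVNODES_MAX below clamps it lower.
	 */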
303	if (desiredvnodes > MAXVNODES_MAX) {
304		if (bootverbose)
305			printf("Reducing kern.maxvnodes %d -> %d\n",
306			    desiredvnodes, MAXVNODES_MAX);
307		desiredvnodes = MAXVNODES_MAX;
308	}
309	wantfreevnodes = desiredvnodes / 4;
310	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
311	TAILQ_INIT(&vnode_free_list);
312	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
313	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
314	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
315	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
316	      NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
317	/*
318	 * Initialize the filesystem syncer.
319	 */
320	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
321		&syncer_mask);
322	syncer_maxdelay = syncer_mask + 1;
323	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
324}
325SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)
326
327
328/*
329 * Mark a mount point as busy. Used to synchronize access and to delay
330 * unmounting. Interlock is not released on failure.
331 */
332int
333vfs_busy(mp, flags, interlkp, td)
334	struct mount *mp;
335	int flags;
336	struct mtx *interlkp;
337	struct thread *td;
338{
339	int lkflags;
340
341	MNT_ILOCK(mp);
342	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
343		if (flags & LK_NOWAIT) {
344			MNT_IUNLOCK(mp);
345			return (ENOENT);
346		}
347		if (interlkp)
348			mtx_unlock(interlkp);
349		mp->mnt_kern_flag |= MNTK_MWAIT;
350		/*
351		 * Since all busy locks are shared except the exclusive
352		 * lock granted when unmounting, the only place that a
353		 * wakeup needs to be done is at the release of the
354		 * exclusive lock at the end of dounmount.
355		 */
356		msleep(mp, MNT_MTX(mp), PVFS|PDROP, "vfs_busy", 0);
357		if (interlkp)
358			mtx_lock(interlkp);
359		return (ENOENT);
360	}
361	if (interlkp)
362		mtx_unlock(interlkp);
363	lkflags = LK_SHARED | LK_INTERLOCK;
364	if (lockmgr(&mp->mnt_lock, lkflags, MNT_MTX(mp), td))
365		panic("vfs_busy: unexpected lock failure");
366	return (0);
367}
368
369/*
370 * Free a busy filesystem.
371 */
372void
373vfs_unbusy(mp, td)
374	struct mount *mp;
375	struct thread *td;
376{
377
378	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
379}
380
381/*
382 * Lookup a mount point by filesystem identifier.
383 */
384struct mount *
385vfs_getvfs(fsid)
386	fsid_t *fsid;
387{
388	struct mount *mp;
389
390	mtx_lock(&mountlist_mtx);
391	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
392		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
393		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
394			mtx_unlock(&mountlist_mtx);
395			return (mp);
396		}
397	}
398	mtx_unlock(&mountlist_mtx);
399	return ((struct mount *) 0);
400}
401
402/*
403 * Check if a user can access privileged mount options.
404 */
405int
406vfs_suser(struct mount *mp, struct thread *td)
407{
408	int error;
409
410	if ((mp->mnt_flag & MNT_USER) == 0 ||
411	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
412		if ((error = suser(td)) != 0)
413			return (error);
414	}
415	return (0);
416}
417
418/*
419 * Get a new unique fsid.  Try to make its val[0] unique, since this value
420 * will be used to create fake device numbers for stat().  Also try (but
421 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
422 * support 16-bit device numbers.  We end up with unique val[0]'s for the
423 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
424 *
425 * Keep in mind that several mounts may be running in parallel.  Starting
426 * the search one past where the previous search terminated is both a
427 * micro-optimization and a defense against returning the same fsid to
428 * different mounts.
429 */
430void
431vfs_getnewfsid(mp)
432	struct mount *mp;
433{
434	static u_int16_t mntid_base;
435	fsid_t tfsid;
436	int mtype;
437
438	mtx_lock(&mntid_mtx);
439	mtype = mp->mnt_vfc->vfc_typenum;
440	tfsid.val[1] = mtype;
441	mtype = (mtype & 0xFF) << 24;
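	/*
	 * Layout of the minor number handed to makedev() below: bits 24-31
	 * carry the low byte of the filesystem type, bits 16-23 the high
	 * byte of mntid_base and bits 0-7 its low byte, which is why only
	 * the low byte of mntid_base survives truncation to 16 bits.
	 */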
442	for (;;) {
443		tfsid.val[0] = makedev(255,
444		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
445		mntid_base++;
446		if (vfs_getvfs(&tfsid) == NULL)
447			break;
448	}
449	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
450	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
451	mtx_unlock(&mntid_mtx);
452}
453
454/*
455 * Knob to control the precision of file timestamps:
456 *
457 *   0 = seconds only; nanoseconds zeroed.
458 *   1 = seconds and nanoseconds, accurate within 1/HZ.
459 *   2 = seconds and nanoseconds, truncated to microseconds.
460 * >=3 = seconds and nanoseconds, maximum precision.
461 */
462enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
463
464static int timestamp_precision = TSP_SEC;
465SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
466    &timestamp_precision, 0, "");
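/*
 * Illustrative usage, see sysctl(8): full precision can be selected at
 * run time with
 *
 *	sysctl vfs.timestamp_precision=3
 */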
467
468/*
469 * Get a current timestamp.
470 */
471void
472vfs_timestamp(tsp)
473	struct timespec *tsp;
474{
475	struct timeval tv;
476
477	switch (timestamp_precision) {
478	case TSP_SEC:
479		tsp->tv_sec = time_second;
480		tsp->tv_nsec = 0;
481		break;
482	case TSP_HZ:
483		getnanotime(tsp);
484		break;
485	case TSP_USEC:
486		microtime(&tv);
487		TIMEVAL_TO_TIMESPEC(&tv, tsp);
488		break;
489	case TSP_NSEC:
490	default:
491		nanotime(tsp);
492		break;
493	}
494}
495
496/*
497 * Set vnode attributes to VNOVAL
498 */
499void
500vattr_null(vap)
501	struct vattr *vap;
502{
503
504	vap->va_type = VNON;
505	vap->va_size = VNOVAL;
506	vap->va_bytes = VNOVAL;
507	vap->va_mode = VNOVAL;
508	vap->va_nlink = VNOVAL;
509	vap->va_uid = VNOVAL;
510	vap->va_gid = VNOVAL;
511	vap->va_fsid = VNOVAL;
512	vap->va_fileid = VNOVAL;
513	vap->va_blocksize = VNOVAL;
514	vap->va_rdev = VNOVAL;
515	vap->va_atime.tv_sec = VNOVAL;
516	vap->va_atime.tv_nsec = VNOVAL;
517	vap->va_mtime.tv_sec = VNOVAL;
518	vap->va_mtime.tv_nsec = VNOVAL;
519	vap->va_ctime.tv_sec = VNOVAL;
520	vap->va_ctime.tv_nsec = VNOVAL;
521	vap->va_birthtime.tv_sec = VNOVAL;
522	vap->va_birthtime.tv_nsec = VNOVAL;
523	vap->va_flags = VNOVAL;
524	vap->va_gen = VNOVAL;
525	vap->va_vaflags = 0;
526}
527
528/*
529 * This routine is called when we have too many vnodes.  It attempts
530 * to free <count> vnodes and will potentially free vnodes that still
531 * have VM backing store (VM backing store is typically the cause
532 * of a vnode blowout so we want to do this).  Therefore, this operation
533 * is not considered cheap.
534 *
535 * A number of conditions may prevent a vnode from being reclaimed:
536 * the buffer cache may have references on the vnode, a directory
537 * vnode may still have references due to the namei cache representing
538 * underlying files, or the vnode may be in active use.  It is not
539 * desirable to reuse such vnodes.  These conditions may cause the
540 * number of vnodes to reach some minimum value regardless of what
541 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
542 */
543static int
544vlrureclaim(struct mount *mp)
545{
546	struct thread *td;
547	struct vnode *vp;
548	int done;
549	int trigger;
550	int usevnodes;
551	int count;
552
553	/*
554	 * Calculate the trigger point; don't allow user
555	 * screwups to blow us up.   This prevents us from
556	 * recycling vnodes with lots of resident pages.  We
557	 * aren't trying to free memory, we are trying to
558	 * free vnodes.
559	 */
560	usevnodes = desiredvnodes;
561	if (usevnodes <= 0)
562		usevnodes = 1;
563	trigger = cnt.v_page_count * 2 / usevnodes;
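	/*
	 * In effect a vnode is skipped when its VM object holds more than
	 * twice its "fair share" of resident pages, i.e. more than
	 * 2 * v_page_count / desiredvnodes pages.
	 */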
564	done = 0;
565	td = curthread;
566	vn_start_write(NULL, &mp, V_WAIT);
567	MNT_ILOCK(mp);
568	count = mp->mnt_nvnodelistsize / 10 + 1;
569	while (count != 0) {
570		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
571		while (vp != NULL && vp->v_type == VMARKER)
572			vp = TAILQ_NEXT(vp, v_nmntvnodes);
573		if (vp == NULL)
574			break;
575		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
576		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
577		--count;
578		if (!VI_TRYLOCK(vp))
579			goto next_iter;
580		/*
581		 * If it's been deconstructed already, it's still
582		 * referenced, or it exceeds the trigger, skip it.
583		 */
584		if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) ||
585		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
586		    vp->v_object->resident_page_count > trigger)) {
587			VI_UNLOCK(vp);
588			goto next_iter;
589		}
590		MNT_IUNLOCK(mp);
591		vholdl(vp);
592		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT, td)) {
593			vdrop(vp);
594			goto next_iter_mntunlocked;
595		}
596		VI_LOCK(vp);
597		/*
598		 * v_usecount may have been bumped after VOP_LOCK() dropped
599		 * the vnode interlock and before it was locked again.
600		 *
601		 * It is not necessary to recheck VI_DOOMED because it can
602		 * only be set by another thread that holds both the vnode
603		 * lock and vnode interlock.  If another thread has the
604		 * vnode lock before we get to VOP_LOCK() and obtains the
605		 * vnode interlock after VOP_LOCK() drops the vnode
606		 * interlock, the other thread will be unable to drop the
607		 * vnode lock before our VOP_LOCK() call fails.
608		 */
609		if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) ||
610		    (vp->v_object != NULL &&
611		    vp->v_object->resident_page_count > trigger)) {
612			VOP_UNLOCK(vp, LK_INTERLOCK, td);
613			goto next_iter_mntunlocked;
614		}
615		KASSERT((vp->v_iflag & VI_DOOMED) == 0,
616		    ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
617		vgonel(vp);
618		VOP_UNLOCK(vp, 0, td);
619		vdropl(vp);
620		done++;
621next_iter_mntunlocked:
622		if ((count % 256) != 0)
623			goto relock_mnt;
624		goto yield;
625next_iter:
626		if ((count % 256) != 0)
627			continue;
628		MNT_IUNLOCK(mp);
629yield:
630		uio_yield();
631relock_mnt:
632		MNT_ILOCK(mp);
633	}
634	MNT_IUNLOCK(mp);
635	vn_finished_write(mp);
636	return done;
637}
638
639/*
640 * Attempt to keep the free list at wantfreevnodes length.
641 */
642static void
643vnlru_free(int count)
644{
645	struct vnode *vp;
646
647	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
648	for (; count > 0; count--) {
649		vp = TAILQ_FIRST(&vnode_free_list);
650		/*
651		 * The list can be modified while the vnode_free_list_mtx
652		 * is dropped, so vp could be NULL here.
653		 */
654		if (!vp)
655			break;
656		VNASSERT(vp->v_op != NULL, vp,
657		    ("vnlru_free: vnode already reclaimed."));
658		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
659		/*
660		 * Don't recycle if we can't get the interlock.
661		 */
662		if (!VI_TRYLOCK(vp)) {
663			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
664			continue;
665		}
666		VNASSERT(VCANRECYCLE(vp), vp,
667		    ("vp inconsistent on freelist"));
668		freevnodes--;
669		vp->v_iflag &= ~VI_FREE;
670		vholdl(vp);
671		mtx_unlock(&vnode_free_list_mtx);
672		VI_UNLOCK(vp);
673		vtryrecycle(vp);
674		/*
675		 * If the recycle succeeded, this vdrop will actually free
676		 * the vnode.  If not, it will simply place it back on
677		 * the free list.
678		 */
679		vdrop(vp);
680		mtx_lock(&vnode_free_list_mtx);
681	}
682}
683/*
684 * Attempt to recycle vnodes in a context that is always safe to block.
685 * Calling vlrureclaim() from the bowels of filesystem code has some
686 * interesting deadlock problems.
687 */
688static struct proc *vnlruproc;
689static int vnlruproc_sig;
690
691static void
692vnlru_proc(void)
693{
694	struct mount *mp, *nmp;
695	int done;
696	struct proc *p = vnlruproc;
697	struct thread *td = FIRST_THREAD_IN_PROC(p);
698
699	mtx_lock(&Giant);
700
701	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
702	    SHUTDOWN_PRI_FIRST);
703
704	for (;;) {
705		kthread_suspend_check(p);
706		mtx_lock(&vnode_free_list_mtx);
707		if (freevnodes > wantfreevnodes)
708			vnlru_free(freevnodes - wantfreevnodes);
709		if (numvnodes <= desiredvnodes * 9 / 10) {
710			vnlruproc_sig = 0;
711			wakeup(&vnlruproc_sig);
712			msleep(vnlruproc, &vnode_free_list_mtx,
713			    PVFS|PDROP, "vlruwt", hz);
714			continue;
715		}
716		mtx_unlock(&vnode_free_list_mtx);
717		done = 0;
718		mtx_lock(&mountlist_mtx);
719		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
720			int vfsunlocked;
721			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
722				nmp = TAILQ_NEXT(mp, mnt_list);
723				continue;
724			}
725			if (!VFS_NEEDSGIANT(mp)) {
726				mtx_unlock(&Giant);
727				vfsunlocked = 1;
728			} else
729				vfsunlocked = 0;
730			done += vlrureclaim(mp);
731			if (vfsunlocked)
732				mtx_lock(&Giant);
733			mtx_lock(&mountlist_mtx);
734			nmp = TAILQ_NEXT(mp, mnt_list);
735			vfs_unbusy(mp, td);
736		}
737		mtx_unlock(&mountlist_mtx);
738		if (done == 0) {
739#if 0
740			/* These messages are temporary debugging aids */
741			if (vnlru_nowhere < 5)
742				printf("vnlru process getting nowhere..\n");
743			else if (vnlru_nowhere == 5)
744				printf("vnlru process messages stopped.\n");
745#endif
746			vnlru_nowhere++;
747			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
748		} else
749			uio_yield();
750	}
751}
752
753static struct kproc_desc vnlru_kp = {
754	"vnlru",
755	vnlru_proc,
756	&vnlruproc
757};
758SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)
759
760/*
761 * Routines having to do with the management of the vnode table.
762 */
763
764static void
765vdestroy(struct vnode *vp)
766{
767	struct bufobj *bo;
768
769	CTR1(KTR_VFS, "vdestroy vp %p", vp);
770	mtx_lock(&vnode_free_list_mtx);
771	numvnodes--;
772	mtx_unlock(&vnode_free_list_mtx);
773	bo = &vp->v_bufobj;
774	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
775	    ("cleaned vnode still on the free list."));
776	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
777	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
778	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
779	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
780	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
781	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
782	VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL"));
783	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
784	VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL"));
785	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
786	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
787	VI_UNLOCK(vp);
788#ifdef MAC
789	mac_destroy_vnode(vp);
790#endif
791	if (vp->v_pollinfo != NULL) {
792		knlist_destroy(&vp->v_pollinfo->vpi_selinfo.si_note);
793		mtx_destroy(&vp->v_pollinfo->vpi_lock);
794		uma_zfree(vnodepoll_zone, vp->v_pollinfo);
795	}
796#ifdef INVARIANTS
797	/* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */
798	vp->v_op = NULL;
799#endif
800	lockdestroy(vp->v_vnlock);
801	mtx_destroy(&vp->v_interlock);
802	uma_zfree(vnode_zone, vp);
803}
804
805/*
806 * Try to recycle a freed vnode.  We abort if anyone picks up a reference
807 * before we actually vgone().  This function must be called with the vnode
808 * held to prevent the vnode from being returned to the free list midway
809 * through vgone().
810 */
811static int
812vtryrecycle(struct vnode *vp)
813{
814	struct thread *td = curthread;
815	struct mount *vnmp;
816
817	CTR1(KTR_VFS, "vtryrecycle: trying vp %p", vp);
818	VNASSERT(vp->v_holdcnt, vp,
819	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
820	/*
821	 * This vnode may be found and locked via some other list; if so,
822	 * we can't recycle it yet.
823	 */
824	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
825		return (EWOULDBLOCK);
826	/*
827	 * Don't recycle if its filesystem is being suspended.
828	 */
829	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
830		VOP_UNLOCK(vp, 0, td);
831		return (EBUSY);
832	}
833	/*
834	 * If we got this far, we need to acquire the interlock and see if
835	 * anyone picked up this vnode from another list.  If not, we will
836	 * mark it with VI_DOOMED via vgonel() so that anyone who does find it
837	 * will skip over it.
838	 */
839	VI_LOCK(vp);
840	if (vp->v_usecount) {
841		VOP_UNLOCK(vp, LK_INTERLOCK, td);
842		vn_finished_write(vnmp);
843		return (EBUSY);
844	}
845	if ((vp->v_iflag & VI_DOOMED) == 0)
846		vgonel(vp);
847	VOP_UNLOCK(vp, LK_INTERLOCK, td);
848	vn_finished_write(vnmp);
849	CTR1(KTR_VFS, "vtryrecycle: recycled vp %p", vp);
850	return (0);
851}
852
853/*
854 * Return the next vnode from the free list.
855 */
856int
857getnewvnode(tag, mp, vops, vpp)
858	const char *tag;
859	struct mount *mp;
860	struct vop_vector *vops;
861	struct vnode **vpp;
862{
863	struct vnode *vp = NULL;
864	struct bufobj *bo;
865
866	mtx_lock(&vnode_free_list_mtx);
867	/*
868	 * Lend our context to reclaim vnodes if they've exceeded the max.
869	 */
870	if (freevnodes > wantfreevnodes)
871		vnlru_free(1);
872	/*
873	 * Wait for available vnodes.
874	 */
875	if (numvnodes > desiredvnodes) {
876		if (vnlruproc_sig == 0) {
877			vnlruproc_sig = 1;      /* avoid unnecessary wakeups */
878			wakeup(vnlruproc);
879		}
880		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
881		    "vlruwk", hz);
882#if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
883		if (numvnodes > desiredvnodes) {
884			mtx_unlock(&vnode_free_list_mtx);
885			return (ENFILE);
886		}
887#endif
888	}
889	numvnodes++;
890	mtx_unlock(&vnode_free_list_mtx);
891	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
892	/*
893	 * Setup locks.
894	 */
895	vp->v_vnlock = &vp->v_lock;
896	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
897	/*
898	 * By default, don't allow shared locks unless filesystems
899	 * opt-in.
900	 */
901	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE);
902	/*
903	 * Initialize bufobj.
904	 */
905	bo = &vp->v_bufobj;
906	bo->__bo_vnode = vp;
907	bo->bo_mtx = &vp->v_interlock;
908	bo->bo_ops = &buf_ops_bio;
909	bo->bo_private = vp;
910	TAILQ_INIT(&bo->bo_clean.bv_hd);
911	TAILQ_INIT(&bo->bo_dirty.bv_hd);
912	/*
913	 * Initialize namecache.
914	 */
915	LIST_INIT(&vp->v_cache_src);
916	TAILQ_INIT(&vp->v_cache_dst);
917	/*
918	 * Finalize various vnode identity bits.
919	 */
920	vp->v_type = VNON;
921	vp->v_tag = tag;
922	vp->v_op = vops;
923	v_incr_usecount(vp);
924	vp->v_data = 0;
925#ifdef MAC
926	mac_init_vnode(vp);
927	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
928		mac_associate_vnode_singlelabel(mp, vp);
929	else if (mp == NULL)
930		printf("NULL mp in getnewvnode()\n");
931#endif
932	delmntque(vp);
933	if (mp != NULL) {
934		insmntque(vp, mp);
935		bo->bo_bsize = mp->mnt_stat.f_iosize;
936		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
937			vp->v_vflag |= VV_NOKNOTE;
938	}
939
940	CTR2(KTR_VFS, "getnewvnode: mp %p vp %p", mp, vp);
941	*vpp = vp;
942	return (0);
943}
944
945/*
946 * Delete from old mount point vnode list, if on one.
947 */
948static void
949delmntque(struct vnode *vp)
950{
951	struct mount *mp;
952
953	if (vp->v_mount == NULL)
954		return;
955	mp = vp->v_mount;
956	MNT_ILOCK(mp);
957	vp->v_mount = NULL;
958	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
959		("bad mount point vnode list size"));
960	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
961	mp->mnt_nvnodelistsize--;
962	MNT_IUNLOCK(mp);
963}
964
965/*
966 * Insert into list of vnodes for the new mount point, if available.
967 */
968static void
969insmntque(struct vnode *vp, struct mount *mp)
970{
971
972	vp->v_mount = mp;
973	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
974	MNT_ILOCK(vp->v_mount);
975	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
976	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
977		("neg mount point vnode list size"));
978	mp->mnt_nvnodelistsize++;
979	MNT_IUNLOCK(vp->v_mount);
980}
981
982/*
983 * Flush out and invalidate all buffers associated with a bufobj.
984 * Called with the underlying object locked.
985 */
986int
987bufobj_invalbuf(struct bufobj *bo, int flags, struct thread *td, int slpflag, int slptimeo)
988{
989	int error;
990
991	BO_LOCK(bo);
992	if (flags & V_SAVE) {
993		error = bufobj_wwait(bo, slpflag, slptimeo);
994		if (error) {
995			BO_UNLOCK(bo);
996			return (error);
997		}
998		if (bo->bo_dirty.bv_cnt > 0) {
999			BO_UNLOCK(bo);
1000			if ((error = BO_SYNC(bo, MNT_WAIT, td)) != 0)
1001				return (error);
1002			/*
1003			 * XXX We could save a lock/unlock if this was only
1004			 * enabled under INVARIANTS
1005			 */
1006			BO_LOCK(bo);
1007			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
1008				panic("vinvalbuf: dirty bufs");
1009		}
1010	}
1011	/*
1012	 * If you alter this loop please notice that interlock is dropped and
1013	 * reacquired in flushbuflist.  Special care is needed to ensure that
1014	 * no race conditions occur from this.
1015	 */
1016	do {
1017		error = flushbuflist(&bo->bo_clean,
1018		    flags, bo, slpflag, slptimeo);
1019		if (error == 0)
1020			error = flushbuflist(&bo->bo_dirty,
1021			    flags, bo, slpflag, slptimeo);
1022		if (error != 0 && error != EAGAIN) {
1023			BO_UNLOCK(bo);
1024			return (error);
1025		}
1026	} while (error != 0);
1027
1028	/*
1029	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
1030	 * have write I/O in-progress but if there is a VM object then the
1031	 * VM object can also have read-I/O in-progress.
1032	 */
1033	do {
1034		bufobj_wwait(bo, 0, 0);
1035		BO_UNLOCK(bo);
1036		if (bo->bo_object != NULL) {
1037			VM_OBJECT_LOCK(bo->bo_object);
1038			vm_object_pip_wait(bo->bo_object, "bovlbx");
1039			VM_OBJECT_UNLOCK(bo->bo_object);
1040		}
1041		BO_LOCK(bo);
1042	} while (bo->bo_numoutput > 0);
1043	BO_UNLOCK(bo);
1044
1045	/*
1046	 * Destroy the copy in the VM cache, too.
1047	 */
1048	if (bo->bo_object != NULL) {
1049		VM_OBJECT_LOCK(bo->bo_object);
1050		vm_object_page_remove(bo->bo_object, 0, 0,
1051			(flags & V_SAVE) ? TRUE : FALSE);
1052		VM_OBJECT_UNLOCK(bo->bo_object);
1053	}
1054
1055#ifdef INVARIANTS
1056	BO_LOCK(bo);
1057	if ((flags & (V_ALT | V_NORMAL)) == 0 &&
1058	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
1059		panic("vinvalbuf: flush failed");
1060	BO_UNLOCK(bo);
1061#endif
1062	return (0);
1063}
1064
1065/*
1066 * Flush out and invalidate all buffers associated with a vnode.
1067 * Called with the underlying object locked.
1068 */
1069int
1070vinvalbuf(struct vnode *vp, int flags, struct thread *td, int slpflag, int slptimeo)
1071{
1072
1073	CTR2(KTR_VFS, "vinvalbuf vp %p flags %d", vp, flags);
1074	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
1075	return (bufobj_invalbuf(&vp->v_bufobj, flags, td, slpflag, slptimeo));
1076}
1077
1078/*
1079 * Flush out buffers on the specified list.
1080 *
1081 */
1082static int
1083flushbuflist(bufv, flags, bo, slpflag, slptimeo)
1084	struct bufv *bufv;
1085	int flags;
1086	struct bufobj *bo;
1087	int slpflag, slptimeo;
1088{
1089	struct buf *bp, *nbp;
1090	int retval, error;
1091	daddr_t lblkno;
1092	b_xflags_t xflags;
1093
1094	ASSERT_BO_LOCKED(bo);
1095
1096	retval = 0;
1097	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
1098		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1099		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1100			continue;
1101		}
1102		lblkno = 0;
1103		xflags = 0;
1104		if (nbp != NULL) {
1105			lblkno = nbp->b_lblkno;
1106			xflags = nbp->b_xflags &
1107				(BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN);
1108		}
1109		retval = EAGAIN;
1110		error = BUF_TIMELOCK(bp,
1111		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo),
1112		    "flushbuf", slpflag, slptimeo);
1113		if (error) {
1114			BO_LOCK(bo);
1115			return (error != ENOLCK ? error : EAGAIN);
1116		}
1117		KASSERT(bp->b_bufobj == bo,
1118	            ("bp %p wrong b_bufobj %p should be %p",
1119		    bp, bp->b_bufobj, bo));
1120		if (bp->b_bufobj != bo) {	/* XXX: necessary ? */
1121			BUF_UNLOCK(bp);
1122			BO_LOCK(bo);
1123			return (EAGAIN);
1124		}
1125		/*
1126		 * XXX Since there are no node locks for NFS, I
1127		 * believe there is a slight chance that a delayed
1128		 * write will occur while sleeping just above, so
1129		 * check for it.
1130		 */
1131		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1132		    (flags & V_SAVE)) {
1133			bremfree(bp);
1134			bp->b_flags |= B_ASYNC;
1135			bwrite(bp);
1136			BO_LOCK(bo);
1137			return (EAGAIN);	/* XXX: why not loop ? */
1138		}
1139		bremfree(bp);
1140		bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
1141		bp->b_flags &= ~B_ASYNC;
1142		brelse(bp);
1143		BO_LOCK(bo);
1144		if (nbp != NULL &&
1145		    (nbp->b_bufobj != bo ||
1146		     nbp->b_lblkno != lblkno ||
1147		     (nbp->b_xflags &
1148		      (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags))
1149			break;			/* nbp invalid */
1150	}
1151	return (retval);
1152}
1153
1154/*
1155 * Truncate a file's buffers and pages to a specified length.  This
1156 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1157 * sync activity.
1158 */
1159int
1160vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td, off_t length, int blksize)
1161{
1162	struct buf *bp, *nbp;
1163	int anyfreed;
1164	int trunclbn;
1165	struct bufobj *bo;
1166
1167	CTR2(KTR_VFS, "vtruncbuf vp %p length %jd", vp, length);
1168	/*
1169	 * Round up to the *next* lbn.
1170	 */
1171	trunclbn = (length + blksize - 1) / blksize;
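	/*
	 * trunclbn is thus the first logical block lying entirely at or
	 * beyond the new length; buffers with b_lblkno >= trunclbn are the
	 * ones discarded below.
	 */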
1172
1173	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1174restart:
1175	VI_LOCK(vp);
1176	bo = &vp->v_bufobj;
1177	anyfreed = 1;
1178	for (;anyfreed;) {
1179		anyfreed = 0;
1180		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
1181			if (bp->b_lblkno < trunclbn)
1182				continue;
1183			if (BUF_LOCK(bp,
1184			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1185			    VI_MTX(vp)) == ENOLCK)
1186				goto restart;
1187
1188			bremfree(bp);
1189			bp->b_flags |= (B_INVAL | B_RELBUF);
1190			bp->b_flags &= ~B_ASYNC;
1191			brelse(bp);
1192			anyfreed = 1;
1193
1194			if (nbp != NULL &&
1195			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1196			    (nbp->b_vp != vp) ||
1197			    (nbp->b_flags & B_DELWRI))) {
1198				goto restart;
1199			}
1200			VI_LOCK(vp);
1201		}
1202
1203		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1204			if (bp->b_lblkno < trunclbn)
1205				continue;
1206			if (BUF_LOCK(bp,
1207			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1208			    VI_MTX(vp)) == ENOLCK)
1209				goto restart;
1210			bremfree(bp);
1211			bp->b_flags |= (B_INVAL | B_RELBUF);
1212			bp->b_flags &= ~B_ASYNC;
1213			brelse(bp);
1214			anyfreed = 1;
1215			if (nbp != NULL &&
1216			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1217			    (nbp->b_vp != vp) ||
1218			    (nbp->b_flags & B_DELWRI) == 0)) {
1219				goto restart;
1220			}
1221			VI_LOCK(vp);
1222		}
1223	}
1224
1225	if (length > 0) {
1226restartsync:
1227		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1228			if (bp->b_lblkno > 0)
1229				continue;
1230			/*
1231			 * Since we hold the vnode lock this should only
1232			 * fail if we're racing with the buf daemon.
1233			 */
1234			if (BUF_LOCK(bp,
1235			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1236			    VI_MTX(vp)) == ENOLCK) {
1237				goto restart;
1238			}
1239			VNASSERT((bp->b_flags & B_DELWRI), vp,
1240			    ("buf(%p) on dirty queue without DELWRI", bp));
1241
1242			bremfree(bp);
1243			bawrite(bp);
1244			VI_LOCK(vp);
1245			goto restartsync;
1246		}
1247	}
1248
1249	bufobj_wwait(bo, 0, 0);
1250	VI_UNLOCK(vp);
1251	vnode_pager_setsize(vp, length);
1252
1253	return (0);
1254}
1255
1256/*
1257 * buf_splay() - splay tree core for the clean/dirty list of buffers in
1258 * 		 a vnode.
1259 *
1260 *	NOTE: We have to deal with the special case of a background bitmap
1261 *	buffer, a situation where two buffers will have the same logical
1262 *	block offset.  We want (1) only the foreground buffer to be accessed
1263 *	in a lookup and (2) must differentiate between the foreground and
1264 *	background buffer in the splay tree algorithm because the splay
1265 *	tree cannot normally handle multiple entities with the same 'index'.
1266 *	We accomplish this by adding differentiating flags to the splay tree's
1267 *	numerical domain.
1268 */
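/*
 * Concretely, the sort key is the pair (b_lblkno, BX_BKGRDMARKER bit), so a
 * background bitmap buffer sorts immediately after the foreground buffer
 * with the same logical block number.
 */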
1269static
1270struct buf *
1271buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
1272{
1273	struct buf dummy;
1274	struct buf *lefttreemax, *righttreemin, *y;
1275
1276	if (root == NULL)
1277		return (NULL);
1278	lefttreemax = righttreemin = &dummy;
1279	for (;;) {
1280		if (lblkno < root->b_lblkno ||
1281		    (lblkno == root->b_lblkno &&
1282		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1283			if ((y = root->b_left) == NULL)
1284				break;
1285			if (lblkno < y->b_lblkno) {
1286				/* Rotate right. */
1287				root->b_left = y->b_right;
1288				y->b_right = root;
1289				root = y;
1290				if ((y = root->b_left) == NULL)
1291					break;
1292			}
1293			/* Link into the new root's right tree. */
1294			righttreemin->b_left = root;
1295			righttreemin = root;
1296		} else if (lblkno > root->b_lblkno ||
1297		    (lblkno == root->b_lblkno &&
1298		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
1299			if ((y = root->b_right) == NULL)
1300				break;
1301			if (lblkno > y->b_lblkno) {
1302				/* Rotate left. */
1303				root->b_right = y->b_left;
1304				y->b_left = root;
1305				root = y;
1306				if ((y = root->b_right) == NULL)
1307					break;
1308			}
1309			/* Link into the new root's left tree. */
1310			lefttreemax->b_right = root;
1311			lefttreemax = root;
1312		} else {
1313			break;
1314		}
1315		root = y;
1316	}
1317	/* Assemble the new root. */
1318	lefttreemax->b_right = root->b_left;
1319	righttreemin->b_left = root->b_right;
1320	root->b_left = dummy.b_right;
1321	root->b_right = dummy.b_left;
1322	return (root);
1323}
1324
1325static void
1326buf_vlist_remove(struct buf *bp)
1327{
1328	struct buf *root;
1329	struct bufv *bv;
1330
1331	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1332	ASSERT_BO_LOCKED(bp->b_bufobj);
1333	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
1334	    (BX_VNDIRTY|BX_VNCLEAN),
1335	    ("buf_vlist_remove: Buf %p is on two lists", bp));
1336	if (bp->b_xflags & BX_VNDIRTY)
1337		bv = &bp->b_bufobj->bo_dirty;
1338	else
1339		bv = &bp->b_bufobj->bo_clean;
1340	if (bp != bv->bv_root) {
1341		root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
1342		KASSERT(root == bp, ("splay lookup failed in remove"));
1343	}
1344	if (bp->b_left == NULL) {
1345		root = bp->b_right;
1346	} else {
1347		root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
1348		root->b_right = bp->b_right;
1349	}
1350	bv->bv_root = root;
1351	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
1352	bv->bv_cnt--;
1353	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1354}
1355
1356/*
1357 * Add the buffer to the sorted clean or dirty block list using a
1358 * splay tree algorithm.
1359 *
1360 * NOTE: xflags is passed as a constant, optimizing this inline function!
1361 */
1362static void
1363buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
1364{
1365	struct buf *root;
1366	struct bufv *bv;
1367
1368	ASSERT_BO_LOCKED(bo);
1369	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1370	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
1371	bp->b_xflags |= xflags;
1372	if (xflags & BX_VNDIRTY)
1373		bv = &bo->bo_dirty;
1374	else
1375		bv = &bo->bo_clean;
1376
1377	root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
1378	if (root == NULL) {
1379		bp->b_left = NULL;
1380		bp->b_right = NULL;
1381		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
1382	} else if (bp->b_lblkno < root->b_lblkno ||
1383	    (bp->b_lblkno == root->b_lblkno &&
1384	    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
1385		bp->b_left = root->b_left;
1386		bp->b_right = root;
1387		root->b_left = NULL;
1388		TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
1389	} else {
1390		bp->b_right = root->b_right;
1391		bp->b_left = root;
1392		root->b_right = NULL;
1393		TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs);
1394	}
1395	bv->bv_cnt++;
1396	bv->bv_root = bp;
1397}
1398
1399/*
1400 * Lookup a buffer using the splay tree.  Note that we specifically avoid
1401 * shadow buffers used in background bitmap writes.
1402 *
1403 * This code isn't quite as efficient as it could be because we are maintaining
1404 * two sorted lists and do not know which list the block resides in.
1405 *
1406 * During a "make buildworld" the desired buffer is found at one of
1407 * the roots more than 60% of the time.  Thus, checking both roots
1408 * before performing either splay eliminates unnecessary splays on the
1409 * first tree splayed.
1410 */
1411struct buf *
1412gbincore(struct bufobj *bo, daddr_t lblkno)
1413{
1414	struct buf *bp;
1415
1416	ASSERT_BO_LOCKED(bo);
1417	if ((bp = bo->bo_clean.bv_root) != NULL &&
1418	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1419		return (bp);
1420	if ((bp = bo->bo_dirty.bv_root) != NULL &&
1421	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1422		return (bp);
1423	if ((bp = bo->bo_clean.bv_root) != NULL) {
1424		bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp);
1425		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1426			return (bp);
1427	}
1428	if ((bp = bo->bo_dirty.bv_root) != NULL) {
1429		bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp);
1430		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
1431			return (bp);
1432	}
1433	return (NULL);
1434}
1435
1436/*
1437 * Associate a buffer with a vnode.
1438 */
1439void
1440bgetvp(struct vnode *vp, struct buf *bp)
1441{
1442
1443	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
1444
1445	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
1446	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
1447	    ("bgetvp: bp already attached! %p", bp));
1448
1449	ASSERT_VI_LOCKED(vp, "bgetvp");
1450	vholdl(vp);
1451	bp->b_vp = vp;
1452	bp->b_bufobj = &vp->v_bufobj;
1453	/*
1454	 * Insert onto list for new vnode.
1455	 */
1456	buf_vlist_add(bp, &vp->v_bufobj, BX_VNCLEAN);
1457}
1458
1459/*
1460 * Disassociate a buffer from a vnode.
1461 */
1462void
1463brelvp(struct buf *bp)
1464{
1465	struct bufobj *bo;
1466	struct vnode *vp;
1467
1468	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1469	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1470
1471	/*
1472	 * Delete from old vnode list, if on one.
1473	 */
1474	vp = bp->b_vp;		/* XXX */
1475	bo = bp->b_bufobj;
1476	BO_LOCK(bo);
1477	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1478		buf_vlist_remove(bp);
1479	else
1480		panic("brelvp: Buffer %p not on queue.", bp);
1481	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1482		bo->bo_flag &= ~BO_ONWORKLST;
1483		mtx_lock(&sync_mtx);
1484		LIST_REMOVE(bo, bo_synclist);
1485 		syncer_worklist_len--;
1486		mtx_unlock(&sync_mtx);
1487	}
1488	bp->b_vp = NULL;
1489	bp->b_bufobj = NULL;
1490	vdropl(vp);
1491}
1492
1493/*
1494 * Add an item to the syncer work queue.
1495 */
1496static void
1497vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
1498{
1499	int slot;
1500
1501	ASSERT_BO_LOCKED(bo);
1502
1503	mtx_lock(&sync_mtx);
1504	if (bo->bo_flag & BO_ONWORKLST)
1505		LIST_REMOVE(bo, bo_synclist);
1506	else {
1507		bo->bo_flag |= BO_ONWORKLST;
1508 		syncer_worklist_len++;
1509	}
1510
1511	if (delay > syncer_maxdelay - 2)
1512		delay = syncer_maxdelay - 2;
1513	slot = (syncer_delayno + delay) & syncer_mask;
1514
1515	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
1516	mtx_unlock(&sync_mtx);
1517}
1518
1519static int
1520sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
1521{
1522	int error, len;
1523
1524	mtx_lock(&sync_mtx);
1525	len = syncer_worklist_len - sync_vnode_count;
1526	mtx_unlock(&sync_mtx);
1527	error = SYSCTL_OUT(req, &len, sizeof(len));
1528	return (error);
1529}
1530
1531SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
1532    sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
1533
1534static struct proc *updateproc;
1535static void sched_sync(void);
1536static struct kproc_desc up_kp = {
1537	"syncer",
1538	sched_sync,
1539	&updateproc
1540};
1541SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
1542
1543static int
1544sync_vnode(struct bufobj *bo, struct thread *td)
1545{
1546	struct vnode *vp;
1547	struct mount *mp;
1548
1549	vp = bo->__bo_vnode; 	/* XXX */
1550	if (VOP_ISLOCKED(vp, NULL) != 0)
1551		return (1);
1552	if (VI_TRYLOCK(vp) == 0)
1553		return (1);
1554	/*
1555	 * We use vhold in case the vnode does not
1556	 * successfully sync.  vhold prevents the vnode from
1557	 * going away when we unlock the sync_mtx so that
1558	 * we can acquire the vnode interlock.
1559	 */
1560	vholdl(vp);
1561	mtx_unlock(&sync_mtx);
1562	VI_UNLOCK(vp);
1563	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1564		vdrop(vp);
1565		mtx_lock(&sync_mtx);
1566		return (1);
1567	}
1568	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1569	(void) VOP_FSYNC(vp, MNT_LAZY, td);
1570	VOP_UNLOCK(vp, 0, td);
1571	vn_finished_write(mp);
1572	VI_LOCK(vp);
1573	if ((bo->bo_flag & BO_ONWORKLST) != 0) {
1574		/*
1575		 * Put us back on the worklist.  The worklist
1576		 * routine will remove us from our current
1577		 * position and then add us back in at a later
1578		 * position.
1579		 */
1580		vn_syncer_add_to_worklist(bo, syncdelay);
1581	}
1582	vdropl(vp);
1583	mtx_lock(&sync_mtx);
1584	return (0);
1585}
1586
1587/*
1588 * System filesystem synchronizer daemon.
1589 */
1590static void
1591sched_sync(void)
1592{
1593	struct synclist *next;
1594	struct synclist *slp;
1595	struct bufobj *bo;
1596	long starttime;
1597	struct thread *td = FIRST_THREAD_IN_PROC(updateproc);
1598	static int dummychan;
1599	int last_work_seen;
1600	int net_worklist_len;
1601	int syncer_final_iter;
1602	int first_printf;
1603	int error;
1604
1605	mtx_lock(&Giant);
1606	last_work_seen = 0;
1607	syncer_final_iter = 0;
1608	first_printf = 1;
1609	syncer_state = SYNCER_RUNNING;
1610	starttime = time_uptime;
1611	td->td_pflags |= TDP_NORUNNINGBUF;
1612
1613	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
1614	    SHUTDOWN_PRI_LAST);
1615
1616	for (;;) {
1617		mtx_lock(&sync_mtx);
1618		if (syncer_state == SYNCER_FINAL_DELAY &&
1619		    syncer_final_iter == 0) {
1620			mtx_unlock(&sync_mtx);
1621			kthread_suspend_check(td->td_proc);
1622			mtx_lock(&sync_mtx);
1623		}
1624		net_worklist_len = syncer_worklist_len - sync_vnode_count;
1625		if (syncer_state != SYNCER_RUNNING &&
1626		    starttime != time_uptime) {
1627			if (first_printf) {
1628				printf("\nSyncing disks, vnodes remaining...");
1629				first_printf = 0;
1630			}
1631			printf("%d ", net_worklist_len);
1632		}
1633		starttime = time_uptime;
1634
1635		/*
1636		 * Push files whose dirty time has expired.  Be careful
1637		 * of interrupt race on slp queue.
1638		 *
1639		 * Skip over empty worklist slots when shutting down.
1640		 */
1641		do {
1642			slp = &syncer_workitem_pending[syncer_delayno];
1643			syncer_delayno += 1;
1644			if (syncer_delayno == syncer_maxdelay)
1645				syncer_delayno = 0;
1646			next = &syncer_workitem_pending[syncer_delayno];
1647			/*
1648			 * If the worklist has wrapped since it
1649			 * was emptied of all but syncer vnodes,
1650			 * switch to the FINAL_DELAY state and run
1651			 * for one more second.
1652			 */
1653			if (syncer_state == SYNCER_SHUTTING_DOWN &&
1654			    net_worklist_len == 0 &&
1655			    last_work_seen == syncer_delayno) {
1656				syncer_state = SYNCER_FINAL_DELAY;
1657				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
1658			}
1659		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
1660		    syncer_worklist_len > 0);
1661
1662		/*
1663		 * Keep track of the last time there was anything
1664		 * on the worklist other than syncer vnodes.
1665		 * Return to the SHUTTING_DOWN state if any
1666		 * new work appears.
1667		 */
1668		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
1669			last_work_seen = syncer_delayno;
1670		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
1671			syncer_state = SYNCER_SHUTTING_DOWN;
1672		while ((bo = LIST_FIRST(slp)) != NULL) {
1673			error = sync_vnode(bo, td);
1674			if (error == 1) {
1675				LIST_REMOVE(bo, bo_synclist);
1676				LIST_INSERT_HEAD(next, bo, bo_synclist);
1677				continue;
1678			}
1679		}
1680		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
1681			syncer_final_iter--;
1682		mtx_unlock(&sync_mtx);
1683
1684		/*
1685		 * Do soft update processing.
1686		 */
1687		if (softdep_process_worklist_hook != NULL)
1688			(*softdep_process_worklist_hook)(NULL);
1689
1690		/*
1691		 * The variable rushjob allows the kernel to speed up the
1692		 * processing of the filesystem syncer process. A rushjob
1693		 * value of N tells the filesystem syncer to process the next
1694		 * N seconds worth of work on its queue ASAP. Currently rushjob
1695		 * is used by the soft update code to speed up the filesystem
1696		 * syncer process when the incore state is getting so far
1697		 * ahead of the disk that the kernel memory pool is being
1698		 * threatened with exhaustion.
1699		 */
1700		mtx_lock(&sync_mtx);
1701		if (rushjob > 0) {
1702			rushjob -= 1;
1703			mtx_unlock(&sync_mtx);
1704			continue;
1705		}
1706		mtx_unlock(&sync_mtx);
1707		/*
1708		 * Just sleep for a short period of time between
1709		 * iterations when shutting down to allow some I/O
1710		 * to happen.
1711		 *
1712		 * If it has taken us less than a second to process the
1713		 * current work, then wait. Otherwise start right over
1714		 * again. We can still lose time if any single round
1715		 * takes more than two seconds, but it does not really
1716		 * matter as we are just trying to generally pace the
1717		 * filesystem activity.
1718		 */
1719		if (syncer_state != SYNCER_RUNNING)
1720			tsleep(&dummychan, PPAUSE, "syncfnl",
1721			    hz / SYNCER_SHUTDOWN_SPEEDUP);
1722		else if (time_uptime == starttime)
1723			tsleep(&lbolt, PPAUSE, "syncer", 0);
1724	}
1725}
1726
1727/*
1728 * Request the syncer daemon to speed up its work.
1729 * We never push it to speed up more than half of its
1730 * normal turn time; otherwise it could take over the cpu.
1731 */
1732int
1733speedup_syncer()
1734{
1735	struct thread *td;
1736	int ret = 0;
1737
1738	td = FIRST_THREAD_IN_PROC(updateproc);
1739	sleepq_remove(td, &lbolt);
1740	mtx_lock(&sync_mtx);
1741	if (rushjob < syncdelay / 2) {
1742		rushjob += 1;
1743		stat_rush_requests += 1;
1744		ret = 1;
1745	}
1746	mtx_unlock(&sync_mtx);
1747	return (ret);
1748}
1749
1750/*
1751 * Tell the syncer to speed up its work and run through its work
1752 * list several times, then tell it to shut down.
1753 */
1754static void
1755syncer_shutdown(void *arg, int howto)
1756{
1757	struct thread *td;
1758
1759	if (howto & RB_NOSYNC)
1760		return;
1761	td = FIRST_THREAD_IN_PROC(updateproc);
1762	sleepq_remove(td, &lbolt);
1763	mtx_lock(&sync_mtx);
1764	syncer_state = SYNCER_SHUTTING_DOWN;
1765	rushjob = 0;
1766	mtx_unlock(&sync_mtx);
1767	kproc_shutdown(arg, howto);
1768}
1769
1770/*
1771 * Reassign a buffer from one vnode to another.
1772 * Used to assign file specific control information
1773 * (indirect blocks) to the vnode to which they belong.
1774 */
1775void
1776reassignbuf(struct buf *bp)
1777{
1778	struct vnode *vp;
1779	struct bufobj *bo;
1780	int delay;
1781#ifdef INVARIANTS
1782	struct bufv *bv;
1783#endif
1784
1785	vp = bp->b_vp;
1786	bo = bp->b_bufobj;
1787	++reassignbufcalls;
1788
1789	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
1790	    bp, bp->b_vp, bp->b_flags);
1791	/*
1792	 * B_PAGING flagged buffers cannot be reassigned because their vp
1793	 * is not fully linked in.
1794	 */
1795	if (bp->b_flags & B_PAGING)
1796		panic("cannot reassign paging buffer");
1797
1798	/*
1799	 * Delete from old vnode list, if on one.
1800	 */
1801	VI_LOCK(vp);
1802	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1803		buf_vlist_remove(bp);
1804	else
1805		panic("reassignbuf: Buffer %p not on queue.", bp);
1806	/*
1807	 * If dirty, put on list of dirty buffers; otherwise insert onto list
1808	 * of clean buffers.
1809	 */
1810	if (bp->b_flags & B_DELWRI) {
1811		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
1812			switch (vp->v_type) {
1813			case VDIR:
1814				delay = dirdelay;
1815				break;
1816			case VCHR:
1817				delay = metadelay;
1818				break;
1819			default:
1820				delay = filedelay;
1821			}
1822			vn_syncer_add_to_worklist(bo, delay);
1823		}
1824		buf_vlist_add(bp, bo, BX_VNDIRTY);
1825	} else {
1826		buf_vlist_add(bp, bo, BX_VNCLEAN);
1827
1828		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1829			mtx_lock(&sync_mtx);
1830			LIST_REMOVE(bo, bo_synclist);
1831 			syncer_worklist_len--;
1832			mtx_unlock(&sync_mtx);
1833			bo->bo_flag &= ~BO_ONWORKLST;
1834		}
1835	}
1836#ifdef INVARIANTS
1837	bv = &bo->bo_clean;
1838	bp = TAILQ_FIRST(&bv->bv_hd);
1839	KASSERT(bp == NULL || bp->b_bufobj == bo,
1840	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
1841	bp = TAILQ_LAST(&bv->bv_hd, buflists);
1842	KASSERT(bp == NULL || bp->b_bufobj == bo,
1843	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
1844	bv = &bo->bo_dirty;
1845	bp = TAILQ_FIRST(&bv->bv_hd);
1846	KASSERT(bp == NULL || bp->b_bufobj == bo,
1847	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
1848	bp = TAILQ_LAST(&bv->bv_hd, buflists);
1849	KASSERT(bp == NULL || bp->b_bufobj == bo,
1850	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
1851#endif
1852	VI_UNLOCK(vp);
1853}
1854
1855/*
1856 * Increment the use and hold counts on the vnode, taking care to reference
1857 * the driver's usecount if this is a chardev.  The vholdl() will remove
1858 * the vnode from the free list if it is presently free.  Requires the
1859 * vnode interlock and returns with it held.
1860 */
1861static void
1862v_incr_usecount(struct vnode *vp)
1863{
1864
1865	CTR3(KTR_VFS, "v_incr_usecount: vp %p holdcnt %d usecount %d\n",
1866	    vp, vp->v_holdcnt, vp->v_usecount);
1867	vp->v_usecount++;
1868	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
1869		dev_lock();
1870		vp->v_rdev->si_usecount++;
1871		dev_unlock();
1872	}
1873	vholdl(vp);
1874}
1875
1876/*
1877 * Decrement the vnode use and hold count along with the driver's usecount
1878 * if this is a chardev.  The vdropl() below releases the vnode interlock
1879 * as it may free the vnode.
1880 */
1881static void
1882v_decr_usecount(struct vnode *vp)
1883{
1884
1885	CTR3(KTR_VFS, "v_decr_usecount: vp %p holdcnt %d usecount %d\n",
1886	    vp, vp->v_holdcnt, vp->v_usecount);
1887	ASSERT_VI_LOCKED(vp, __FUNCTION__);
1888	VNASSERT(vp->v_usecount > 0, vp,
1889	    ("v_decr_usecount: negative usecount"));
1890	vp->v_usecount--;
1891	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
1892		dev_lock();
1893		vp->v_rdev->si_usecount--;
1894		dev_unlock();
1895	}
1896	vdropl(vp);
1897}
1898
1899/*
1900 * Decrement only the use count and driver use count.  This is intended to
1901 * be paired with a follow-on vdropl() to release the remaining hold count.
1902 * In this way we may vgone() a vnode with a 0 usecount without risk of
1903 * having it end up on a free list because the hold count is kept above 0.
1904 */
1905static void
1906v_decr_useonly(struct vnode *vp)
1907{
1908
1909	CTR3(KTR_VFS, "v_decr_useonly: vp %p holdcnt %d usecount %d\n",
1910	    vp, vp->v_holdcnt, vp->v_usecount);
1911	ASSERT_VI_LOCKED(vp, __FUNCTION__);
1912	VNASSERT(vp->v_usecount > 0, vp,
1913	    ("v_decr_useonly: negative usecount"));
1914	vp->v_usecount--;
1915	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
1916		dev_lock();
1917		vp->v_rdev->si_usecount--;
1918		dev_unlock();
1919	}
1920}
1921
1922/*
1923 * Grab a particular vnode from the free list, increment its
1924 * reference count and lock it. The vnode lock bit is set if the
1925 * vnode is being eliminated in vgone. The process is awakened
1926 * when the transition is completed, and an error is returned to
1927 * indicate that the vnode is no longer usable (possibly having
1928 * been changed to a new filesystem type).
1929 */
1930int
1931vget(vp, flags, td)
1932	struct vnode *vp;
1933	int flags;
1934	struct thread *td;
1935{
1936	int oweinact;
1937	int oldflags;
1938	int usecount;
1939	int error;
1940
1941	error = 0;
1942	oldflags = flags;
1943	oweinact = 0;
1944	if ((flags & LK_INTERLOCK) == 0)
1945		VI_LOCK(vp);
1946	/*
1947	 * If the inactive call was deferred because vput() was called
1948	 * with a shared lock, we have to do it here before another thread
1949	 * gets a reference to data that should be dead.
1950	 */
1951	if (vp->v_iflag & VI_OWEINACT) {
1952		if (flags & LK_NOWAIT) {
1953			VI_UNLOCK(vp);
1954			return (EBUSY);
1955		}
1956		flags &= ~LK_TYPE_MASK;
1957		flags |= LK_EXCLUSIVE;
1958		oweinact = 1;
1959	}
1960	usecount = vp->v_usecount;
1961	v_incr_usecount(vp);
1962	if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) {
1963		VI_LOCK(vp);
1964		/*
1965		 * We must expand vrele() here because we do not want
1966		 * to call VOP_INACTIVE() if the reference count
1967		 * drops back to zero, since it was never really
1968		 * active.
1969		 */
1970		v_decr_usecount(vp);
1971		/*
1972		 * Print a warning when the race below occurs:
1973		 *
1974		 * thread1	thread2
1975		 * -------	-------
1976		 *					v_usecount=0
1977		 * vref(vp)				v_usecount=1
1978		 *		vget(vp)
1979		 *		v_incr_usecount(vp)	v_usecount=2
1980		 *		vn_lock(vp)
1981		 * vrele(vp)				v_usecount=1
1982		 *		v_decr_usecount(vp)	v_usecount=0
1983		 *
1984		 * In such situation VOP_INACTIVE() will not be called for
1985		 * the vnode vp.
1986		 */
1987		if (usecount > 0 && vp->v_usecount == 0)
1988			printf("vinactive() won't be called for vp=%p\n", vp);
1989		return (error);
1990	}
1991	if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
1992		panic("vget: vn_lock failed to return ENOENT\n");
1993	if (oweinact) {
1994		VI_LOCK(vp);
1995		if (vp->v_iflag & VI_OWEINACT)
1996			vinactive(vp, td);
1997		VI_UNLOCK(vp);
1998		if ((oldflags & LK_TYPE_MASK) == 0)
1999			VOP_UNLOCK(vp, 0, td);
2000	}
2001	return (0);
2002}
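
/*
 * Illustrative sketch: the common calling pattern for vget().  The
 * hypothetical caller already has a pointer to vp (e.g. from a name cache
 * or mount list walk) and wants a referenced, locked vnode.  The example_*
 * name is invented and the block is not compiled (#if 0).
 */
#if 0
static int
example_use_vnode(struct vnode *vp, struct thread *td)
{
	int error;

	error = vget(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if (error != 0)
		return (error);
	/* ... operate on the locked, referenced vnode ... */
	vput(vp);	/* drops both the lock and the reference */
	return (0);
}
#endif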
2003
2004/*
2005 * Increase the reference count of a vnode.
2006 */
2007void
2008vref(struct vnode *vp)
2009{
2010
2011	VI_LOCK(vp);
2012	v_incr_usecount(vp);
2013	VI_UNLOCK(vp);
2014}
2015
2016/*
2017 * Return reference count of a vnode.
2018 *
2019 * The results of this call are only guaranteed when some mechanism other
2020 * than the VI lock is used to stop other processes from gaining references
2021 * to the vnode.  This may be the case if the caller holds the only reference.
2022 * This is also useful when stale data is acceptable as race conditions may
2023 * be accounted for by some other means.
2024 */
2025int
2026vrefcnt(struct vnode *vp)
2027{
2028	int usecnt;
2029
2030	VI_LOCK(vp);
2031	usecnt = vp->v_usecount;
2032	VI_UNLOCK(vp);
2033
2034	return (usecnt);
2035}
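
/*
 * Illustrative sketch: as the comment above notes, vrefcnt() is only a
 * snapshot.  It is meaningful in this hypothetical helper only because the
 * caller is assumed to hold the sole reference.  Not compiled (#if 0).
 */
#if 0
static int
example_is_only_reference(struct vnode *vp)
{

	return (vrefcnt(vp) == 1);
}
#endif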
2036
2037
2038/*
2039 * Vnode put/release.
2040 * If count drops to zero, call inactive routine and return to freelist.
2041 */
2042void
2043vrele(vp)
2044	struct vnode *vp;
2045{
2046	struct thread *td = curthread;	/* XXX */
2047
2048	KASSERT(vp != NULL, ("vrele: null vp"));
2049
2050	VI_LOCK(vp);
2051
2052	/* Skip this v_writecount check if we're going to panic below. */
2053	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
2054	    ("vrele: missed vn_close"));
2055
2056	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2057	    vp->v_usecount == 1)) {
2058		v_decr_usecount(vp);
2059		return;
2060	}
2061	if (vp->v_usecount != 1) {
2062#ifdef DIAGNOSTIC
2063		vprint("vrele: negative ref count", vp);
2064#endif
2065		VI_UNLOCK(vp);
2066		panic("vrele: negative ref cnt");
2067	}
2068	/*
2069	 * We want to hold the vnode until the inactive finishes to
2070	 * prevent vgone() races.  We drop the use count here and the
2071	 * hold count below when we're done.
2072	 */
2073	v_decr_useonly(vp);
2074	/*
2075	 * We must call VOP_INACTIVE with the node locked. Mark
2076	 * as VI_DOINGINACT to avoid recursion.
2077	 */
2078	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) {
2079		VI_LOCK(vp);
2080		vinactive(vp, td);
2081		VOP_UNLOCK(vp, 0, td);
2082	} else
2083		VI_LOCK(vp);
2084	vdropl(vp);
2085}
2086
2087/*
2088 * Release an already locked vnode.  This gives the same effect as
2089 * unlock+vrele(), but takes less time and avoids releasing and
2090 * re-acquiring the lock (as vrele() acquires the lock internally).
2091 */
2092void
2093vput(vp)
2094	struct vnode *vp;
2095{
2096	struct thread *td = curthread;	/* XXX */
2097	int error;
2098
2099	KASSERT(vp != NULL, ("vput: null vp"));
2100	ASSERT_VOP_LOCKED(vp, "vput");
2101	VI_LOCK(vp);
2102	/* Skip this v_writecount check if we're going to panic below. */
2103	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
2104	    ("vput: missed vn_close"));
2105	error = 0;
2106
2107	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2108	    vp->v_usecount == 1)) {
2109		VOP_UNLOCK(vp, 0, td);
2110		v_decr_usecount(vp);
2111		return;
2112	}
2113
2114	if (vp->v_usecount != 1) {
2115#ifdef DIAGNOSTIC
2116		vprint("vput: negative ref count", vp);
2117#endif
2118		panic("vput: negative ref cnt");
2119	}
2120	/*
2121	 * We want to hold the vnode until the inactive finishes to
2122	 * prevent vgone() races.  We drop the use count here and the
2123	 * hold count below when we're done.
2124	 */
2125	v_decr_useonly(vp);
2126	vp->v_iflag |= VI_OWEINACT;
2127	if (VOP_ISLOCKED(vp, NULL) != LK_EXCLUSIVE) {
2128		error = VOP_LOCK(vp, LK_EXCLUPGRADE|LK_INTERLOCK|LK_NOWAIT, td);
2129		VI_LOCK(vp);
2130		if (error)
2131			goto done;
2132	}
2133	if (vp->v_iflag & VI_OWEINACT)
2134		vinactive(vp, td);
2135	VOP_UNLOCK(vp, 0, td);
2136done:
2137	vdropl(vp);
2138}
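
/*
 * Illustrative sketch: for a locked, referenced vnode the single vput()
 * call below is equivalent to, but cheaper than, the unlock-then-vrele()
 * sequence mentioned in the comment above.  The example_* name is invented
 * and the block is not compiled (#if 0).
 */
#if 0
static void
example_release_locked_vnode(struct vnode *vp, struct thread *td __unused)
{

	vput(vp);
	/*
	 * Equivalent but slower:
	 *	VOP_UNLOCK(vp, 0, td);
	 *	vrele(vp);
	 */
}
#endif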
2139
2140/*
2141 * Somebody doesn't want the vnode recycled.
2142 */
2143void
2144vhold(struct vnode *vp)
2145{
2146
2147	VI_LOCK(vp);
2148	vholdl(vp);
2149	VI_UNLOCK(vp);
2150}
2151
2152void
2153vholdl(struct vnode *vp)
2154{
2155
2156	vp->v_holdcnt++;
2157	if (VSHOULDBUSY(vp))
2158		vbusy(vp);
2159}
2160
2161/*
2162 * Note that there is one less holder who cares about this vnode.  vdrop() is the
2163 * opposite of vhold().
2164 */
2165void
2166vdrop(struct vnode *vp)
2167{
2168
2169	VI_LOCK(vp);
2170	vdropl(vp);
2171}
2172
2173/*
2174 * Drop the hold count of the vnode.  If this is the last reference to
2175 * the vnode, we will free it if it has been vgone'd; otherwise it is
2176 * placed on the free list.
2177 */
2178static void
2179vdropl(struct vnode *vp)
2180{
2181
2182	if (vp->v_holdcnt <= 0)
2183		panic("vdrop: holdcnt %d", vp->v_holdcnt);
2184	vp->v_holdcnt--;
2185	if (vp->v_holdcnt == 0) {
2186		if (vp->v_iflag & VI_DOOMED) {
2187			vdestroy(vp);
2188			return;
2189		} else
2190			vfree(vp);
2191	}
2192	VI_UNLOCK(vp);
2193}
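
/*
 * Illustrative sketch: a hold keeps the vnode from being freed or recycled
 * while a hypothetical caller temporarily works without the usual
 * reference, e.g. across a sleep.  The example_* name is invented and the
 * block is not compiled (#if 0).
 */
#if 0
static void
example_hold_across_sleep(struct vnode *vp)
{

	vhold(vp);
	/* ... sleep or drop other locks; vp cannot be freed here ... */
	vdrop(vp);
}
#endif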
2194
2195/*
2196 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
2197 * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
2198 * OWEINACT tracks whether a vnode missed a call to inactive due to a
2199 * failed lock upgrade.
2200 */
2201static void
2202vinactive(struct vnode *vp, struct thread *td)
2203{
2204
2205	ASSERT_VOP_LOCKED(vp, "vinactive");
2206	ASSERT_VI_LOCKED(vp, "vinactive");
2207	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
2208	    ("vinactive: recursed on VI_DOINGINACT"));
2209	vp->v_iflag |= VI_DOINGINACT;
2210	vp->v_iflag &= ~VI_OWEINACT;
2211	VI_UNLOCK(vp);
2212	VOP_INACTIVE(vp, td);
2213	VI_LOCK(vp);
2214	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
2215	    ("vinactive: lost VI_DOINGINACT"));
2216	vp->v_iflag &= ~VI_DOINGINACT;
2217}
2218
2219/*
2220 * Remove any vnodes in the vnode table belonging to mount point mp.
2221 *
2222 * If FORCECLOSE is not specified, there should not be any active ones,
2223 * return error if any are found (nb: this is a user error, not a
2224 * system error). If FORCECLOSE is specified, detach any active vnodes
2225 * that are found.
2226 *
2227 * If WRITECLOSE is set, only flush out regular file vnodes open for
2228 * writing.
2229 *
2230 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2231 *
2232 * `rootrefs' specifies the base reference count for the root vnode
2233 * of this filesystem. The root vnode is considered busy if its
2234 * v_usecount exceeds this value. On a successful return, vflush()
2235 * will call vrele() on the root vnode exactly rootrefs times.
2236 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2237 * be zero.
2238 */
2239#ifdef DIAGNOSTIC
2240static int busyprt = 0;		/* print out busy vnodes */
2241SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
2242#endif
2243
2244int
2245vflush(mp, rootrefs, flags, td)
2246	struct mount *mp;
2247	int rootrefs;
2248	int flags;
2249	struct thread *td;
2250{
2251	struct vnode *vp, *mvp, *rootvp = NULL;
2252	struct vattr vattr;
2253	int busy = 0, error;
2254
2255	CTR1(KTR_VFS, "vflush: mp %p", mp);
2256	if (rootrefs > 0) {
2257		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2258		    ("vflush: bad args"));
2259		/*
2260		 * Get the filesystem root vnode. We can vput() it
2261		 * immediately, since with rootrefs > 0, it won't go away.
2262		 */
2263		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp, td)) != 0)
2264			return (error);
2265		vput(rootvp);
2266
2267	}
2268	MNT_ILOCK(mp);
2269loop:
2270	MNT_VNODE_FOREACH(vp, mp, mvp) {
2271
2272		VI_LOCK(vp);
2273		vholdl(vp);
2274		MNT_IUNLOCK(mp);
2275		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td);
2276		if (error) {
2277			vdrop(vp);
2278			MNT_ILOCK(mp);
2279			MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
2280			goto loop;
2281		}
2282		/*
2283		 * Skip over vnodes marked VV_SYSTEM.
2284		 */
2285		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2286			VOP_UNLOCK(vp, 0, td);
2287			vdrop(vp);
2288			MNT_ILOCK(mp);
2289			continue;
2290		}
2291		/*
2292		 * If WRITECLOSE is set, flush out unlinked but still open
2293		 * files (even if open only for reading) and regular file
2294		 * vnodes open for writing.
2295		 */
2296		if (flags & WRITECLOSE) {
2297			error = VOP_GETATTR(vp, &vattr, td->td_ucred, td);
2298			VI_LOCK(vp);
2299
2300			if ((vp->v_type == VNON ||
2301			    (error == 0 && vattr.va_nlink > 0)) &&
2302			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
2303				VOP_UNLOCK(vp, 0, td);
2304				vdropl(vp);
2305				MNT_ILOCK(mp);
2306				continue;
2307			}
2308		} else
2309			VI_LOCK(vp);
2310		/*
2311		 * With v_usecount == 0, all we need to do is clear out the
2312		 * vnode data structures and we are done.
2313		 *
2314		 * If FORCECLOSE is set, forcibly close the vnode.
2315		 */
2316		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
2317			VNASSERT(vp->v_usecount == 0 ||
2318			    (vp->v_type != VCHR && vp->v_type != VBLK), vp,
2319			    ("device VNODE %p is FORCECLOSED", vp));
2320			vgonel(vp);
2321		} else {
2322			busy++;
2323#ifdef DIAGNOSTIC
2324			if (busyprt)
2325				vprint("vflush: busy vnode", vp);
2326#endif
2327		}
2328		VOP_UNLOCK(vp, 0, td);
2329		vdropl(vp);
2330		MNT_ILOCK(mp);
2331	}
2332	MNT_IUNLOCK(mp);
2333	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2334		/*
2335		 * If just the root vnode is busy, and if its refcount
2336		 * is equal to `rootrefs', then go ahead and kill it.
2337		 */
2338		VI_LOCK(rootvp);
2339		KASSERT(busy > 0, ("vflush: not busy"));
2340		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
2341		    ("vflush: usecount %d < rootrefs %d",
2342		     rootvp->v_usecount, rootrefs));
2343		if (busy == 1 && rootvp->v_usecount == rootrefs) {
2344			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK, td);
2345			vgone(rootvp);
2346			VOP_UNLOCK(rootvp, 0, td);
2347			busy = 0;
2348		} else
2349			VI_UNLOCK(rootvp);
2350	}
2351	if (busy)
2352		return (EBUSY);
2353	for (; rootrefs > 0; rootrefs--)
2354		vrele(rootvp);
2355	return (0);
2356}
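
/*
 * Illustrative sketch: the shape of a hypothetical filesystem unmount path
 * driving vflush().  rootrefs is 0 because this caller holds no extra
 * references on the root vnode.  The example_* name is invented and the
 * block is not compiled (#if 0).
 */
#if 0
static int
example_unmount_vnodes(struct mount *mp, int mntflags, struct thread *td)
{
	int flags;

	flags = (mntflags & MNT_FORCE) ? FORCECLOSE : 0;
	return (vflush(mp, 0, flags, td));
}
#endif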
2357
2358/*
2359 * Recycle an unused vnode to the front of the free list.
2360 */
2361int
2362vrecycle(struct vnode *vp, struct thread *td)
2363{
2364	int recycled;
2365
2366	ASSERT_VOP_LOCKED(vp, "vrecycle");
2367	recycled = 0;
2368	VI_LOCK(vp);
2369	if (vp->v_usecount == 0) {
2370		recycled = 1;
2371		vgonel(vp);
2372	}
2373	VI_UNLOCK(vp);
2374	return (recycled);
2375}
2376
2377/*
2378 * Eliminate all activity associated with a vnode
2379 * in preparation for reuse.
2380 */
2381void
2382vgone(struct vnode *vp)
2383{
2384	VI_LOCK(vp);
2385	vgonel(vp);
2386	VI_UNLOCK(vp);
2387}
2388
2389/*
2390 * vgone, with the vp interlock held.
2391 */
2392void
2393vgonel(struct vnode *vp)
2394{
2395	struct thread *td;
2396	int oweinact;
2397	int active;
2398
2399	CTR1(KTR_VFS, "vgonel: vp %p", vp);
2400	ASSERT_VOP_LOCKED(vp, "vgonel");
2401	ASSERT_VI_LOCKED(vp, "vgonel");
2402#if 0
2403	/* XXX Need to fix ttyvp before I enable this. */
2404	VNASSERT(vp->v_holdcnt, vp,
2405	    ("vgonel: vp %p has no reference.", vp));
2406#endif
2407	td = curthread;
2408
2409	/*
2410	 * Don't vgonel if we're already doomed.
2411	 */
2412	if (vp->v_iflag & VI_DOOMED)
2413		return;
2414	vp->v_iflag |= VI_DOOMED;
2415	/*
2416	 * Check to see if the vnode is in use.  If so, we have to call
2417	 * VOP_CLOSE() and VOP_INACTIVE().
2418	 */
2419	active = vp->v_usecount;
2420	oweinact = (vp->v_iflag & VI_OWEINACT);
2421	VI_UNLOCK(vp);
2422	/*
2423	 * Clean out any buffers associated with the vnode.
2424	 * If the flush fails, just toss the buffers.
2425	 */
2426	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
2427		(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
2428	if (vinvalbuf(vp, V_SAVE, td, 0, 0) != 0)
2429		vinvalbuf(vp, 0, td, 0, 0);
2430
2431	/*
2432	 * If purging an active vnode, it must be closed and
2433	 * deactivated before being reclaimed.
2434	 */
2435	if (active)
2436		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2437	if (oweinact || active) {
2438		VI_LOCK(vp);
2439		if ((vp->v_iflag & VI_DOINGINACT) == 0)
2440			vinactive(vp, td);
2441		VI_UNLOCK(vp);
2442	}
2443	/*
2444	 * Reclaim the vnode.
2445	 */
2446	if (VOP_RECLAIM(vp, td))
2447		panic("vgone: cannot reclaim");
2448	VNASSERT(vp->v_object == NULL, vp,
2449	    ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
2450	/*
2451	 * Delete from old mount point vnode list.
2452	 */
2453	delmntque(vp);
2454	cache_purge(vp);
2455	/*
2456	 * Done with purge, reset to the standard lock and invalidate
2457	 * the vnode.
2458	 */
2459	VI_LOCK(vp);
2460	vp->v_vnlock = &vp->v_lock;
2461	vp->v_op = &dead_vnodeops;
2462	vp->v_tag = "none";
2463	vp->v_type = VBAD;
2464}
2465
2466/*
2467 * Calculate the total number of references to a special device.
2468 */
2469int
2470vcount(vp)
2471	struct vnode *vp;
2472{
2473	int count;
2474
2475	dev_lock();
2476	count = vp->v_rdev->si_usecount;
2477	dev_unlock();
2478	return (count);
2479}
2480
2481/*
2482 * Same as above, but using the struct cdev * as the argument.
2483 */
2484int
2485count_dev(dev)
2486	struct cdev *dev;
2487{
2488	int count;
2489
2490	dev_lock();
2491	count = dev->si_usecount;
2492	dev_unlock();
2493	return(count);
2494}
2495
2496/*
2497 * Print out a description of a vnode.
2498 */
2499static char *typename[] =
2500{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
2501 "VMARKER"};
2502
2503void
2504vn_printf(struct vnode *vp, const char *fmt, ...)
2505{
2506	va_list ap;
2507	char buf[96];
2508
2509	va_start(ap, fmt);
2510	vprintf(fmt, ap);
2511	va_end(ap);
2512	printf("%p: ", (void *)vp);
2513	printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
2514	printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
2515	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
2516	buf[0] = '\0';
2517	buf[1] = '\0';
2518	if (vp->v_vflag & VV_ROOT)
2519		strcat(buf, "|VV_ROOT");
2520	if (vp->v_vflag & VV_TEXT)
2521		strcat(buf, "|VV_TEXT");
2522	if (vp->v_vflag & VV_SYSTEM)
2523		strcat(buf, "|VV_SYSTEM");
2524	if (vp->v_iflag & VI_DOOMED)
2525		strcat(buf, "|VI_DOOMED");
2526	if (vp->v_iflag & VI_FREE)
2527		strcat(buf, "|VI_FREE");
2528	printf("    flags (%s)\n", buf + 1);
2529	if (mtx_owned(VI_MTX(vp)))
2530		printf(" VI_LOCKed");
2531	if (vp->v_object != NULL)
2532		printf("    v_object %p ref %d pages %d\n",
2533		    vp->v_object, vp->v_object->ref_count,
2534		    vp->v_object->resident_page_count);
2535	printf("    ");
2536	lockmgr_printinfo(vp->v_vnlock);
2537	printf("\n");
2538	if (vp->v_data != NULL)
2539		VOP_PRINT(vp);
2540}
2541
2542#ifdef DDB
2543#include <ddb/ddb.h>
2544/*
2545 * List all of the locked vnodes in the system.
2546 * Called when debugging the kernel.
2547 */
2548DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
2549{
2550	struct mount *mp, *nmp;
2551	struct vnode *vp;
2552
2553	/*
2554	 * Note: because this is DDB, we can't obey the locking semantics
2555	 * for these structures, which means we could catch an inconsistent
2556	 * state and dereference a nasty pointer.  Not much to be done
2557	 * about that.
2558	 */
2559	printf("Locked vnodes\n");
2560	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2561		nmp = TAILQ_NEXT(mp, mnt_list);
2562		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2563			if (VOP_ISLOCKED(vp, NULL))
2564				vprint("", vp);
2565		}
2566		nmp = TAILQ_NEXT(mp, mnt_list);
2567	}
2568}
2569#endif
2570
2571/*
2572 * Fill in a struct xvfsconf based on a struct vfsconf.
2573 */
2574static void
2575vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
2576{
2577
2578	strcpy(xvfsp->vfc_name, vfsp->vfc_name);
2579	xvfsp->vfc_typenum = vfsp->vfc_typenum;
2580	xvfsp->vfc_refcount = vfsp->vfc_refcount;
2581	xvfsp->vfc_flags = vfsp->vfc_flags;
2582	/*
2583	 * These are unused in userland; we keep them
2584	 * so as not to break binary compatibility.
2585	 */
2586	xvfsp->vfc_vfsops = NULL;
2587	xvfsp->vfc_next = NULL;
2588}
2589
2590/*
2591 * Top level filesystem related information gathering.
2592 */
2593static int
2594sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
2595{
2596	struct vfsconf *vfsp;
2597	struct xvfsconf xvfsp;
2598	int error;
2599
2600	error = 0;
2601	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
2602		bzero(&xvfsp, sizeof(xvfsp));
2603		vfsconf2x(vfsp, &xvfsp);
2604		error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp);
2605		if (error)
2606			break;
2607	}
2608	return (error);
2609}
2610
2611SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist,
2612    "S,xvfsconf", "List of all configured filesystems");
2613
2614#ifndef BURN_BRIDGES
2615static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
2616
2617static int
2618vfs_sysctl(SYSCTL_HANDLER_ARGS)
2619{
2620	int *name = (int *)arg1 - 1;	/* XXX */
2621	u_int namelen = arg2 + 1;	/* XXX */
2622	struct vfsconf *vfsp;
2623	struct xvfsconf xvfsp;
2624
2625	printf("WARNING: userland calling deprecated sysctl, "
2626	    "please rebuild world\n");
2627
2628#if 1 || defined(COMPAT_PRELITE2)
2629	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2630	if (namelen == 1)
2631		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2632#endif
2633
2634	switch (name[1]) {
2635	case VFS_MAXTYPENUM:
2636		if (namelen != 2)
2637			return (ENOTDIR);
2638		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2639	case VFS_CONF:
2640		if (namelen != 3)
2641			return (ENOTDIR);	/* overloaded */
2642		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
2643			if (vfsp->vfc_typenum == name[2])
2644				break;
2645		if (vfsp == NULL)
2646			return (EOPNOTSUPP);
2647		bzero(&xvfsp, sizeof(xvfsp));
2648		vfsconf2x(vfsp, &xvfsp);
2649		return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
2650	}
2651	return (EOPNOTSUPP);
2652}
2653
2654static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP,
2655	vfs_sysctl, "Generic filesystem");
2656
2657#if 1 || defined(COMPAT_PRELITE2)
2658
2659static int
2660sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
2661{
2662	int error;
2663	struct vfsconf *vfsp;
2664	struct ovfsconf ovfs;
2665
2666	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
2667		bzero(&ovfs, sizeof(ovfs));
2668		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
2669		strcpy(ovfs.vfc_name, vfsp->vfc_name);
2670		ovfs.vfc_index = vfsp->vfc_typenum;
2671		ovfs.vfc_refcount = vfsp->vfc_refcount;
2672		ovfs.vfc_flags = vfsp->vfc_flags;
2673		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2674		if (error)
2675			return error;
2676	}
2677	return 0;
2678}
2679
2680#endif /* 1 || COMPAT_PRELITE2 */
2681#endif /* !BURN_BRIDGES */
2682
2683#define KINFO_VNODESLOP		10
2684#ifdef notyet
2685/*
2686 * Dump vnode list (via sysctl).
2687 */
2688/* ARGSUSED */
2689static int
2690sysctl_vnode(SYSCTL_HANDLER_ARGS)
2691{
2692	struct xvnode *xvn;
2693	struct thread *td = req->td;
2694	struct mount *mp;
2695	struct vnode *vp;
2696	int error, len, n;
2697
2698	/*
2699	 * Stale numvnodes access is not fatal here.
2700	 */
2701	req->lock = 0;
2702	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
2703	if (!req->oldptr)
2704		/* Make an estimate */
2705		return (SYSCTL_OUT(req, 0, len));
2706
2707	error = sysctl_wire_old_buffer(req, 0);
2708	if (error != 0)
2709		return (error);
2710	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
2711	n = 0;
2712	mtx_lock(&mountlist_mtx);
2713	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2714		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td))
2715			continue;
2716		MNT_ILOCK(mp);
2717		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2718			if (n == len)
2719				break;
2720			vref(vp);
2721			xvn[n].xv_size = sizeof *xvn;
2722			xvn[n].xv_vnode = vp;
2723			xvn[n].xv_id = 0;	/* XXX compat */
2724#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
2725			XV_COPY(usecount);
2726			XV_COPY(writecount);
2727			XV_COPY(holdcnt);
2728			XV_COPY(mount);
2729			XV_COPY(numoutput);
2730			XV_COPY(type);
2731#undef XV_COPY
2732			xvn[n].xv_flag = vp->v_vflag;
2733
2734			switch (vp->v_type) {
2735			case VREG:
2736			case VDIR:
2737			case VLNK:
2738				break;
2739			case VBLK:
2740			case VCHR:
2741				if (vp->v_rdev == NULL) {
2742					vrele(vp);
2743					continue;
2744				}
2745				xvn[n].xv_dev = dev2udev(vp->v_rdev);
2746				break;
2747			case VSOCK:
2748				xvn[n].xv_socket = vp->v_socket;
2749				break;
2750			case VFIFO:
2751				xvn[n].xv_fifo = vp->v_fifoinfo;
2752				break;
2753			case VNON:
2754			case VBAD:
2755			default:
2756				/* shouldn't happen? */
2757				vrele(vp);
2758				continue;
2759			}
2760			vrele(vp);
2761			++n;
2762		}
2763		MNT_IUNLOCK(mp);
2764		mtx_lock(&mountlist_mtx);
2765		vfs_unbusy(mp, td);
2766		if (n == len)
2767			break;
2768	}
2769	mtx_unlock(&mountlist_mtx);
2770
2771	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
2772	free(xvn, M_TEMP);
2773	return (error);
2774}
2775
2776SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
2777	0, 0, sysctl_vnode, "S,xvnode", "");
2778#endif
2779
2780/*
2781 * Unmount all filesystems. The list is traversed in reverse order
2782 * of mounting to avoid dependencies.
2783 */
2784void
2785vfs_unmountall()
2786{
2787	struct mount *mp;
2788	struct thread *td;
2789	int error;
2790
2791	KASSERT(curthread != NULL, ("vfs_unmountall: NULL curthread"));
2792	td = curthread;
2793	/*
2794	 * Since this only runs when rebooting, it is not interlocked.
2795	 */
2796	while(!TAILQ_EMPTY(&mountlist)) {
2797		mp = TAILQ_LAST(&mountlist, mntlist);
2798		error = dounmount(mp, MNT_FORCE, td);
2799		if (error) {
2800			TAILQ_REMOVE(&mountlist, mp, mnt_list);
2801			/*
2802			 * XXX: Due to the way in which we mount the root
2803			 * file system off of devfs, devfs will generate a
2804			 * "busy" warning when we try to unmount it before
2805			 * the root.  Don't print that warning, to avoid
2806			 * false-positive errors that may cause needless
2807			 * upset.
2808			 */
2809			if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) {
2810				printf("unmount of %s failed (",
2811				    mp->mnt_stat.f_mntonname);
2812				if (error == EBUSY)
2813					printf("BUSY)\n");
2814				else
2815					printf("%d)\n", error);
2816			}
2817		} else {
2818			/* The unmount has removed mp from the mountlist */
2819		}
2820	}
2821}
2822
2823/*
2824 * Perform msync on all vnodes under a mount point.
2825 * The mount point must be locked.
2826 */
2827void
2828vfs_msync(struct mount *mp, int flags)
2829{
2830	struct vnode *vp, *mvp;
2831	struct vm_object *obj;
2832
2833	MNT_ILOCK(mp);
2834	MNT_VNODE_FOREACH(vp, mp, mvp) {
2835		VI_LOCK(vp);
2836		if ((vp->v_iflag & VI_OBJDIRTY) &&
2837		    (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
2838			MNT_IUNLOCK(mp);
2839			if (!vget(vp,
2840			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
2841			    curthread)) {
2842				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
2843					vput(vp);
2844					MNT_ILOCK(mp);
2845					continue;
2846				}
2847
2848				obj = vp->v_object;
2849				if (obj != NULL) {
2850					VM_OBJECT_LOCK(obj);
2851					vm_object_page_clean(obj, 0, 0,
2852					    flags == MNT_WAIT ?
2853					    OBJPC_SYNC : OBJPC_NOSYNC);
2854					VM_OBJECT_UNLOCK(obj);
2855				}
2856				vput(vp);
2857			}
2858			MNT_ILOCK(mp);
2859		} else
2860			VI_UNLOCK(vp);
2861	}
2862	MNT_IUNLOCK(mp);
2863}
2864
2865/*
2866 * Mark a vnode as free, putting it up for recycling.
2867 */
2868static void
2869vfree(struct vnode *vp)
2870{
2871
2872	CTR1(KTR_VFS, "vfree vp %p", vp);
2873	ASSERT_VI_LOCKED(vp, "vfree");
2874	mtx_lock(&vnode_free_list_mtx);
2875	VNASSERT(vp->v_op != NULL, vp, ("vfree: vnode already reclaimed."));
2876	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free"));
2877	VNASSERT(VSHOULDFREE(vp), vp, ("vfree: freeing when we shouldn't"));
2878	VNASSERT((vp->v_iflag & VI_DOOMED) == 0, vp,
2879	    ("vfree: Freeing doomed vnode"));
2880	if (vp->v_iflag & VI_AGE) {
2881		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2882	} else {
2883		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
2884	}
2885	freevnodes++;
2886	vp->v_iflag &= ~VI_AGE;
2887	vp->v_iflag |= VI_FREE;
2888	mtx_unlock(&vnode_free_list_mtx);
2889}
2890
2891/*
2892 * Opposite of vfree() - mark a vnode as in use.
2893 */
2894static void
2895vbusy(struct vnode *vp)
2896{
2897	CTR1(KTR_VFS, "vbusy vp %p", vp);
2898	ASSERT_VI_LOCKED(vp, "vbusy");
2899	VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free"));
2900	VNASSERT(vp->v_op != NULL, vp, ("vbusy: vnode already reclaimed."));
2901
2902	mtx_lock(&vnode_free_list_mtx);
2903	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2904	freevnodes--;
2905	vp->v_iflag &= ~(VI_FREE|VI_AGE);
2906	mtx_unlock(&vnode_free_list_mtx);
2907}
2908
2909/*
2910 * Initialize the per-vnode helper structure that holds poll-related state.
2911 */
2912void
2913v_addpollinfo(struct vnode *vp)
2914{
2915	struct vpollinfo *vi;
2916
2917	vi = uma_zalloc(vnodepoll_zone, M_WAITOK);
2918	if (vp->v_pollinfo != NULL) {
2919		uma_zfree(vnodepoll_zone, vi);
2920		return;
2921	}
2922	vp->v_pollinfo = vi;
2923	mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
2924	knlist_init(&vp->v_pollinfo->vpi_selinfo.si_note, vp, vfs_knllock,
2925	    vfs_knlunlock, vfs_knllocked);
2926}
2927
2928/*
2929 * Record a process's interest in events which might happen to
2930 * a vnode.  Because poll uses the historic select-style interface
2931 * internally, this routine serves as both the ``check for any
2932 * pending events'' and the ``record my interest in future events''
2933 * functions.  (These are done together, while the lock is held,
2934 * to avoid race conditions.)
2935 */
2936int
2937vn_pollrecord(vp, td, events)
2938	struct vnode *vp;
2939	struct thread *td;
2940	short events;
2941{
2942
2943	if (vp->v_pollinfo == NULL)
2944		v_addpollinfo(vp);
2945	mtx_lock(&vp->v_pollinfo->vpi_lock);
2946	if (vp->v_pollinfo->vpi_revents & events) {
2947		/*
2948		 * This leaves events we are not interested
2949		 * in available for the other process which
2950		 * presumably had requested them
2951		 * (otherwise they would never have been
2952		 * recorded).
2953		 */
2954		events &= vp->v_pollinfo->vpi_revents;
2955		vp->v_pollinfo->vpi_revents &= ~events;
2956
2957		mtx_unlock(&vp->v_pollinfo->vpi_lock);
2958		return events;
2959	}
2960	vp->v_pollinfo->vpi_events |= events;
2961	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
2962	mtx_unlock(&vp->v_pollinfo->vpi_lock);
2963	return 0;
2964}
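
/*
 * Illustrative sketch: a filesystem with no special event sources can
 * implement its poll VOP by delegating to vn_pollrecord(), which both
 * reports pending events and records interest in future ones.  The
 * example_* name is invented and the block is not compiled (#if 0).
 */
#if 0
static int
example_fs_poll(struct vnode *vp, int events, struct thread *td)
{

	return (vn_pollrecord(vp, td, events));
}
#endif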
2965
2966/*
2967 * Routine to create and manage a filesystem syncer vnode.
2968 */
2969#define sync_close ((int (*)(struct  vop_close_args *))nullop)
2970static int	sync_fsync(struct  vop_fsync_args *);
2971static int	sync_inactive(struct  vop_inactive_args *);
2972static int	sync_reclaim(struct  vop_reclaim_args *);
2973
2974static struct vop_vector sync_vnodeops = {
2975	.vop_bypass =	VOP_EOPNOTSUPP,
2976	.vop_close =	sync_close,		/* close */
2977	.vop_fsync =	sync_fsync,		/* fsync */
2978	.vop_inactive =	sync_inactive,	/* inactive */
2979	.vop_reclaim =	sync_reclaim,	/* reclaim */
2980	.vop_lock =	vop_stdlock,	/* lock */
2981	.vop_unlock =	vop_stdunlock,	/* unlock */
2982	.vop_islocked =	vop_stdislocked,	/* islocked */
2983};
2984
2985/*
2986 * Create a new filesystem syncer vnode for the specified mount point.
2987 */
2988int
2989vfs_allocate_syncvnode(mp)
2990	struct mount *mp;
2991{
2992	struct vnode *vp;
2993	static long start, incr, next;
2994	int error;
2995
2996	/* Allocate a new vnode */
2997	if ((error = getnewvnode("syncer", mp, &sync_vnodeops, &vp)) != 0) {
2998		mp->mnt_syncer = NULL;
2999		return (error);
3000	}
3001	vp->v_type = VNON;
3002	/*
3003	 * Place the vnode onto the syncer worklist. We attempt to
3004	 * scatter them about on the list so that they will go off
3005	 * at evenly distributed times even if all the filesystems
3006	 * are mounted at once.
3007	 */
3008	next += incr;
3009	if (next == 0 || next > syncer_maxdelay) {
3010		start /= 2;
3011		incr /= 2;
3012		if (start == 0) {
3013			start = syncer_maxdelay / 2;
3014			incr = syncer_maxdelay;
3015		}
3016		next = start;
3017	}
3018	VI_LOCK(vp);
3019	vn_syncer_add_to_worklist(&vp->v_bufobj,
3020	    syncdelay > 0 ? next % syncdelay : 0);
3021	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
3022	mtx_lock(&sync_mtx);
3023	sync_vnode_count++;
3024	mtx_unlock(&sync_mtx);
3025	VI_UNLOCK(vp);
3026	mp->mnt_syncer = vp;
3027	return (0);
3028}
3029
3030/*
3031 * Do a lazy sync of the filesystem.
3032 */
3033static int
3034sync_fsync(ap)
3035	struct vop_fsync_args /* {
3036		struct vnode *a_vp;
3037		struct ucred *a_cred;
3038		int a_waitfor;
3039		struct thread *a_td;
3040	} */ *ap;
3041{
3042	struct vnode *syncvp = ap->a_vp;
3043	struct mount *mp = syncvp->v_mount;
3044	struct thread *td = ap->a_td;
3045	int error, asyncflag;
3046	struct bufobj *bo;
3047
3048	/*
3049	 * We only need to do something if this is a lazy evaluation.
3050	 */
3051	if (ap->a_waitfor != MNT_LAZY)
3052		return (0);
3053
3054	/*
3055	 * Move ourselves to the back of the sync list.
3056	 */
3057	bo = &syncvp->v_bufobj;
3058	BO_LOCK(bo);
3059	vn_syncer_add_to_worklist(bo, syncdelay);
3060	BO_UNLOCK(bo);
3061
3062	/*
3063	 * Walk the list of vnodes pushing all that are dirty and
3064	 * not already on the sync list.
3065	 */
3066	mtx_lock(&mountlist_mtx);
3067	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
3068		mtx_unlock(&mountlist_mtx);
3069		return (0);
3070	}
3071	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3072		vfs_unbusy(mp, td);
3073		return (0);
3074	}
3075	asyncflag = mp->mnt_flag & MNT_ASYNC;
3076	mp->mnt_flag &= ~MNT_ASYNC;
3077	vfs_msync(mp, MNT_NOWAIT);
3078	error = VFS_SYNC(mp, MNT_LAZY, td);
3079	if (asyncflag)
3080		mp->mnt_flag |= MNT_ASYNC;
3081	vn_finished_write(mp);
3082	vfs_unbusy(mp, td);
3083	return (error);
3084}
3085
3086/*
3087 * The syncer vnode is no longer referenced.
3088 */
3089static int
3090sync_inactive(ap)
3091	struct vop_inactive_args /* {
3092		struct vnode *a_vp;
3093		struct thread *a_td;
3094	} */ *ap;
3095{
3096
3097	vgone(ap->a_vp);
3098	return (0);
3099}
3100
3101/*
3102 * The syncer vnode is no longer needed and is being decommissioned.
3103 *
3104 * Modifications to the worklist must be protected by sync_mtx.
3105 */
3106static int
3107sync_reclaim(ap)
3108	struct vop_reclaim_args /* {
3109		struct vnode *a_vp;
3110	} */ *ap;
3111{
3112	struct vnode *vp = ap->a_vp;
3113	struct bufobj *bo;
3114
3115	VI_LOCK(vp);
3116	bo = &vp->v_bufobj;
3117	vp->v_mount->mnt_syncer = NULL;
3118	if (bo->bo_flag & BO_ONWORKLST) {
3119		mtx_lock(&sync_mtx);
3120		LIST_REMOVE(bo, bo_synclist);
3121 		syncer_worklist_len--;
3122		sync_vnode_count--;
3123		mtx_unlock(&sync_mtx);
3124		bo->bo_flag &= ~BO_ONWORKLST;
3125	}
3126	VI_UNLOCK(vp);
3127
3128	return (0);
3129}
3130
3131/*
3132 * Check whether a vnode represents a disk device.
3133 */
3134int
3135vn_isdisk(vp, errp)
3136	struct vnode *vp;
3137	int *errp;
3138{
3139	int error;
3140
3141	error = 0;
3142	dev_lock();
3143	if (vp->v_type != VCHR)
3144		error = ENOTBLK;
3145	else if (vp->v_rdev == NULL)
3146		error = ENXIO;
3147	else if (vp->v_rdev->si_devsw == NULL)
3148		error = ENXIO;
3149	else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
3150		error = ENOTBLK;
3151	dev_unlock();
3152	if (errp != NULL)
3153		*errp = error;
3154	return (error == 0);
3155}
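
/*
 * Illustrative sketch: a hypothetical caller that insists on a disk device
 * before issuing disk-only operations; on failure vn_isdisk() reports the
 * reason via the errp argument.  The example_* name is invented and the
 * block is not compiled (#if 0).
 */
#if 0
static int
example_require_disk(struct vnode *vp)
{
	int error;

	if (!vn_isdisk(vp, &error))
		return (error);		/* ENOTBLK or ENXIO */
	/* ... safe to treat vp->v_rdev as a disk from here on ... */
	return (0);
}
#endif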
3156
3157/*
3158 * Common filesystem object access control check routine.  Accepts a
3159 * vnode's type, "mode", uid and gid, requested access mode, credentials,
3160 * and optional call-by-reference privused argument allowing vaccess()
3161 * to indicate to the caller whether privilege was used to satisfy the
3162 * request (obsoleted).  Returns 0 on success, or an errno on failure.
3163 */
3164int
3165vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
3166	enum vtype type;
3167	mode_t file_mode;
3168	uid_t file_uid;
3169	gid_t file_gid;
3170	mode_t acc_mode;
3171	struct ucred *cred;
3172	int *privused;
3173{
3174	mode_t dac_granted;
3175#ifdef CAPABILITIES
3176	mode_t cap_granted;
3177#endif
3178
3179	/*
3180	 * Look for a normal, non-privileged way to access the file/directory
3181	 * as requested.  If it exists, go with that.
3182	 */
3183
3184	if (privused != NULL)
3185		*privused = 0;
3186
3187	dac_granted = 0;
3188
3189	/* Check the owner. */
3190	if (cred->cr_uid == file_uid) {
3191		dac_granted |= VADMIN;
3192		if (file_mode & S_IXUSR)
3193			dac_granted |= VEXEC;
3194		if (file_mode & S_IRUSR)
3195			dac_granted |= VREAD;
3196		if (file_mode & S_IWUSR)
3197			dac_granted |= (VWRITE | VAPPEND);
3198
3199		if ((acc_mode & dac_granted) == acc_mode)
3200			return (0);
3201
3202		goto privcheck;
3203	}
3204
3205	/* Otherwise, check the groups (first match) */
3206	if (groupmember(file_gid, cred)) {
3207		if (file_mode & S_IXGRP)
3208			dac_granted |= VEXEC;
3209		if (file_mode & S_IRGRP)
3210			dac_granted |= VREAD;
3211		if (file_mode & S_IWGRP)
3212			dac_granted |= (VWRITE | VAPPEND);
3213
3214		if ((acc_mode & dac_granted) == acc_mode)
3215			return (0);
3216
3217		goto privcheck;
3218	}
3219
3220	/* Otherwise, check everyone else. */
3221	if (file_mode & S_IXOTH)
3222		dac_granted |= VEXEC;
3223	if (file_mode & S_IROTH)
3224		dac_granted |= VREAD;
3225	if (file_mode & S_IWOTH)
3226		dac_granted |= (VWRITE | VAPPEND);
3227	if ((acc_mode & dac_granted) == acc_mode)
3228		return (0);
3229
3230privcheck:
3231	if (!suser_cred(cred, SUSER_ALLOWJAIL)) {
3232		/* XXX audit: privilege used */
3233		if (privused != NULL)
3234			*privused = 1;
3235		return (0);
3236	}
3237
3238#ifdef CAPABILITIES
3239	/*
3240	 * Build a capability mask to determine if the set of capabilities
3241	 * satisfies the requirements when combined with the granted mask
3242	 * from above.
3243	 * For each capability, if the capability is required, bitwise
3244	 * or the request type onto the cap_granted mask.
3245	 */
3246	cap_granted = 0;
3247
3248	if (type == VDIR) {
3249		/*
3250		 * For directories, use CAP_DAC_READ_SEARCH to satisfy
3251		 * VEXEC requests, instead of CAP_DAC_EXECUTE.
3252		 */
3253		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3254		    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL))
3255			cap_granted |= VEXEC;
3256	} else {
3257		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3258		    !cap_check(cred, NULL, CAP_DAC_EXECUTE, SUSER_ALLOWJAIL))
3259			cap_granted |= VEXEC;
3260	}
3261
3262	if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
3263	    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL))
3264		cap_granted |= VREAD;
3265
3266	if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3267	    !cap_check(cred, NULL, CAP_DAC_WRITE, SUSER_ALLOWJAIL))
3268		cap_granted |= (VWRITE | VAPPEND);
3269
3270	if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3271	    !cap_check(cred, NULL, CAP_FOWNER, SUSER_ALLOWJAIL))
3272		cap_granted |= VADMIN;
3273
3274	if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
3275		/* XXX audit: privilege used */
3276		if (privused != NULL)
3277			*privused = 1;
3278		return (0);
3279	}
3280#endif
3281
3282	return ((acc_mode & VADMIN) ? EPERM : EACCES);
3283}
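
/*
 * Illustrative sketch: the typical shape of a filesystem access check that
 * delegates the mode/uid/gid evaluation to vaccess().  In a real
 * filesystem the mode, uid and gid would come from its own inode; here
 * they are passed in for the example.  The example_* name is invented and
 * the block is not compiled (#if 0).
 */
#if 0
static int
example_fs_access(struct vnode *vp, mode_t file_mode, uid_t uid, gid_t gid,
    mode_t acc_mode, struct ucred *cred)
{

	return (vaccess(vp->v_type, file_mode, uid, gid, acc_mode, cred, NULL));
}
#endif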
3284
3285/*
3286 * Credential check based on process requesting service, and per-attribute
3287 * permissions.
3288 */
3289int
3290extattr_check_cred(struct vnode *vp, int attrnamespace,
3291    struct ucred *cred, struct thread *td, int access)
3292{
3293
3294	/*
3295	 * Kernel-invoked callers always succeed.
3296	 */
3297	if (cred == NOCRED)
3298		return (0);
3299
3300	/*
3301	 * Do not allow privileged processes in jail to directly
3302	 * manipulate system attributes.
3303	 *
3304	 * XXX What capability should apply here?
3305	 * Probably CAP_SYS_SETFFLAG.
3306	 */
3307	switch (attrnamespace) {
3308	case EXTATTR_NAMESPACE_SYSTEM:
3309		/* Potentially should be: return (EPERM); */
3310		return (suser_cred(cred, 0));
3311	case EXTATTR_NAMESPACE_USER:
3312		return (VOP_ACCESS(vp, access, cred, td));
3313	default:
3314		return (EPERM);
3315	}
3316}
3317
3318#ifdef DEBUG_VFS_LOCKS
3319/*
3320 * This only exists to suppress warnings from unlocked specfs accesses.  It is
3321 * no longer OK to have an unlocked VFS.
3322 */
3323#define	IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD)
3324
3325int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
3326SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "");
3327
3328int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
3329SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "");
3330
3331int vfs_badlock_print = 1;	/* Print lock violations. */
3332SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "");
3333
3334#ifdef KDB
3335int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
3336SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "");
3337#endif
3338
3339static void
3340vfs_badlock(const char *msg, const char *str, struct vnode *vp)
3341{
3342
3343#ifdef KDB
3344	if (vfs_badlock_backtrace)
3345		kdb_backtrace();
3346#endif
3347	if (vfs_badlock_print)
3348		printf("%s: %p %s\n", str, (void *)vp, msg);
3349	if (vfs_badlock_ddb)
3350		kdb_enter("lock violation");
3351}
3352
3353void
3354assert_vi_locked(struct vnode *vp, const char *str)
3355{
3356
3357	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
3358		vfs_badlock("interlock is not locked but should be", str, vp);
3359}
3360
3361void
3362assert_vi_unlocked(struct vnode *vp, const char *str)
3363{
3364
3365	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
3366		vfs_badlock("interlock is locked but should not be", str, vp);
3367}
3368
3369void
3370assert_vop_locked(struct vnode *vp, const char *str)
3371{
3372
3373	if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, NULL) == 0)
3374		vfs_badlock("is not locked but should be", str, vp);
3375}
3376
3377void
3378assert_vop_unlocked(struct vnode *vp, const char *str)
3379{
3380
3381	if (vp && !IGNORE_LOCK(vp) &&
3382	    VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE)
3383		vfs_badlock("is locked but should not be", str, vp);
3384}
3385
3386void
3387assert_vop_elocked(struct vnode *vp, const char *str)
3388{
3389
3390	if (vp && !IGNORE_LOCK(vp) &&
3391	    VOP_ISLOCKED(vp, curthread) != LK_EXCLUSIVE)
3392		vfs_badlock("is not exclusive locked but should be", str, vp);
3393}
3394
3395#if 0
3396void
3397assert_vop_elocked_other(struct vnode *vp, const char *str)
3398{
3399
3400	if (vp && !IGNORE_LOCK(vp) &&
3401	    VOP_ISLOCKED(vp, curthread) != LK_EXCLOTHER)
3402		vfs_badlock("is not exclusive locked by another thread",
3403		    str, vp);
3404}
3405
3406void
3407assert_vop_slocked(struct vnode *vp, const char *str)
3408{
3409
3410	if (vp && !IGNORE_LOCK(vp) &&
3411	    VOP_ISLOCKED(vp, curthread) != LK_SHARED)
3412		vfs_badlock("is not locked shared but should be", str, vp);
3413}
3414#endif /* 0 */
3415#endif /* DEBUG_VFS_LOCKS */
3416
3417void
3418vop_rename_pre(void *ap)
3419{
3420	struct vop_rename_args *a = ap;
3421
3422#ifdef DEBUG_VFS_LOCKS
3423	if (a->a_tvp)
3424		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
3425	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
3426	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
3427	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
3428
3429	/* Check the source (from). */
3430	if (a->a_tdvp != a->a_fdvp)
3431		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
3432	if (a->a_tvp != a->a_fvp)
3433		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
3434
3435	/* Check the target. */
3436	if (a->a_tvp)
3437		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
3438	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
3439#endif
3440	if (a->a_tdvp != a->a_fdvp)
3441		vhold(a->a_fdvp);
3442	if (a->a_tvp != a->a_fvp)
3443		vhold(a->a_fvp);
3444	vhold(a->a_tdvp);
3445	if (a->a_tvp)
3446		vhold(a->a_tvp);
3447}
3448
3449void
3450vop_strategy_pre(void *ap)
3451{
3452#ifdef DEBUG_VFS_LOCKS
3453	struct vop_strategy_args *a;
3454	struct buf *bp;
3455
3456	a = ap;
3457	bp = a->a_bp;
3458
3459	/*
3460	 * Cluster ops lock their component buffers but not the IO container.
3461	 */
3462	if ((bp->b_flags & B_CLUSTER) != 0)
3463		return;
3464
3465	if (BUF_REFCNT(bp) < 1) {
3466		if (vfs_badlock_print)
3467			printf(
3468			    "VOP_STRATEGY: bp is not locked but should be\n");
3469		if (vfs_badlock_ddb)
3470			kdb_enter("lock violation");
3471	}
3472#endif
3473}
3474
3475void
3476vop_lookup_pre(void *ap)
3477{
3478#ifdef DEBUG_VFS_LOCKS
3479	struct vop_lookup_args *a;
3480	struct vnode *dvp;
3481
3482	a = ap;
3483	dvp = a->a_dvp;
3484	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
3485	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
3486#endif
3487}
3488
3489void
3490vop_lookup_post(void *ap, int rc)
3491{
3492#ifdef DEBUG_VFS_LOCKS
3493	struct vop_lookup_args *a;
3494	struct vnode *dvp;
3495	struct vnode *vp;
3496
3497	a = ap;
3498	dvp = a->a_dvp;
3499	vp = *(a->a_vpp);
3500
3501	ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
3502	ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
3503
3504	if (!rc)
3505		ASSERT_VOP_LOCKED(vp, "VOP_LOOKUP (child)");
3506#endif
3507}
3508
3509void
3510vop_lock_pre(void *ap)
3511{
3512#ifdef DEBUG_VFS_LOCKS
3513	struct vop_lock_args *a = ap;
3514
3515	if ((a->a_flags & LK_INTERLOCK) == 0)
3516		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
3517	else
3518		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
3519#endif
3520}
3521
3522void
3523vop_lock_post(void *ap, int rc)
3524{
3525#ifdef DEBUG_VFS_LOCKS
3526	struct vop_lock_args *a = ap;
3527
3528	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
3529	if (rc == 0)
3530		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
3531#endif
3532}
3533
3534void
3535vop_unlock_pre(void *ap)
3536{
3537#ifdef DEBUG_VFS_LOCKS
3538	struct vop_unlock_args *a = ap;
3539
3540	if (a->a_flags & LK_INTERLOCK)
3541		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
3542	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
3543#endif
3544}
3545
3546void
3547vop_unlock_post(void *ap, int rc)
3548{
3549#ifdef DEBUG_VFS_LOCKS
3550	struct vop_unlock_args *a = ap;
3551
3552	if (a->a_flags & LK_INTERLOCK)
3553		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
3554#endif
3555}
3556
3557void
3558vop_create_post(void *ap, int rc)
3559{
3560	struct vop_create_args *a = ap;
3561
3562	if (!rc)
3563		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
3564}
3565
3566void
3567vop_link_post(void *ap, int rc)
3568{
3569	struct vop_link_args *a = ap;
3570
3571	if (!rc) {
3572		VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
3573		VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
3574	}
3575}
3576
3577void
3578vop_mkdir_post(void *ap, int rc)
3579{
3580	struct vop_mkdir_args *a = ap;
3581
3582	if (!rc)
3583		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
3584}
3585
3586void
3587vop_mknod_post(void *ap, int rc)
3588{
3589	struct vop_mknod_args *a = ap;
3590
3591	if (!rc)
3592		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
3593}
3594
3595void
3596vop_remove_post(void *ap, int rc)
3597{
3598	struct vop_remove_args *a = ap;
3599
3600	if (!rc) {
3601		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
3602		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
3603	}
3604}
3605
3606void
3607vop_rename_post(void *ap, int rc)
3608{
3609	struct vop_rename_args *a = ap;
3610
3611	if (!rc) {
3612		VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
3613		VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
3614		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
3615		if (a->a_tvp)
3616			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
3617	}
3618	if (a->a_tdvp != a->a_fdvp)
3619		vdrop(a->a_fdvp);
3620	if (a->a_tvp != a->a_fvp)
3621		vdrop(a->a_fvp);
3622	vdrop(a->a_tdvp);
3623	if (a->a_tvp)
3624		vdrop(a->a_tvp);
3625}
3626
3627void
3628vop_rmdir_post(void *ap, int rc)
3629{
3630	struct vop_rmdir_args *a = ap;
3631
3632	if (!rc) {
3633		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
3634		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
3635	}
3636}
3637
3638void
3639vop_setattr_post(void *ap, int rc)
3640{
3641	struct vop_setattr_args *a = ap;
3642
3643	if (!rc)
3644		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
3645}
3646
3647void
3648vop_symlink_post(void *ap, int rc)
3649{
3650	struct vop_symlink_args *a = ap;
3651
3652	if (!rc)
3653		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
3654}
3655
3656static struct knlist fs_knlist;
3657
3658static void
3659vfs_event_init(void *arg)
3660{
3661	knlist_init(&fs_knlist, NULL, NULL, NULL, NULL);
3662}
3663/* XXX - correct order? */
3664SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
3665
3666void
3667vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data __unused)
3668{
3669
3670	KNOTE_UNLOCKED(&fs_knlist, event);
3671}
3672
3673static int	filt_fsattach(struct knote *kn);
3674static void	filt_fsdetach(struct knote *kn);
3675static int	filt_fsevent(struct knote *kn, long hint);
3676
3677struct filterops fs_filtops =
3678	{ 0, filt_fsattach, filt_fsdetach, filt_fsevent };
3679
3680static int
3681filt_fsattach(struct knote *kn)
3682{
3683
3684	kn->kn_flags |= EV_CLEAR;
3685	knlist_add(&fs_knlist, kn, 0);
3686	return (0);
3687}
3688
3689static void
3690filt_fsdetach(struct knote *kn)
3691{
3692
3693	knlist_remove(&fs_knlist, kn, 0);
3694}
3695
3696static int
3697filt_fsevent(struct knote *kn, long hint)
3698{
3699
3700	kn->kn_fflags |= hint;
3701	return (kn->kn_fflags != 0);
3702}
3703
3704static int
3705sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
3706{
3707	struct vfsidctl vc;
3708	int error;
3709	struct mount *mp;
3710
3711	error = SYSCTL_IN(req, &vc, sizeof(vc));
3712	if (error)
3713		return (error);
3714	if (vc.vc_vers != VFS_CTL_VERS1)
3715		return (EINVAL);
3716	mp = vfs_getvfs(&vc.vc_fsid);
3717	if (mp == NULL)
3718		return (ENOENT);
3719	/* ensure that a specific sysctl goes to the right filesystem. */
3720	if (strcmp(vc.vc_fstypename, "*") != 0 &&
3721	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
3722		return (EINVAL);
3723	}
3724	VCTLTOREQ(&vc, req);
3725	return (VFS_SYSCTL(mp, vc.vc_op, req));
3726}
3727
3728SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR,
3729        NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid");
3730
3731/*
3732 * Function to initialize a va_filerev field sensibly.
3733 * XXX: Wouldn't a random number make a lot more sense ??
3734 */
3735u_quad_t
3736init_va_filerev(void)
3737{
3738	struct bintime bt;
3739
3740	getbinuptime(&bt);
3741	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
3742}
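
/*
 * Illustrative sketch: a filesystem would typically seed a per-file
 * revision counter with init_va_filerev() when the file is created and
 * then bump it on every modification; the modrevp pointer stands in for a
 * hypothetical inode field.  Not compiled (#if 0).
 */
#if 0
static void
example_seed_filerev(u_quad_t *modrevp)
{

	*modrevp = init_va_filerev();
}
#endif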
3743
3744static int	filt_vfsread(struct knote *kn, long hint);
3745static int	filt_vfswrite(struct knote *kn, long hint);
3746static int	filt_vfsvnode(struct knote *kn, long hint);
3747static void	filt_vfsdetach(struct knote *kn);
3748static struct filterops vfsread_filtops =
3749	{ 1, NULL, filt_vfsdetach, filt_vfsread };
3750static struct filterops vfswrite_filtops =
3751	{ 1, NULL, filt_vfsdetach, filt_vfswrite };
3752static struct filterops vfsvnode_filtops =
3753	{ 1, NULL, filt_vfsdetach, filt_vfsvnode };
3754
3755static void
3756vfs_knllock(void *arg)
3757{
3758	struct vnode *vp = arg;
3759
3760	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
3761}
3762
3763static void
3764vfs_knlunlock(void *arg)
3765{
3766	struct vnode *vp = arg;
3767
3768	VOP_UNLOCK(vp, 0, curthread);
3769}
3770
3771static int
3772vfs_knllocked(void *arg)
3773{
3774	struct vnode *vp = arg;
3775
3776	return (VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE);
3777}
3778
3779int
3780vfs_kqfilter(struct vop_kqfilter_args *ap)
3781{
3782	struct vnode *vp = ap->a_vp;
3783	struct knote *kn = ap->a_kn;
3784	struct knlist *knl;
3785
3786	switch (kn->kn_filter) {
3787	case EVFILT_READ:
3788		kn->kn_fop = &vfsread_filtops;
3789		break;
3790	case EVFILT_WRITE:
3791		kn->kn_fop = &vfswrite_filtops;
3792		break;
3793	case EVFILT_VNODE:
3794		kn->kn_fop = &vfsvnode_filtops;
3795		break;
3796	default:
3797		return (EINVAL);
3798	}
3799
3800	kn->kn_hook = (caddr_t)vp;
3801
3802	if (vp->v_pollinfo == NULL)
3803		v_addpollinfo(vp);
3804	if (vp->v_pollinfo == NULL)
3805		return (ENOMEM);
3806	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
3807	knlist_add(knl, kn, 0);
3808
3809	return (0);
3810}
3811
3812/*
3813 * Detach knote from vnode
3814 */
3815static void
3816filt_vfsdetach(struct knote *kn)
3817{
3818	struct vnode *vp = (struct vnode *)kn->kn_hook;
3819
3820	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
3821	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
3822}
3823
3824/*ARGSUSED*/
3825static int
3826filt_vfsread(struct knote *kn, long hint)
3827{
3828	struct vnode *vp = (struct vnode *)kn->kn_hook;
3829	struct vattr va;
3830
3831	/*
3832	 * The filesystem is gone, so set the EOF flag and schedule
3833	 * the knote for deletion.
3834	 */
3835	if (hint == NOTE_REVOKE) {
3836		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3837		return (1);
3838	}
3839
3840	if (VOP_GETATTR(vp, &va, curthread->td_ucred, curthread))
3841		return (0);
3842
3843	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
3844	return (kn->kn_data != 0);
3845}
3846
3847/*ARGSUSED*/
3848static int
3849filt_vfswrite(struct knote *kn, long hint)
3850{
3851	/*
3852	 * The filesystem is gone, so set the EOF flag and schedule
3853	 * the knote for deletion.
3854	 */
3855	if (hint == NOTE_REVOKE)
3856		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3857
3858	kn->kn_data = 0;
3859	return (1);
3860}
3861
3862static int
3863filt_vfsvnode(struct knote *kn, long hint)
3864{
3865	if (kn->kn_sfflags & hint)
3866		kn->kn_fflags |= hint;
3867	if (hint == NOTE_REVOKE) {
3868		kn->kn_flags |= EV_EOF;
3869		return (1);
3870	}
3871	return (kn->kn_fflags != 0);
3872}
3873
3874int
3875vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
3876{
3877	int error;
3878
3879	if (dp->d_reclen > ap->a_uio->uio_resid)
3880		return (ENAMETOOLONG);
3881	error = uiomove(dp, dp->d_reclen, ap->a_uio);
3882	if (error) {
3883		if (ap->a_ncookies != NULL) {
3884			if (ap->a_cookies != NULL)
3885				free(ap->a_cookies, M_TEMP);
3886			ap->a_cookies = NULL;
3887			*ap->a_ncookies = 0;
3888		}
3889		return (error);
3890	}
3891	if (ap->a_ncookies == NULL)
3892		return (0);
3893
3894	KASSERT(ap->a_cookies,
3895	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
3896
3897	*ap->a_cookies = realloc(*ap->a_cookies,
3898	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
3899	(*ap->a_cookies)[*ap->a_ncookies] = off;
3900	return (0);
3901}
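
/*
 * Illustrative sketch: a hypothetical readdir loop copying entries out
 * with vfs_read_dirent() until the uio fills up.  example_next_entry() is
 * an invented helper that fills in the next directory entry and its
 * offset.  Not compiled (#if 0).
 */
#if 0
static int
example_readdir_loop(struct vop_readdir_args *ap)
{
	struct dirent de;
	off_t off;
	int error;

	error = 0;
	off = ap->a_uio->uio_offset;
	while (example_next_entry(&de, &off) == 0) {
		error = vfs_read_dirent(ap, &de, off);
		if (error != 0) {
			if (error == ENAMETOOLONG)
				error = 0;	/* no room left; not an error */
			break;
		}
	}
	return (error);
}
#endif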
3902
3903/*
3904 * Mark the access time of the file for update if the filesystem
3905 * supports VA_MARK_ATIME.  This functionality is used by execve
3906 * and mmap, so we want to avoid the synchronous I/O implied by
3907 * directly setting va_atime for the sake of efficiency.
3908 */
3909void
3910vfs_mark_atime(struct vnode *vp, struct thread *td)
3911{
3912	struct vattr atimeattr;
3913
3914	if ((vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) {
3915		VATTR_NULL(&atimeattr);
3916		atimeattr.va_vaflags |= VA_MARK_ATIME;
3917		(void)VOP_SETATTR(vp, &atimeattr, td->td_ucred, td);
3918	}
3919}
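
/*
 * Illustrative sketch: a hypothetical mapping or exec path marking the
 * access time of the backing vnode via the cheap VA_MARK_ATIME mechanism
 * described above.  The example_* name is invented and the block is not
 * compiled (#if 0).
 */
#if 0
static void
example_map_backing_file(struct vnode *vp, struct thread *td)
{

	/* ... establish the mapping ... */
	vfs_mark_atime(vp, td);
}
#endif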
3920