/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 */

/*
 * External virtual filesystem routines
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/vfs_subr.c 148167 2005-07-20 01:43:27Z jeff $");

#include "opt_ddb.h"
#include "opt_mac.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/extattr.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/mac.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/reboot.h>
#include <sys/sleepqueue.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/stdarg.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>
#include <vm/uma.h>

static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

static void	delmntque(struct vnode *vp);
static void	insmntque(struct vnode *vp, struct mount *mp);
static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
		    int slpflag, int slptimeo);
static void	syncer_shutdown(void *arg, int howto);
static int	vtryrecycle(struct vnode *vp);
static void	vbusy(struct vnode *vp);
static void	vdropl(struct vnode *vp);
static void	vinactive(struct vnode *, struct thread *);
static void	v_incr_usecount(struct vnode *);
static void	v_decr_usecount(struct vnode *);
static void	v_decr_useonly(struct vnode *);
static void	vfree(struct vnode *);
static void	vnlru_free(int);
static void	vdestroy(struct vnode *);
static void	vgonel(struct vnode *);
static void	vfs_knllock(void *arg);
static void	vfs_knlunlock(void *arg);
static int	vfs_knllocked(void *arg);


/*
 * Enable Giant pushdown based on whether or not the vm is mpsafe in this
 * build.  Without mpsafevm the buffer cache can not run Giant free.
 */
#if defined(__alpha__) || defined(__amd64__) || defined(__i386__)
int mpsafe_vfs = 1;
#else
int mpsafe_vfs;
#endif
TUNABLE_INT("debug.mpsafevfs", &mpsafe_vfs);
SYSCTL_INT(_debug, OID_AUTO, mpsafevfs, CTLFLAG_RD, &mpsafe_vfs, 0,
    "MPSAFE VFS");

/*
 * Number of vnodes in existence.  Increased whenever getnewvnode()
 * allocates a new vnode, never decreased.
 */
static unsigned long	numvnodes;

SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");

/*
 * Conversion tables for conversion from vnode types to inode formats
 * and back.
 */
enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

/*
 * List of vnodes that are ready for recycling.
 */
static TAILQ_HEAD(freelst, vnode) vnode_free_list;

/*
 * Free vnode target.  Free vnodes may simply be files which have been stat'd
 * but not read.  This is somewhat common, and a small cache of such files
 * should be kept to avoid recreation costs.
 */
static u_long wantfreevnodes;
SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
/* Number of vnodes in the free list. */
static u_long freevnodes;
SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");

/*
 * Various variables used for debugging the new implementation of
 * reassignbuf().
 * XXX these are probably of (very) limited utility now.
 */
static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");

/*
 * Cache for the mount type id assigned to NFS.  This is used for
 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
 */
int	nfs_mount_type = -1;

/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;

/*
 * Lock for any access to the following:
 *	vnode_free_list
 *	numvnodes
 *	freevnodes
 */
static struct mtx vnode_free_list_mtx;

/* Publicly exported FS */
struct nfs_public nfs_pub;

/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static uma_zone_t vnode_zone;
static uma_zone_t vnodepoll_zone;

/* Set to 1 to print out reclaim of active vnodes */
int	prtactive;

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, mounted block devices
 * are delayed only about half the time that file data is delayed.
 * Similarly, directory updates are more critical, so are only delayed
 * about a third the time that file data is delayed.  Thus, there are
 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 * one each second (driven off the filesystem syncer process).  The
 * syncer_delayno variable indicates the next queue that is to be processed.
 * Items that need to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */
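/*
 * An illustrative sketch of the indexing (example numbers, not from the
 * original source): with the default SYNCER_MAXDELAY of 32 the pending
 * array is a power-of-two ring and syncer_mask is 31, so a request
 * delayed fifteen seconds while syncer_delayno is 20 lands in slot
 * (20 + 15) & 31 == 3, wrapping around the ring rather than running
 * past its end.
 */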
static int syncer_delayno;
static long syncer_mask;
LIST_HEAD(synclist, bufobj);
static struct synclist *syncer_workitem_pending;
/*
 * The sync_mtx protects:
 *	bo->bo_synclist
 *	sync_vnode_count
 *	syncer_delayno
 *	syncer_state
 *	syncer_workitem_pending
 *	syncer_worklist_len
 *	rushjob
 */
static struct mtx sync_mtx;

#define SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
static int syncdelay = 30;		/* max time to delay syncing data */
static int filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
static int dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
static int metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
static int rushjob;		/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");

/*
 * When shutting down the syncer, run it at four times normal speed.
 */
#define SYNCER_SHUTDOWN_SPEEDUP		4
static int sync_vnode_count;
static int syncer_worklist_len;
static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
    syncer_state;

/*
 * Number of vnodes we want to exist at any one time.  This is mostly used
 * to size hash tables in vnode-related code.  It is normally not used in
 * getnewvnode(), as wantfreevnodes is normally nonzero.
 *
 * XXX desiredvnodes is historical cruft and should not exist.
 */
int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
    &desiredvnodes, 0, "Maximum number of vnodes");
SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
    &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
static int vnlru_nowhere;
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");

/* Hook for calling soft updates. */
int (*softdep_process_worklist_hook)(struct mount *);

/*
 * Macros to control when a vnode is freed and recycled.  All require
 * the vnode interlock.
 */
#define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
#define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
#define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)


/*
 * Initialize the vnode management data structures.
 */
#ifndef	MAXVNODES_MAX
#define	MAXVNODES_MAX	100000
#endif
static void
vntblinit(void *dummy __unused)
{

	/*
	 * Desiredvnodes is a function of the physical memory size and
	 * the kernel's heap size.  Specifically, desiredvnodes scales
	 * in proportion to the physical memory size until two fifths
	 * of the kernel's heap size is consumed by vnodes and vm
	 * objects.
	 */
	desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size /
	    (5 * (sizeof(struct vm_object) + sizeof(struct vnode))));
	if (desiredvnodes > MAXVNODES_MAX) {
		if (bootverbose)
			printf("Reducing kern.maxvnodes %d -> %d\n",
			    desiredvnodes, MAXVNODES_MAX);
		desiredvnodes = MAXVNODES_MAX;
	}
	wantfreevnodes = desiredvnodes / 4;
	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
	TAILQ_INIT(&vnode_free_list);
	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)
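/*
 * A worked example of the sizing above (hypothetical figures, not from
 * the original source): on a machine with 1GB of RAM (262144 4K pages)
 * and a maxproc of 6164, the first term evaluates to
 * 6164 + 262144 / 4 = 71700 vnodes; the second term caps the result so
 * that vnodes and vm objects together consume no more than two fifths
 * of vm_kmem_size.  The smaller of the two values wins.
 */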
/*
 * Mark a mount point as busy.  Used to synchronize access and to delay
 * unmounting.  Interlock is not released on failure.
 */
int
vfs_busy(mp, flags, interlkp, td)
	struct mount *mp;
	int flags;
	struct mtx *interlkp;
	struct thread *td;
{
	int lkflags;

	MNT_ILOCK(mp);
	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & LK_NOWAIT) {
			MNT_IUNLOCK(mp);
			return (ENOENT);
		}
		if (interlkp)
			mtx_unlock(interlkp);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		msleep(mp, MNT_MTX(mp), PVFS|PDROP, "vfs_busy", 0);
		if (interlkp)
			mtx_lock(interlkp);
		return (ENOENT);
	}
	if (interlkp)
		mtx_unlock(interlkp);
	lkflags = LK_SHARED | LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, MNT_MTX(mp), td))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(mp, td)
	struct mount *mp;
	struct thread *td;
{

	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
}

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	struct mount *mp;

	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			mtx_unlock(&mountlist_mtx);
			return (mp);
		}
	}
	mtx_unlock(&mountlist_mtx);
	return ((struct mount *) 0);
}

/*
 * Check if a user can access privileged mount options.
 */
int
vfs_suser(struct mount *mp, struct thread *td)
{
	int error;

	if ((mp->mnt_flag & MNT_USER) == 0 ||
	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
		if ((error = suser(td)) != 0)
			return (error);
	}
	return (0);
}
/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
 * support 16-bit device numbers.  We end up with unique val[0]'s for the
 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 *
 * Keep in mind that several mounts may be running in parallel.  Starting
 * the search one past where the previous search terminated is both a
 * micro-optimization and a defense against returning the same fsid to
 * different mounts.
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_int16_t mntid_base;
	fsid_t tfsid;
	int mtype;

	mtx_lock(&mntid_mtx);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		tfsid.val[0] = makedev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if (vfs_getvfs(&tfsid) == NULL)
			break;
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	mtx_unlock(&mntid_mtx);
}

/*
 * Knob to control the precision of file timestamps:
 *
 * 0 = seconds only; nanoseconds zeroed.
 * 1 = seconds and nanoseconds, accurate within 1/HZ.
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_SEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(tsp)
	struct timespec *tsp;
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}
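/*
 * For instance (an illustrative usage note), an administrator who wants
 * full nanosecond-resolution timestamps can raise the knob at runtime:
 *
 *	sysctl vfs.timestamp_precision=3
 *
 * after which vfs_timestamp() above resolves through nanotime().
 */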
/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
	struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_birthtime.tv_sec = VNOVAL;
	vap->va_birthtime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * This routine is called when we have too many vnodes.  It attempts
 * to free <count> vnodes and will potentially free vnodes that still
 * have VM backing store (VM backing store is typically the cause
 * of a vnode blowout so we want to do this).  Therefore, this operation
 * is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed.
 * The buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.  It is not
 * desirable to reuse such vnodes.  These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 */
static int
vlrureclaim(struct mount *mp)
{
	struct thread *td;
	struct vnode *vp;
	int done;
	int trigger;
	int usevnodes;
	int count;

	/*
	 * Calculate the trigger point, don't allow user
	 * screwups to blow us up.  This prevents us from
	 * recycling vnodes with lots of resident pages.  We
	 * aren't trying to free memory, we are trying to
	 * free vnodes.
	 */
	usevnodes = desiredvnodes;
	if (usevnodes <= 0)
		usevnodes = 1;
	trigger = cnt.v_page_count * 2 / usevnodes;
	done = 0;
	td = curthread;
	vn_start_write(NULL, &mp, V_WAIT);
	MNT_ILOCK(mp);
	count = mp->mnt_nvnodelistsize / 10 + 1;
	while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) {
		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		--count;
		if (!VI_TRYLOCK(vp))
			continue;
		/*
		 * If it's been deconstructed already, it's still
		 * referenced, or it exceeds the trigger, skip it.
		 */
		if ((vp->v_iflag & VI_DOOMED) != 0 || vp->v_usecount ||
		    !LIST_EMPTY(&(vp)->v_cache_src) || (vp->v_object != NULL &&
		    vp->v_object->resident_page_count > trigger)) {
			VI_UNLOCK(vp);
			continue;
		}
		MNT_IUNLOCK(mp);
		vholdl(vp);
		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE, td)) {
			vdrop(vp);
			MNT_ILOCK(mp);
			continue;
		}
		VI_LOCK(vp);
		vgonel(vp);
		VOP_UNLOCK(vp, 0, td);
		vdropl(vp);
		done++;
		MNT_ILOCK(mp);
	}
	MNT_IUNLOCK(mp);
	vn_finished_write(mp);
	return done;
}
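/*
 * A worked example of the trigger (hypothetical figures): with
 * desiredvnodes at 100000 on a machine with 262144 physical pages,
 * trigger = 262144 * 2 / 100000 = 5, so any vnode whose VM object has
 * more than five resident pages is skipped by the loop above.
 */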
/*
 * Attempt to keep the free list at wantfreevnodes length.
 */
static void
vnlru_free(int count)
{
	struct vnode *vp;

	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
	for (; count > 0; count--) {
		vp = TAILQ_FIRST(&vnode_free_list);
		/*
		 * The list can be modified while the free_list_mtx
		 * has been dropped and vp could be NULL here.
		 */
		if (!vp)
			break;
		VNASSERT(vp->v_op != NULL, vp,
		    ("vnlru_free: vnode already reclaimed."));
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		/*
		 * Don't recycle if we can't get the interlock.
		 */
		if (!VI_TRYLOCK(vp)) {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			continue;
		}
		VNASSERT(VCANRECYCLE(vp), vp,
		    ("vp inconsistent on freelist"));
		freevnodes--;
		vp->v_iflag &= ~VI_FREE;
		vholdl(vp);
		mtx_unlock(&vnode_free_list_mtx);
		VI_UNLOCK(vp);
		vtryrecycle(vp);
		/*
		 * If the recycle succeeded this vdrop will actually free
		 * the vnode.  If not it will simply place it back on
		 * the free list.
		 */
		vdrop(vp);
		mtx_lock(&vnode_free_list_mtx);
	}
}
/*
 * Attempt to recycle vnodes in a context that is always safe to block.
 * Calling vlrureclaim() from the bowels of filesystem code has some
 * interesting deadlock problems.
 */
static struct proc *vnlruproc;
static int vnlruproc_sig;

static void
vnlru_proc(void)
{
	struct mount *mp, *nmp;
	int done;
	struct proc *p = vnlruproc;
	struct thread *td = FIRST_THREAD_IN_PROC(p);

	mtx_lock(&Giant);

	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
	    SHUTDOWN_PRI_FIRST);

	for (;;) {
		kthread_suspend_check(p);
		mtx_lock(&vnode_free_list_mtx);
		if (freevnodes > wantfreevnodes)
			vnlru_free(freevnodes - wantfreevnodes);
		if (numvnodes <= desiredvnodes * 9 / 10) {
			vnlruproc_sig = 0;
			wakeup(&vnlruproc_sig);
			msleep(vnlruproc, &vnode_free_list_mtx,
			    PVFS|PDROP, "vlruwt", hz);
			continue;
		}
		mtx_unlock(&vnode_free_list_mtx);
		done = 0;
		mtx_lock(&mountlist_mtx);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			int vfsunlocked;
			if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
				nmp = TAILQ_NEXT(mp, mnt_list);
				continue;
			}
			if (!VFS_NEEDSGIANT(mp)) {
				mtx_unlock(&Giant);
				vfsunlocked = 1;
			} else
				vfsunlocked = 0;
			done += vlrureclaim(mp);
			if (vfsunlocked)
				mtx_lock(&Giant);
			mtx_lock(&mountlist_mtx);
			nmp = TAILQ_NEXT(mp, mnt_list);
			vfs_unbusy(mp, td);
		}
		mtx_unlock(&mountlist_mtx);
		if (done == 0) {
#if 0
			/* These messages are temporary debugging aids */
			if (vnlru_nowhere < 5)
				printf("vnlru process getting nowhere..\n");
			else if (vnlru_nowhere == 5)
				printf("vnlru process messages stopped.\n");
#endif
			vnlru_nowhere++;
			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
		} else
			uio_yield();
	}
}

static struct kproc_desc vnlru_kp = {
	"vnlru",
	vnlru_proc,
	&vnlruproc
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp)

/*
 * Routines having to do with the management of the vnode table.
 */

static void
vdestroy(struct vnode *vp)
{
	struct bufobj *bo;

	CTR1(KTR_VFS, "vdestroy vp %p", vp);
	mtx_lock(&vnode_free_list_mtx);
	numvnodes--;
	mtx_unlock(&vnode_free_list_mtx);
	bo = &vp->v_bufobj;
	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
	    ("cleaned vnode still on the free list."));
	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
	VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL"));
	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
	VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL"));
	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
#ifdef MAC
	mac_destroy_vnode(vp);
#endif
	if (vp->v_pollinfo != NULL) {
		knlist_destroy(&vp->v_pollinfo->vpi_selinfo.si_note);
		mtx_destroy(&vp->v_pollinfo->vpi_lock);
		uma_zfree(vnodepoll_zone, vp->v_pollinfo);
	}
#ifdef INVARIANTS
	/* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */
	vp->v_op = NULL;
#endif
	lockdestroy(vp->v_vnlock);
	mtx_destroy(&vp->v_interlock);
	uma_zfree(vnode_zone, vp);
}

/*
 * Try to recycle a freed vnode.  We abort if anyone picks up a reference
 * before we actually vgone().  This function must be called with the vnode
 * held to prevent the vnode from being returned to the free list midway
 * through vgone().
 */
static int
vtryrecycle(struct vnode *vp)
{
	struct thread *td = curthread;
	struct mount *vnmp;

	CTR1(KTR_VFS, "vtryrecycle: trying vp %p", vp);
	VNASSERT(vp->v_holdcnt, vp,
	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
	/*
	 * This vnode may be found and locked via some other list, if so we
	 * can't recycle it yet.
	 */
	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0)
		return (EWOULDBLOCK);
	/*
	 * Don't recycle if its filesystem is being suspended.
	 */
	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
		VOP_UNLOCK(vp, 0, td);
		return (EBUSY);
	}
	/*
	 * If we got this far, we need to acquire the interlock and see if
	 * anyone picked up this vnode from another list.  If not, we will
	 * mark it with DOOMED via vgonel() so that anyone who does find it
	 * will skip over it.
	 */
	VI_LOCK(vp);
	if (vp->v_usecount) {
		VOP_UNLOCK(vp, LK_INTERLOCK, td);
		vn_finished_write(vnmp);
		return (EBUSY);
	}
	if ((vp->v_iflag & VI_DOOMED) == 0)
		vgonel(vp);
	VOP_UNLOCK(vp, LK_INTERLOCK, td);
	vn_finished_write(vnmp);
	CTR1(KTR_VFS, "vtryrecycle: recycled vp %p", vp);
	return (0);
}

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	const char *tag;
	struct mount *mp;
	struct vop_vector *vops;
	struct vnode **vpp;
{
	struct vnode *vp = NULL;
	struct bufobj *bo;

	mtx_lock(&vnode_free_list_mtx);
	/*
	 * Lend our context to reclaim vnodes if they've exceeded the max.
	 */
	if (freevnodes > wantfreevnodes)
		vnlru_free(1);
	/*
	 * Wait for available vnodes.
	 */
	if (numvnodes > desiredvnodes) {
		if (vnlruproc_sig == 0) {
			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
			wakeup(vnlruproc);
		}
		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
		    "vlruwk", hz);
#if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
		if (numvnodes > desiredvnodes) {
			mtx_unlock(&vnode_free_list_mtx);
			return (ENFILE);
		}
#endif
	}
	numvnodes++;
	mtx_unlock(&vnode_free_list_mtx);
	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
	/*
	 * Setup locks.
	 */
	vp->v_vnlock = &vp->v_lock;
	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
	/*
	 * By default, don't allow shared locks unless filesystems
	 * opt-in.
	 */
	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE);
	/*
	 * Initialize bufobj.
	 */
	bo = &vp->v_bufobj;
	bo->__bo_vnode = vp;
	bo->bo_mtx = &vp->v_interlock;
	bo->bo_ops = &buf_ops_bio;
	bo->bo_private = vp;
	TAILQ_INIT(&bo->bo_clean.bv_hd);
	TAILQ_INIT(&bo->bo_dirty.bv_hd);
	/*
	 * Initialize namecache.
	 */
	LIST_INIT(&vp->v_cache_src);
	TAILQ_INIT(&vp->v_cache_dst);
	/*
	 * Finalize various vnode identity bits.
	 */
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	v_incr_usecount(vp);
	vp->v_data = 0;
#ifdef MAC
	mac_init_vnode(vp);
	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
		mac_associate_vnode_singlelabel(mp, vp);
	else if (mp == NULL)
		printf("NULL mp in getnewvnode()\n");
#endif
	delmntque(vp);
	if (mp != NULL) {
		insmntque(vp, mp);
		bo->bo_bsize = mp->mnt_stat.f_iosize;
	}

	CTR2(KTR_VFS, "getnewvnode: mp %p vp %p", mp, vp);
	*vpp = vp;
	return (0);
}

/*
 * Delete from old mount point vnode list, if on one.
 */
static void
delmntque(struct vnode *vp)
{
	struct mount *mp;

	if (vp->v_mount == NULL)
		return;
	mp = vp->v_mount;
	MNT_ILOCK(mp);
	vp->v_mount = NULL;
	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
		("bad mount point vnode list size"));
	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	mp->mnt_nvnodelistsize--;
	MNT_IUNLOCK(mp);
}
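/*
 * A filesystem's VFS_VGET implementation typically obtains a fresh
 * vnode from getnewvnode() along these lines (sketch only; the tag,
 * vop vector, and inode field names are illustrative, not from this
 * file):
 *
 *	error = getnewvnode("myfs", mp, &myfs_vnodeops, &vp);
 *	if (error)
 *		return (error);
 *	vp->v_data = ip;
 *	vp->v_type = IFTOVT(ip->i_mode);
 */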
/*
 * Insert into list of vnodes for the new mount point, if available.
 */
static void
insmntque(struct vnode *vp, struct mount *mp)
{

	vp->v_mount = mp;
	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
	MNT_ILOCK(vp->v_mount);
	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	mp->mnt_nvnodelistsize++;
	MNT_IUNLOCK(vp->v_mount);
}

/*
 * Flush out and invalidate all buffers associated with a bufobj
 * Called with the underlying object locked.
 */
int
bufobj_invalbuf(struct bufobj *bo, int flags, struct thread *td, int slpflag, int slptimeo)
{
	int error;

	BO_LOCK(bo);
	if (flags & V_SAVE) {
		error = bufobj_wwait(bo, slpflag, slptimeo);
		if (error) {
			BO_UNLOCK(bo);
			return (error);
		}
		if (bo->bo_dirty.bv_cnt > 0) {
			BO_UNLOCK(bo);
			if ((error = BO_SYNC(bo, MNT_WAIT, td)) != 0)
				return (error);
			/*
			 * XXX We could save a lock/unlock if this was only
			 * enabled under INVARIANTS
			 */
			BO_LOCK(bo);
			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
				panic("vinvalbuf: dirty bufs");
		}
	}
	/*
	 * If you alter this loop please notice that interlock is dropped and
	 * reacquired in flushbuflist.  Special care is needed to ensure that
	 * no race conditions occur from this.
	 */
	do {
		error = flushbuflist(&bo->bo_clean,
		    flags, bo, slpflag, slptimeo);
		if (error == 0)
			error = flushbuflist(&bo->bo_dirty,
			    flags, bo, slpflag, slptimeo);
		if (error != 0 && error != EAGAIN) {
			BO_UNLOCK(bo);
			return (error);
		}
	} while (error != 0);

	/*
	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
	 * have write I/O in-progress but if there is a VM object then the
	 * VM object can also have read-I/O in-progress.
	 */
	do {
		bufobj_wwait(bo, 0, 0);
		BO_UNLOCK(bo);
		if (bo->bo_object != NULL) {
			VM_OBJECT_LOCK(bo->bo_object);
			vm_object_pip_wait(bo->bo_object, "bovlbx");
			VM_OBJECT_UNLOCK(bo->bo_object);
		}
		BO_LOCK(bo);
	} while (bo->bo_numoutput > 0);
	BO_UNLOCK(bo);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	if (bo->bo_object != NULL) {
		VM_OBJECT_LOCK(bo->bo_object);
		vm_object_page_remove(bo->bo_object, 0, 0,
		    (flags & V_SAVE) ? TRUE : FALSE);
		VM_OBJECT_UNLOCK(bo->bo_object);
	}

#ifdef INVARIANTS
	BO_LOCK(bo);
	if ((flags & (V_ALT | V_NORMAL)) == 0 &&
	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
		panic("vinvalbuf: flush failed");
	BO_UNLOCK(bo);
#endif
	return (0);
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
vinvalbuf(struct vnode *vp, int flags, struct thread *td, int slpflag, int slptimeo)
{

	CTR2(KTR_VFS, "vinvalbuf vp %p flags %d", vp, flags);
	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
	return (bufobj_invalbuf(&vp->v_bufobj, flags, td, slpflag, slptimeo));
}

/*
 * Flush out buffers on the specified list.
 */
static int
flushbuflist(bufv, flags, bo, slpflag, slptimeo)
	struct bufv *bufv;
	int flags;
	struct bufobj *bo;
	int slpflag, slptimeo;
{
	struct buf *bp, *nbp;
	int retval, error;

	ASSERT_BO_LOCKED(bo);

	retval = 0;
	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
			continue;
		}
		retval = EAGAIN;
		error = BUF_TIMELOCK(bp,
		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo),
		    "flushbuf", slpflag, slptimeo);
		if (error) {
			BO_LOCK(bo);
			return (error != ENOLCK ? error : EAGAIN);
		}
		KASSERT(bp->b_bufobj == bo,
		    ("bp %p wrong b_bufobj %p should be %p",
		    bp, bp->b_bufobj, bo));
		if (bp->b_bufobj != bo) {	/* XXX: necessary ? */
			BUF_UNLOCK(bp);
			BO_LOCK(bo);
			return (EAGAIN);
		}
		/*
		 * XXX Since there are no node locks for NFS, I
		 * believe there is a slight chance that a delayed
		 * write will occur while sleeping just above, so
		 * check for it.
		 */
		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
		    (flags & V_SAVE)) {
			bremfree(bp);
			bp->b_flags |= B_ASYNC;
			bwrite(bp);
			BO_LOCK(bo);
			return (EAGAIN);	/* XXX: why not loop ? */
		}
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
		bp->b_flags &= ~B_ASYNC;
		brelse(bp);
		BO_LOCK(bo);
	}
	return (retval);
}
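/*
 * The flags honored above mirror vinvalbuf()'s: a caller that must not
 * lose data passes V_SAVE so dirty buffers are written before being
 * discarded, e.g. (illustrative call; the usual slpflag/slptimeo of
 * zero means sleep uninterruptibly with no timeout):
 *
 *	error = vinvalbuf(vp, V_SAVE, td, 0, 0);
 *
 * while V_NORMAL and V_ALT restrict the flush to regular or alternate
 * (BX_ALTDATA) buffers respectively.
 */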
/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 */
int
vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td, off_t length, int blksize)
{
	struct buf *bp, *nbp;
	int anyfreed;
	int trunclbn;
	struct bufobj *bo;

	CTR2(KTR_VFS, "vtruncbuf vp %p length %jd", vp, length);
	/*
	 * Round up to the *next* lbn.
	 */
	trunclbn = (length + blksize - 1) / blksize;

	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
restart:
	VI_LOCK(vp);
	bo = &vp->v_bufobj;
	anyfreed = 1;
	for (;anyfreed;) {
		anyfreed = 0;
		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
			if (bp->b_lblkno < trunclbn)
				continue;
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    VI_MTX(vp)) == ENOLCK)
				goto restart;

			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
			anyfreed = 1;

			if (nbp != NULL &&
			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
			    (nbp->b_vp != vp) ||
			    (nbp->b_flags & B_DELWRI))) {
				goto restart;
			}
			VI_LOCK(vp);
		}

		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
			if (bp->b_lblkno < trunclbn)
				continue;
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    VI_MTX(vp)) == ENOLCK)
				goto restart;
			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
			anyfreed = 1;
			if (nbp != NULL &&
			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
			    (nbp->b_vp != vp) ||
			    (nbp->b_flags & B_DELWRI) == 0)) {
				goto restart;
			}
			VI_LOCK(vp);
		}
	}

	if (length > 0) {
restartsync:
		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
			if (bp->b_lblkno > 0)
				continue;
			/*
			 * Since we hold the vnode lock this should only
			 * fail if we're racing with the buf daemon.
			 */
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    VI_MTX(vp)) == ENOLCK) {
				goto restart;
			}
			VNASSERT((bp->b_flags & B_DELWRI), vp,
			    ("buf(%p) on dirty queue without DELWRI", bp));

			bremfree(bp);
			bawrite(bp);
			VI_LOCK(vp);
			goto restartsync;
		}
	}

	bufobj_wwait(bo, 0, 0);
	VI_UNLOCK(vp);
	vnode_pager_setsize(vp, length);

	return (0);
}

/*
 * buf_splay() - splay tree core for the clean/dirty list of buffers in
 * a vnode.
 *
 * NOTE: We have to deal with the special case of a background bitmap
 * buffer, a situation where two buffers will have the same logical
 * block offset.  We want (1) only the foreground buffer to be accessed
 * in a lookup and (2) must differentiate between the foreground and
 * background buffer in the splay tree algorithm because the splay
 * tree cannot normally handle multiple entities with the same 'index'.
 * We accomplish this by adding differentiating flags to the splay tree's
 * numerical domain.
 */
static struct buf *
buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
{
	struct buf dummy;
	struct buf *lefttreemax, *righttreemin, *y;

	if (root == NULL)
		return (NULL);
	lefttreemax = righttreemin = &dummy;
	for (;;) {
		if (lblkno < root->b_lblkno ||
		    (lblkno == root->b_lblkno &&
		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
			if ((y = root->b_left) == NULL)
				break;
			if (lblkno < y->b_lblkno) {
				/* Rotate right. */
				root->b_left = y->b_right;
				y->b_right = root;
				root = y;
				if ((y = root->b_left) == NULL)
					break;
			}
			/* Link into the new root's right tree. */
			righttreemin->b_left = root;
			righttreemin = root;
		} else if (lblkno > root->b_lblkno ||
		    (lblkno == root->b_lblkno &&
		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
			if ((y = root->b_right) == NULL)
				break;
			if (lblkno > y->b_lblkno) {
				/* Rotate left. */
				root->b_right = y->b_left;
				y->b_left = root;
				root = y;
				if ((y = root->b_right) == NULL)
					break;
			}
			/* Link into the new root's left tree. */
			lefttreemax->b_right = root;
			lefttreemax = root;
		} else {
			break;
		}
		root = y;
	}
	/* Assemble the new root. */
	lefttreemax->b_right = root->b_left;
	righttreemin->b_left = root->b_right;
	root->b_left = dummy.b_right;
	root->b_right = dummy.b_left;
	return (root);
}

static void
buf_vlist_remove(struct buf *bp)
{
	struct buf *root;
	struct bufv *bv;

	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
	ASSERT_BO_LOCKED(bp->b_bufobj);
	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
	    (BX_VNDIRTY|BX_VNCLEAN),
	    ("buf_vlist_remove: Buf %p is on two lists", bp));
	if (bp->b_xflags & BX_VNDIRTY)
		bv = &bp->b_bufobj->bo_dirty;
	else
		bv = &bp->b_bufobj->bo_clean;
	if (bp != bv->bv_root) {
		root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
		KASSERT(root == bp, ("splay lookup failed in remove"));
	}
	if (bp->b_left == NULL) {
		root = bp->b_right;
	} else {
		root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
		root->b_right = bp->b_right;
	}
	bv->bv_root = root;
	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
	bv->bv_cnt--;
	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
}

/*
 * Add the buffer to the sorted clean or dirty block list using a
 * splay tree algorithm.
 *
 * NOTE: xflags is passed as a constant, optimizing this inline function!
 */
static void
buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
{
	struct buf *root;
	struct bufv *bv;

	ASSERT_BO_LOCKED(bo);
	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
	bp->b_xflags |= xflags;
	if (xflags & BX_VNDIRTY)
		bv = &bo->bo_dirty;
	else
		bv = &bo->bo_clean;

	root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
	if (root == NULL) {
		bp->b_left = NULL;
		bp->b_right = NULL;
		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
	} else if (bp->b_lblkno < root->b_lblkno ||
	    (bp->b_lblkno == root->b_lblkno &&
	    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
		bp->b_left = root->b_left;
		bp->b_right = root;
		root->b_left = NULL;
		TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
	} else {
		bp->b_right = root->b_right;
		bp->b_left = root;
		root->b_right = NULL;
		TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs);
	}
	bv->bv_cnt++;
	bv->bv_root = bp;
}
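/*
 * An informal note on why this pays off: buf_splay() is a top-down
 * splay, so every lookup rotates the requested block to the root.
 * Sequential I/O therefore tends to find its next buffer at or near
 * the root, which is what makes the cheap root checks in gbincore()
 * below profitable, e.g. (sketch of the lookup step):
 *
 *	bv->bv_root = buf_splay(lblkno, 0, bv->bv_root);
 *	if (bv->bv_root->b_lblkno == lblkno)
 *		... hit at the new root ...
 */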
/*
 * Lookup a buffer using the splay tree.  Note that we specifically avoid
 * shadow buffers used in background bitmap writes.
 *
 * This code isn't quite as efficient as it could be because we are
 * maintaining two sorted lists and do not know which list the block
 * resides in.
 *
 * During a "make buildworld" the desired buffer is found at one of
 * the roots more than 60% of the time.  Thus, checking both roots
 * before performing either splay eliminates unnecessary splays on the
 * first tree splayed.
 */
struct buf *
gbincore(struct bufobj *bo, daddr_t lblkno)
{
	struct buf *bp;

	ASSERT_BO_LOCKED(bo);
	if ((bp = bo->bo_clean.bv_root) != NULL &&
	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
		return (bp);
	if ((bp = bo->bo_dirty.bv_root) != NULL &&
	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
		return (bp);
	if ((bp = bo->bo_clean.bv_root) != NULL) {
		bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp);
		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
			return (bp);
	}
	if ((bp = bo->bo_dirty.bv_root) != NULL) {
		bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp);
		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
			return (bp);
	}
	return (NULL);
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(struct vnode *vp, struct buf *bp)
{

	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));

	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
	    ("bgetvp: bp already attached! %p", bp));

	ASSERT_VI_LOCKED(vp, "bgetvp");
	vholdl(vp);
	bp->b_vp = vp;
	bp->b_bufobj = &vp->v_bufobj;
	/*
	 * Insert onto list for new vnode.
	 */
	buf_vlist_add(bp, &vp->v_bufobj, BX_VNCLEAN);
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(struct buf *bp)
{
	struct bufobj *bo;
	struct vnode *vp;

	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;		/* XXX */
	bo = bp->b_bufobj;
	BO_LOCK(bo);
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
		buf_vlist_remove(bp);
	else
		panic("brelvp: Buffer %p not on queue.", bp);
	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
		bo->bo_flag &= ~BO_ONWORKLST;
		mtx_lock(&sync_mtx);
		LIST_REMOVE(bo, bo_synclist);
		syncer_worklist_len--;
		mtx_unlock(&sync_mtx);
	}
	bp->b_vp = NULL;
	bp->b_bufobj = NULL;
	vdropl(vp);
}
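/*
 * Life-cycle note (informal): every buffer attached with bgetvp() must
 * eventually pass through brelvp(), and the vholdl()/vdropl() pair in
 * those routines keeps the vnode's hold count balanced across the
 * association, e.g. (sketch; bgetvp() asserts the interlock is held):
 *
 *	VI_LOCK(vp);
 *	bgetvp(vp, bp);		hold count rises
 *	VI_UNLOCK(vp);
 *	...
 *	brelvp(bp);		hold count drops
 */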
/*
 * Add an item to the syncer work queue.
 */
static void
vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
{
	int slot;

	ASSERT_BO_LOCKED(bo);

	mtx_lock(&sync_mtx);
	if (bo->bo_flag & BO_ONWORKLST)
		LIST_REMOVE(bo, bo_synclist);
	else {
		bo->bo_flag |= BO_ONWORKLST;
		syncer_worklist_len++;
	}

	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	slot = (syncer_delayno + delay) & syncer_mask;

	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
	mtx_unlock(&sync_mtx);
}

static int
sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
{
	int error, len;

	mtx_lock(&sync_mtx);
	len = syncer_worklist_len - sync_vnode_count;
	mtx_unlock(&sync_mtx);
	error = SYSCTL_OUT(req, &len, sizeof(len));
	return (error);
}

SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
    sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");

struct proc *updateproc;
static void sched_sync(void);
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)

static int
sync_vnode(struct bufobj *bo, struct thread *td)
{
	struct vnode *vp;
	struct mount *mp;

	vp = bo->__bo_vnode;	/* XXX */
	if (VOP_ISLOCKED(vp, NULL) != 0)
		return (1);
	if (VI_TRYLOCK(vp) == 0)
		return (1);
	/*
	 * We use vhold in case the vnode does not
	 * successfully sync.  vhold prevents the vnode from
	 * going away when we unlock the sync_mtx so that
	 * we can acquire the vnode interlock.
	 */
	vholdl(vp);
	mtx_unlock(&sync_mtx);
	VI_UNLOCK(vp);
	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
		vdrop(vp);
		mtx_lock(&sync_mtx);
		return (1);
	}
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	(void) VOP_FSYNC(vp, MNT_LAZY, td);
	VOP_UNLOCK(vp, 0, td);
	vn_finished_write(mp);
	VI_LOCK(vp);
	if ((bo->bo_flag & BO_ONWORKLST) != 0) {
		/*
		 * Put us back on the worklist.  The worklist
		 * routine will remove us from our current
		 * position and then add us back in at a later
		 * position.
		 */
		vn_syncer_add_to_worklist(bo, syncdelay);
	}
	vdropl(vp);
	mtx_lock(&sync_mtx);
	return (0);
}

/*
 * System filesystem synchronizer daemon.
 */
static void
sched_sync(void)
{
	struct synclist *next;
	struct synclist *slp;
	struct bufobj *bo;
	long starttime;
	struct thread *td = FIRST_THREAD_IN_PROC(updateproc);
	static int dummychan;
	int last_work_seen;
	int net_worklist_len;
	int syncer_final_iter;
	int first_printf;
	int error;

	mtx_lock(&Giant);
	last_work_seen = 0;
	syncer_final_iter = 0;
	first_printf = 1;
	syncer_state = SYNCER_RUNNING;
	starttime = time_second;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
	    SHUTDOWN_PRI_LAST);

	for (;;) {
		mtx_lock(&sync_mtx);
		if (syncer_state == SYNCER_FINAL_DELAY &&
		    syncer_final_iter == 0) {
			mtx_unlock(&sync_mtx);
			kthread_suspend_check(td->td_proc);
			mtx_lock(&sync_mtx);
		}
		net_worklist_len = syncer_worklist_len - sync_vnode_count;
		if (syncer_state != SYNCER_RUNNING &&
		    starttime != time_second) {
			if (first_printf) {
				printf("\nSyncing disks, vnodes remaining...");
				first_printf = 0;
			}
			printf("%d ", net_worklist_len);
		}
		starttime = time_second;

		/*
		 * Push files whose dirty time has expired.  Be careful
		 * of interrupt race on slp queue.
		 *
		 * Skip over empty worklist slots when shutting down.
		 */
		do {
			slp = &syncer_workitem_pending[syncer_delayno];
			syncer_delayno += 1;
			if (syncer_delayno == syncer_maxdelay)
				syncer_delayno = 0;
			next = &syncer_workitem_pending[syncer_delayno];
			/*
			 * If the worklist has wrapped since it
			 * was emptied of all but syncer vnodes,
			 * switch to the FINAL_DELAY state and run
			 * for one more second.
			 */
			if (syncer_state == SYNCER_SHUTTING_DOWN &&
			    net_worklist_len == 0 &&
			    last_work_seen == syncer_delayno) {
				syncer_state = SYNCER_FINAL_DELAY;
				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
			}
		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
		    syncer_worklist_len > 0);

		/*
		 * Keep track of the last time there was anything
		 * on the worklist other than syncer vnodes.
		 * Return to the SHUTTING_DOWN state if any
		 * new work appears.
		 */
		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
			last_work_seen = syncer_delayno;
		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
			syncer_state = SYNCER_SHUTTING_DOWN;
		while ((bo = LIST_FIRST(slp)) != NULL) {
			error = sync_vnode(bo, td);
			if (error == 1) {
				LIST_REMOVE(bo, bo_synclist);
				LIST_INSERT_HEAD(next, bo, bo_synclist);
				continue;
			}
		}
		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
			syncer_final_iter--;
		mtx_unlock(&sync_mtx);

		/*
		 * Do soft update processing.
		 */
		if (softdep_process_worklist_hook != NULL)
			(*softdep_process_worklist_hook)(NULL);

		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process.  A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP.  Currently rushjob
		 * is used by the soft update code to speed up the filesystem
		 * syncer process when the incore state is getting so far
		 * ahead of the disk that the kernel memory pool is being
		 * threatened with exhaustion.
		 */
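		/*
		 * Numerically (an illustrative note): with the default
		 * syncdelay of 30, speedup_syncer() below never raises
		 * rushjob past 15, so the syncer can run at most 15
		 * passes back-to-back before it sleeps again, on top
		 * of its normal one-slot-per-second pace.
		 */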
		mtx_lock(&sync_mtx);
		if (rushjob > 0) {
			rushjob -= 1;
			mtx_unlock(&sync_mtx);
			continue;
		}
		mtx_unlock(&sync_mtx);
		/*
		 * Just sleep for a short period of time between
		 * iterations when shutting down to allow some I/O
		 * to happen.
		 *
		 * If it has taken us less than a second to process the
		 * current work, then wait.  Otherwise start right over
		 * again.  We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (syncer_state != SYNCER_RUNNING)
			tsleep(&dummychan, PPAUSE, "syncfnl",
			    hz / SYNCER_SHUTDOWN_SPEEDUP);
		else if (time_second == starttime)
			tsleep(&lbolt, PPAUSE, "syncer", 0);
	}
}

/*
 * Request the syncer daemon to speed up its work.
 * We never push it to speed up more than half of its
 * normal turn time, otherwise it could take over the cpu.
 */
int
speedup_syncer()
{
	struct thread *td;
	int ret = 0;

	td = FIRST_THREAD_IN_PROC(updateproc);
	sleepq_remove(td, &lbolt);
	mtx_lock(&sync_mtx);
	if (rushjob < syncdelay / 2) {
		rushjob += 1;
		stat_rush_requests += 1;
		ret = 1;
	}
	mtx_unlock(&sync_mtx);
	return (ret);
}

/*
 * Tell the syncer to speed up its work and run through its work
 * list several times, then tell it to shut down.
 */
static void
syncer_shutdown(void *arg, int howto)
{
	struct thread *td;

	if (howto & RB_NOSYNC)
		return;
	td = FIRST_THREAD_IN_PROC(updateproc);
	sleepq_remove(td, &lbolt);
	mtx_lock(&sync_mtx);
	syncer_state = SYNCER_SHUTTING_DOWN;
	rushjob = 0;
	mtx_unlock(&sync_mtx);
	kproc_shutdown(arg, howto);
}
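/*
 * speedup_syncer() above is invoked from contexts such as the soft
 * updates code when dirty state should be flushed sooner; a caller can
 * tell from the return value whether a boost was actually granted
 * (sketch only):
 *
 *	if (speedup_syncer() == 0)
 *		... the syncer is already running at maximum boost ...
 */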
1751 */ 1752 if (bp->b_flags & B_DELWRI) { 1753 if ((bo->bo_flag & BO_ONWORKLST) == 0) { 1754 switch (vp->v_type) { 1755 case VDIR: 1756 delay = dirdelay; 1757 break; 1758 case VCHR: 1759 delay = metadelay; 1760 break; 1761 default: 1762 delay = filedelay; 1763 } 1764 vn_syncer_add_to_worklist(bo, delay); 1765 } 1766 buf_vlist_add(bp, bo, BX_VNDIRTY); 1767 } else { 1768 buf_vlist_add(bp, bo, BX_VNCLEAN); 1769 1770 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 1771 mtx_lock(&sync_mtx); 1772 LIST_REMOVE(bo, bo_synclist); 1773 syncer_worklist_len--; 1774 mtx_unlock(&sync_mtx); 1775 bo->bo_flag &= ~BO_ONWORKLST; 1776 } 1777 } 1778#ifdef INVARIANTS 1779 bv = &bo->bo_clean; 1780 bp = TAILQ_FIRST(&bv->bv_hd); 1781 KASSERT(bp == NULL || bp->b_bufobj == bo, 1782 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 1783 bp = TAILQ_LAST(&bv->bv_hd, buflists); 1784 KASSERT(bp == NULL || bp->b_bufobj == bo, 1785 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 1786 bv = &bo->bo_dirty; 1787 bp = TAILQ_FIRST(&bv->bv_hd); 1788 KASSERT(bp == NULL || bp->b_bufobj == bo, 1789 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 1790 bp = TAILQ_LAST(&bv->bv_hd, buflists); 1791 KASSERT(bp == NULL || bp->b_bufobj == bo, 1792 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 1793#endif 1794 VI_UNLOCK(vp); 1795} 1796 1797/* 1798 * Increment the use and hold counts on the vnode, taking care to reference 1799 * the driver's usecount if this is a chardev. The vholdl() will remove 1800 * the vnode from the free list if it is presently free. Requires the 1801 * vnode interlock and returns with it held. 1802 */ 1803static void 1804v_incr_usecount(struct vnode *vp) 1805{ 1806 1807 CTR3(KTR_VFS, "v_incr_usecount: vp %p holdcnt %d usecount %d\n", 1808 vp, vp->v_holdcnt, vp->v_usecount); 1809 vp->v_usecount++; 1810 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 1811 dev_lock(); 1812 vp->v_rdev->si_usecount++; 1813 dev_unlock(); 1814 } 1815 vholdl(vp); 1816} 1817 1818/* 1819 * Decrement the vnode use and hold count along with the driver's usecount 1820 * if this is a chardev. The vdropl() below releases the vnode interlock 1821 * as it may free the vnode. 1822 */ 1823static void 1824v_decr_usecount(struct vnode *vp) 1825{ 1826 1827 CTR3(KTR_VFS, "v_decr_usecount: vp %p holdcnt %d usecount %d\n", 1828 vp, vp->v_holdcnt, vp->v_usecount); 1829 ASSERT_VI_LOCKED(vp, __FUNCTION__); 1830 VNASSERT(vp->v_usecount > 0, vp, 1831 ("v_decr_usecount: negative usecount")); 1832 vp->v_usecount--; 1833 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 1834 dev_lock(); 1835 vp->v_rdev->si_usecount--; 1836 dev_unlock(); 1837 } 1838 vdropl(vp); 1839} 1840 1841/* 1842 * Decrement only the use count and driver use count. This is intended to 1843 * be paired with a follow on vdropl() to release the remaining hold count. 1844 * In this way we may vgone() a vnode with a 0 usecount without risk of 1845 * having it end up on a free list because the hold count is kept above 0. 
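 *
 * The pattern, as used by vrele() and vput() below, is roughly:
 *
 *	v_decr_useonly(vp);	(use count drops; hold count remains)
 *	... lock the vnode and call vinactive() ...
 *	vdropl(vp);		(releases the remaining hold)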
1846 */ 1847static void 1848v_decr_useonly(struct vnode *vp) 1849{ 1850 1851 CTR3(KTR_VFS, "v_decr_useonly: vp %p holdcnt %d usecount %d\n", 1852 vp, vp->v_holdcnt, vp->v_usecount); 1853 ASSERT_VI_LOCKED(vp, __FUNCTION__); 1854 VNASSERT(vp->v_usecount > 0, vp, 1855 ("v_decr_useonly: negative usecount")); 1856 vp->v_usecount--; 1857 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 1858 dev_lock(); 1859 vp->v_rdev->si_usecount--; 1860 dev_unlock(); 1861 } 1862} 1863 1864/* 1865 * Grab a particular vnode from the free list, increment its 1866 * reference count and lock it. The vnode lock bit is set if the 1867 * vnode is being eliminated in vgone. The process is awakened 1868 * when the transition is completed, and an error returned to 1869 * indicate that the vnode is no longer usable (possibly having 1870 * been changed to a new filesystem type). 1871 */ 1872int 1873vget(vp, flags, td) 1874 struct vnode *vp; 1875 int flags; 1876 struct thread *td; 1877{ 1878 int oweinact; 1879 int oldflags; 1880 int error; 1881 1882 error = 0; 1883 oldflags = flags; 1884 oweinact = 0; 1885 if ((flags & LK_INTERLOCK) == 0) 1886 VI_LOCK(vp); 1887 /* 1888 * If the inactive call was deferred because vput() was called 1889 * with a shared lock, we have to do it here before another thread 1890 * gets a reference to data that should be dead. 1891 */ 1892 if (vp->v_iflag & VI_OWEINACT) { 1893 if (flags & LK_NOWAIT) { 1894 VI_UNLOCK(vp); 1895 return (EBUSY); 1896 } 1897 flags &= ~LK_TYPE_MASK; 1898 flags |= LK_EXCLUSIVE; 1899 oweinact = 1; 1900 } 1901 v_incr_usecount(vp); 1902 if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) { 1903 VI_LOCK(vp); 1904 /* 1905 * must expand vrele here because we do not want 1906 * to call VOP_INACTIVE if the reference count 1907 * drops back to zero since it was never really 1908 * active. 1909 */ 1910 v_decr_usecount(vp); 1911 return (error); 1912 } 1913 if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) 1914 panic("vget: vn_lock failed to return ENOENT\n"); 1915 if (oweinact) { 1916 VI_LOCK(vp); 1917 if (vp->v_iflag & VI_OWEINACT) 1918 vinactive(vp, td); 1919 VI_UNLOCK(vp); 1920 if ((oldflags & LK_TYPE_MASK) == 0) 1921 VOP_UNLOCK(vp, 0, td); 1922 } 1923 return (0); 1924} 1925 1926/* 1927 * Increase the reference count of a vnode. 1928 */ 1929void 1930vref(struct vnode *vp) 1931{ 1932 1933 VI_LOCK(vp); 1934 v_incr_usecount(vp); 1935 VI_UNLOCK(vp); 1936} 1937 1938/* 1939 * Return reference count of a vnode. 1940 * 1941 * The results of this call are only guaranteed when some mechanism other 1942 * than the VI lock is used to stop other processes from gaining references 1943 * to the vnode. This may be the case if the caller holds the only reference. 1944 * This is also useful when stale data is acceptable as race conditions may 1945 * be accounted for by some other means. 1946 */ 1947int 1948vrefcnt(struct vnode *vp) 1949{ 1950 int usecnt; 1951 1952 VI_LOCK(vp); 1953 usecnt = vp->v_usecount; 1954 VI_UNLOCK(vp); 1955 1956 return (usecnt); 1957} 1958 1959 1960/* 1961 * Vnode put/release. 1962 * If count drops to zero, call inactive routine and return to freelist. 1963 */ 1964void 1965vrele(vp) 1966 struct vnode *vp; 1967{ 1968 struct thread *td = curthread; /* XXX */ 1969 1970 KASSERT(vp != NULL, ("vrele: null vp")); 1971 1972 VI_LOCK(vp); 1973 1974 /* Skip this v_writecount check if we're going to panic below. 
*/
1975 VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
1976 ("vrele: missed vn_close"));
1977
1978 if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
1979 vp->v_usecount == 1)) {
1980 v_decr_usecount(vp);
1981 return;
1982 }
1983 if (vp->v_usecount != 1) {
1984#ifdef DIAGNOSTIC
1985 vprint("vrele: negative ref count", vp);
1986#endif
1987 VI_UNLOCK(vp);
1988 panic("vrele: negative ref cnt");
1989 }
1990 /*
1991 * We want to hold the vnode until the inactive finishes to
1992 * prevent vgone() races. We drop the use count here and the
1993 * hold count below when we're done.
1994 */
1995 v_decr_useonly(vp);
1996 /*
1997 * We must call VOP_INACTIVE with the node locked. Mark
1998 * as VI_DOINGINACT to avoid recursion.
1999 */
2000 if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) {
2001 VI_LOCK(vp);
2002 vinactive(vp, td);
2003 VOP_UNLOCK(vp, 0, td);
2004 } else
2005 VI_LOCK(vp);
2006 vdropl(vp);
2007}
2008
2009/*
2010 * Release an already locked vnode. This gives the same effect as
2011 * unlock+vrele(), but takes less time and avoids releasing and
2012 * re-acquiring the lock (as vrele() acquires the lock internally).
2013 */
2014void
2015vput(vp)
2016 struct vnode *vp;
2017{
2018 struct thread *td = curthread; /* XXX */
2019 int error;
2020
2021 KASSERT(vp != NULL, ("vput: null vp"));
2022 ASSERT_VOP_LOCKED(vp, "vput");
2023 VI_LOCK(vp);
2024 /* Skip this v_writecount check if we're going to panic below. */
2025 VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
2026 ("vput: missed vn_close"));
2027 error = 0;
2028
2029 if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2030 vp->v_usecount == 1)) {
2031 VOP_UNLOCK(vp, 0, td);
2032 v_decr_usecount(vp);
2033 return;
2034 }
2035
2036 if (vp->v_usecount != 1) {
2037#ifdef DIAGNOSTIC
2038 vprint("vput: negative ref count", vp);
2039#endif
2040 panic("vput: negative ref cnt");
2041 }
2042 /*
2043 * We want to hold the vnode until the inactive finishes to
2044 * prevent vgone() races. We drop the use count here and the
2045 * hold count below when we're done.
2046 */
2047 v_decr_useonly(vp);
2048 vp->v_iflag |= VI_OWEINACT;
2049 if (VOP_ISLOCKED(vp, NULL) != LK_EXCLUSIVE) {
2050 error = VOP_LOCK(vp, LK_EXCLUPGRADE|LK_INTERLOCK|LK_NOWAIT, td);
2051 VI_LOCK(vp);
2052 if (error)
2053 goto done;
2054 }
2055 if (vp->v_iflag & VI_OWEINACT)
2056 vinactive(vp, td);
2057 VOP_UNLOCK(vp, 0, td);
2058done:
2059 vdropl(vp);
2060}
2061
2062/*
2063 * Somebody doesn't want the vnode recycled.
2064 */
2065void
2066vhold(struct vnode *vp)
2067{
2068
2069 VI_LOCK(vp);
2070 vholdl(vp);
2071 VI_UNLOCK(vp);
2072}
2073
2074void
2075vholdl(struct vnode *vp)
2076{
2077
2078 vp->v_holdcnt++;
2079 if (VSHOULDBUSY(vp))
2080 vbusy(vp);
2081}
2082
2083/*
2084 * Note that there is one less holder who cares about this vnode.
2085 * vdrop() is the opposite of vhold().
2086 */
2087void
2088vdrop(struct vnode *vp)
2089{
2090
2091 VI_LOCK(vp);
2092 vdropl(vp);
2093}
2094
2095/*
2096 * Drop the hold count of the vnode. If this is the last reference to
2097 * the vnode, we free it if it has been vgone'd; otherwise it is
2098 * placed on the free list.
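 *
 * Callers typically bracket code that must drop the interlock with a
 * hold, as vflush() does below:
 *
 *	VI_LOCK(vp);
 *	vholdl(vp);
 *	... vn_lock(), sleep, or otherwise let the vnode be unlocked ...
 *	vdrop(vp);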
2099 */
2100static void
2101vdropl(struct vnode *vp)
2102{
2103
2104 if (vp->v_holdcnt <= 0)
2105 panic("vdrop: holdcnt %d", vp->v_holdcnt);
2106 vp->v_holdcnt--;
2107 if (vp->v_holdcnt == 0) {
2108 if (vp->v_iflag & VI_DOOMED) {
2109 vdestroy(vp);
2110 return;
2111 } else
2112 vfree(vp);
2113 }
2114 VI_UNLOCK(vp);
2115}
2116
2117/*
2118 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
2119 * flags. DOINGINACT prevents us from recursing in calls to vinactive.
2120 * OWEINACT tracks whether a vnode missed a call to inactive due to a
2121 * failed lock upgrade.
2122 */
2123static void
2124vinactive(struct vnode *vp, struct thread *td)
2125{
2126
2127 ASSERT_VOP_LOCKED(vp, "vinactive");
2128 ASSERT_VI_LOCKED(vp, "vinactive");
2129 VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
2130 ("vinactive: recursed on VI_DOINGINACT"));
2131 vp->v_iflag |= VI_DOINGINACT;
2132 vp->v_iflag &= ~VI_OWEINACT;
2133 VI_UNLOCK(vp);
2134 VOP_INACTIVE(vp, td);
2135 VI_LOCK(vp);
2136 VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
2137 ("vinactive: lost VI_DOINGINACT"));
2138 vp->v_iflag &= ~VI_DOINGINACT;
2139}
2140
2141/*
2142 * Remove any vnodes in the vnode table belonging to mount point mp.
2143 *
2144 * If FORCECLOSE is not specified, there should not be any active vnodes;
2145 * an error is returned if any are found (nb: this is a user error, not a
2146 * system error). If FORCECLOSE is specified, detach any active vnodes
2147 * that are found.
2148 *
2149 * If WRITECLOSE is set, only flush out regular file vnodes open for
2150 * writing.
2151 *
2152 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2153 *
2154 * `rootrefs' specifies the base reference count for the root vnode
2155 * of this filesystem. The root vnode is considered busy if its
2156 * v_usecount exceeds this value. On a successful return, vflush()
2157 * will call vrele() on the root vnode exactly rootrefs times.
2158 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2159 * be zero.
2160 */
2161#ifdef DIAGNOSTIC
2162static int busyprt = 0; /* print out busy vnodes */
2163SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
2164#endif
2165
2166int
2167vflush(mp, rootrefs, flags, td)
2168 struct mount *mp;
2169 int rootrefs;
2170 int flags;
2171 struct thread *td;
2172{
2173 struct vnode *vp, *nvp, *rootvp = NULL;
2174 struct vattr vattr;
2175 int busy = 0, error;
2176
2177 CTR1(KTR_VFS, "vflush: mp %p", mp);
2178 if (rootrefs > 0) {
2179 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2180 ("vflush: bad args"));
2181 /*
2182 * Get the filesystem root vnode. We can vput() it
2183 * immediately, since with rootrefs > 0, it won't go away.
2184 */
2185 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp, td)) != 0)
2186 return (error);
2187 vput(rootvp);
2188
2189 }
2190 MNT_ILOCK(mp);
2191loop:
2192 MNT_VNODE_FOREACH(vp, mp, nvp) {
2193
2194 VI_LOCK(vp);
2195 vholdl(vp);
2196 MNT_IUNLOCK(mp);
2197 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td);
2198 if (error) {
2199 vdrop(vp);
2200 MNT_ILOCK(mp);
2201 goto loop;
2202 }
2203 /*
2204 * Skip over vnodes marked VV_SYSTEM.
2205 */
2206 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2207 VOP_UNLOCK(vp, 0, td);
2208 vdrop(vp);
2209 MNT_ILOCK(mp);
2210 continue;
2211 }
2212 /*
2213 * If WRITECLOSE is set, flush out unlinked but still open
2214 * files (even if open only for reading) and regular file
2215 * vnodes open for writing.
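 *
 * (For example, downgrading a read-write mount to read-only uses
 * WRITECLOSE, since every regular file still open for writing must
 * be flushed and closed before the downgrade can succeed.)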
2216 */ 2217 if (flags & WRITECLOSE) { 2218 error = VOP_GETATTR(vp, &vattr, td->td_ucred, td); 2219 VI_LOCK(vp); 2220 2221 if ((vp->v_type == VNON || 2222 (error == 0 && vattr.va_nlink > 0)) && 2223 (vp->v_writecount == 0 || vp->v_type != VREG)) { 2224 VOP_UNLOCK(vp, 0, td); 2225 vdropl(vp); 2226 MNT_ILOCK(mp); 2227 continue; 2228 } 2229 } else 2230 VI_LOCK(vp); 2231 /* 2232 * With v_usecount == 0, all we need to do is clear out the 2233 * vnode data structures and we are done. 2234 * 2235 * If FORCECLOSE is set, forcibly close the vnode. 2236 */ 2237 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { 2238 VNASSERT(vp->v_usecount == 0 || 2239 (vp->v_type != VCHR && vp->v_type != VBLK), vp, 2240 ("device VNODE %p is FORCECLOSED", vp)); 2241 vgonel(vp); 2242 } else { 2243 busy++; 2244#ifdef DIAGNOSTIC 2245 if (busyprt) 2246 vprint("vflush: busy vnode", vp); 2247#endif 2248 } 2249 VOP_UNLOCK(vp, 0, td); 2250 vdropl(vp); 2251 MNT_ILOCK(mp); 2252 } 2253 MNT_IUNLOCK(mp); 2254 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 2255 /* 2256 * If just the root vnode is busy, and if its refcount 2257 * is equal to `rootrefs', then go ahead and kill it. 2258 */ 2259 VI_LOCK(rootvp); 2260 KASSERT(busy > 0, ("vflush: not busy")); 2261 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 2262 ("vflush: usecount %d < rootrefs %d", 2263 rootvp->v_usecount, rootrefs)); 2264 if (busy == 1 && rootvp->v_usecount == rootrefs) { 2265 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK, td); 2266 vgone(rootvp); 2267 VOP_UNLOCK(rootvp, 0, td); 2268 busy = 0; 2269 } else 2270 VI_UNLOCK(rootvp); 2271 } 2272 if (busy) 2273 return (EBUSY); 2274 for (; rootrefs > 0; rootrefs--) 2275 vrele(rootvp); 2276 return (0); 2277} 2278 2279/* 2280 * Recycle an unused vnode to the front of the free list. 2281 */ 2282int 2283vrecycle(struct vnode *vp, struct thread *td) 2284{ 2285 int recycled; 2286 2287 ASSERT_VOP_LOCKED(vp, "vrecycle"); 2288 recycled = 0; 2289 VI_LOCK(vp); 2290 if (vp->v_usecount == 0) { 2291 recycled = 1; 2292 vgonel(vp); 2293 } 2294 VI_UNLOCK(vp); 2295 return (recycled); 2296} 2297 2298/* 2299 * Eliminate all activity associated with a vnode 2300 * in preparation for reuse. 2301 */ 2302void 2303vgone(struct vnode *vp) 2304{ 2305 VI_LOCK(vp); 2306 vgonel(vp); 2307 VI_UNLOCK(vp); 2308} 2309 2310/* 2311 * vgone, with the vp interlock held. 2312 */ 2313void 2314vgonel(struct vnode *vp) 2315{ 2316 struct thread *td; 2317 int oweinact; 2318 int active; 2319 2320 CTR1(KTR_VFS, "vgonel: vp %p", vp); 2321 ASSERT_VOP_LOCKED(vp, "vgonel"); 2322 ASSERT_VI_LOCKED(vp, "vgonel"); 2323#if 0 2324 /* XXX Need to fix ttyvp before I enable this. */ 2325 VNASSERT(vp->v_holdcnt, vp, 2326 ("vgonel: vp %p has no reference.", vp)); 2327#endif 2328 td = curthread; 2329 2330 /* 2331 * Don't vgonel if we're already doomed. 2332 */ 2333 if (vp->v_iflag & VI_DOOMED) { 2334 VI_UNLOCK(vp); 2335 return; 2336 } 2337 vp->v_iflag |= VI_DOOMED; 2338 /* 2339 * Check to see if the vnode is in use. If so, we have to call 2340 * VOP_CLOSE() and VOP_INACTIVE(). 2341 */ 2342 active = vp->v_usecount; 2343 oweinact = (vp->v_iflag & VI_OWEINACT); 2344 VI_UNLOCK(vp); 2345 /* 2346 * Clean out any buffers associated with the vnode. 2347 * If the flush fails, just toss the buffers. 
2348 */ 2349 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 2350 (void) vn_write_suspend_wait(vp, NULL, V_WAIT); 2351 if (vinvalbuf(vp, V_SAVE, td, 0, 0) != 0) 2352 vinvalbuf(vp, 0, td, 0, 0); 2353 2354 /* 2355 * If purging an active vnode, it must be closed and 2356 * deactivated before being reclaimed. 2357 */ 2358 if (active) 2359 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 2360 if (oweinact || active) { 2361 VI_LOCK(vp); 2362 if ((vp->v_iflag & VI_DOINGINACT) == 0) 2363 vinactive(vp, td); 2364 VI_UNLOCK(vp); 2365 } 2366 /* 2367 * Reclaim the vnode. 2368 */ 2369 if (VOP_RECLAIM(vp, td)) 2370 panic("vgone: cannot reclaim"); 2371 VNASSERT(vp->v_object == NULL, vp, 2372 ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); 2373 /* 2374 * Delete from old mount point vnode list. 2375 */ 2376 delmntque(vp); 2377 cache_purge(vp); 2378 /* 2379 * Done with purge, reset to the standard lock and invalidate 2380 * the vnode. 2381 */ 2382 VI_LOCK(vp); 2383 vp->v_vnlock = &vp->v_lock; 2384 vp->v_op = &dead_vnodeops; 2385 vp->v_tag = "none"; 2386 vp->v_type = VBAD; 2387} 2388 2389/* 2390 * Calculate the total number of references to a special device. 2391 */ 2392int 2393vcount(vp) 2394 struct vnode *vp; 2395{ 2396 int count; 2397 2398 dev_lock(); 2399 count = vp->v_rdev->si_usecount; 2400 dev_unlock(); 2401 return (count); 2402} 2403 2404/* 2405 * Same as above, but using the struct cdev *as argument 2406 */ 2407int 2408count_dev(dev) 2409 struct cdev *dev; 2410{ 2411 int count; 2412 2413 dev_lock(); 2414 count = dev->si_usecount; 2415 dev_unlock(); 2416 return(count); 2417} 2418 2419/* 2420 * Print out a description of a vnode. 2421 */ 2422static char *typename[] = 2423{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; 2424 2425void 2426vn_printf(struct vnode *vp, const char *fmt, ...) 2427{ 2428 va_list ap; 2429 char buf[96]; 2430 2431 va_start(ap, fmt); 2432 vprintf(fmt, ap); 2433 va_end(ap); 2434 printf("%p: ", (void *)vp); 2435 printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); 2436 printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n", 2437 vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere); 2438 buf[0] = '\0'; 2439 buf[1] = '\0'; 2440 if (vp->v_vflag & VV_ROOT) 2441 strcat(buf, "|VV_ROOT"); 2442 if (vp->v_vflag & VV_TEXT) 2443 strcat(buf, "|VV_TEXT"); 2444 if (vp->v_vflag & VV_SYSTEM) 2445 strcat(buf, "|VV_SYSTEM"); 2446 if (vp->v_iflag & VI_DOOMED) 2447 strcat(buf, "|VI_DOOMED"); 2448 if (vp->v_iflag & VI_FREE) 2449 strcat(buf, "|VI_FREE"); 2450 printf(" flags (%s)\n", buf + 1); 2451 if (mtx_owned(VI_MTX(vp))) 2452 printf(" VI_LOCKed"); 2453 if (vp->v_object != NULL) 2454 printf(" v_object %p ref %d pages %d\n", 2455 vp->v_object, vp->v_object->ref_count, 2456 vp->v_object->resident_page_count); 2457 printf(" "); 2458 lockmgr_printinfo(vp->v_vnlock); 2459 printf("\n"); 2460 if (vp->v_data != NULL) 2461 VOP_PRINT(vp); 2462} 2463 2464#ifdef DDB 2465#include <ddb/ddb.h> 2466/* 2467 * List all of the locked vnodes in the system. 2468 * Called when debugging the kernel. 2469 */ 2470DB_SHOW_COMMAND(lockedvnods, lockedvnodes) 2471{ 2472 struct mount *mp, *nmp; 2473 struct vnode *vp; 2474 2475 /* 2476 * Note: because this is DDB, we can't obey the locking semantics 2477 * for these structures, which means we could catch an inconsistent 2478 * state and dereference a nasty pointer. Not much to be done 2479 * about that. 
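 *
 * Usage, from the ddb prompt:
 *
 *	show lockedvnods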
2480 */
2481 printf("Locked vnodes\n");
2482 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2483 nmp = TAILQ_NEXT(mp, mnt_list);
2484 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
2485 if (VOP_ISLOCKED(vp, NULL))
2486 vprint("", vp);
2487 }
2489 }
2490}
2491#endif
2492
2493/*
2494 * Fill in a struct xvfsconf based on a struct vfsconf.
2495 */
2496static void
2497vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp)
2498{
2499
2500 strcpy(xvfsp->vfc_name, vfsp->vfc_name);
2501 xvfsp->vfc_typenum = vfsp->vfc_typenum;
2502 xvfsp->vfc_refcount = vfsp->vfc_refcount;
2503 xvfsp->vfc_flags = vfsp->vfc_flags;
2504 /*
2505 * These are unused in userland, we keep them
2506 * to not break binary compatibility.
2507 */
2508 xvfsp->vfc_vfsops = NULL;
2509 xvfsp->vfc_next = NULL;
2510}
2511
2512/*
2513 * Top-level filesystem information gathering.
2514 */
2515static int
2516sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
2517{
2518 struct vfsconf *vfsp;
2519 struct xvfsconf xvfsp;
2520 int error;
2521
2522 error = 0;
2523 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
2524 bzero(&xvfsp, sizeof(xvfsp));
2525 vfsconf2x(vfsp, &xvfsp);
2526 error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp);
2527 if (error)
2528 break;
2529 }
2530 return (error);
2531}
2532
2533SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist,
2534 "S,xvfsconf", "List of all configured filesystems");
2535
2536#ifndef BURN_BRIDGES
2537static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
2538
2539static int
2540vfs_sysctl(SYSCTL_HANDLER_ARGS)
2541{
2542 int *name = (int *)arg1 - 1; /* XXX */
2543 u_int namelen = arg2 + 1; /* XXX */
2544 struct vfsconf *vfsp;
2545 struct xvfsconf xvfsp;
2546
2547 printf("WARNING: userland calling deprecated sysctl, "
2548 "please rebuild world\n");
2549
2550#if 1 || defined(COMPAT_PRELITE2)
2551 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2552 if (namelen == 1)
2553 return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2554#endif
2555
2556 switch (name[1]) {
2557 case VFS_MAXTYPENUM:
2558 if (namelen != 2)
2559 return (ENOTDIR);
2560 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2561 case VFS_CONF:
2562 if (namelen != 3)
2563 return (ENOTDIR); /* overloaded */
2564 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
2565 if (vfsp->vfc_typenum == name[2])
2566 break;
2567 if (vfsp == NULL)
2568 return (EOPNOTSUPP);
2569 bzero(&xvfsp, sizeof(xvfsp));
2570 vfsconf2x(vfsp, &xvfsp);
2571 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
2572 }
2573 return (EOPNOTSUPP);
2574}
2575
2576static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP,
2577 vfs_sysctl, "Generic filesystem");
2578
2579#if 1 || defined(COMPAT_PRELITE2)
2580
2581static int
2582sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
2583{
2584 int error;
2585 struct vfsconf *vfsp;
2586 struct ovfsconf ovfs;
2587
2588 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
2589 bzero(&ovfs, sizeof(ovfs));
2590 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
2591 strcpy(ovfs.vfc_name, vfsp->vfc_name);
2592 ovfs.vfc_index = vfsp->vfc_typenum;
2593 ovfs.vfc_refcount = vfsp->vfc_refcount;
2594 ovfs.vfc_flags = vfsp->vfc_flags;
2595 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2596 if (error)
2597 return (error);
2598 }
2599 return (0);
2600}
2601
2602#endif /* 1 || COMPAT_PRELITE2 */
2603#endif /* !BURN_BRIDGES */
2604
2605#define KINFO_VNODESLOP 10
2606#ifdef notyet
2607/*
2608 * Dump vnode list (via sysctl).
2609 */ 2610/* ARGSUSED */ 2611static int 2612sysctl_vnode(SYSCTL_HANDLER_ARGS) 2613{ 2614 struct xvnode *xvn; 2615 struct thread *td = req->td; 2616 struct mount *mp; 2617 struct vnode *vp; 2618 int error, len, n; 2619 2620 /* 2621 * Stale numvnodes access is not fatal here. 2622 */ 2623 req->lock = 0; 2624 len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; 2625 if (!req->oldptr) 2626 /* Make an estimate */ 2627 return (SYSCTL_OUT(req, 0, len)); 2628 2629 error = sysctl_wire_old_buffer(req, 0); 2630 if (error != 0) 2631 return (error); 2632 xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); 2633 n = 0; 2634 mtx_lock(&mountlist_mtx); 2635 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 2636 if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) 2637 continue; 2638 MNT_ILOCK(mp); 2639 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 2640 if (n == len) 2641 break; 2642 vref(vp); 2643 xvn[n].xv_size = sizeof *xvn; 2644 xvn[n].xv_vnode = vp; 2645 xvn[n].xv_id = 0; /* XXX compat */ 2646#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field 2647 XV_COPY(usecount); 2648 XV_COPY(writecount); 2649 XV_COPY(holdcnt); 2650 XV_COPY(mount); 2651 XV_COPY(numoutput); 2652 XV_COPY(type); 2653#undef XV_COPY 2654 xvn[n].xv_flag = vp->v_vflag; 2655 2656 switch (vp->v_type) { 2657 case VREG: 2658 case VDIR: 2659 case VLNK: 2660 break; 2661 case VBLK: 2662 case VCHR: 2663 if (vp->v_rdev == NULL) { 2664 vrele(vp); 2665 continue; 2666 } 2667 xvn[n].xv_dev = dev2udev(vp->v_rdev); 2668 break; 2669 case VSOCK: 2670 xvn[n].xv_socket = vp->v_socket; 2671 break; 2672 case VFIFO: 2673 xvn[n].xv_fifo = vp->v_fifoinfo; 2674 break; 2675 case VNON: 2676 case VBAD: 2677 default: 2678 /* shouldn't happen? */ 2679 vrele(vp); 2680 continue; 2681 } 2682 vrele(vp); 2683 ++n; 2684 } 2685 MNT_IUNLOCK(mp); 2686 mtx_lock(&mountlist_mtx); 2687 vfs_unbusy(mp, td); 2688 if (n == len) 2689 break; 2690 } 2691 mtx_unlock(&mountlist_mtx); 2692 2693 error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); 2694 free(xvn, M_TEMP); 2695 return (error); 2696} 2697 2698SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 2699 0, 0, sysctl_vnode, "S,xvnode", ""); 2700#endif 2701 2702/* 2703 * Unmount all filesystems. The list is traversed in reverse order 2704 * of mounting to avoid dependencies. 2705 */ 2706void 2707vfs_unmountall() 2708{ 2709 struct mount *mp; 2710 struct thread *td; 2711 int error; 2712 2713 KASSERT(curthread != NULL, ("vfs_unmountall: NULL curthread")); 2714 td = curthread; 2715 /* 2716 * Since this only runs when rebooting, it is not interlocked. 2717 */ 2718 while(!TAILQ_EMPTY(&mountlist)) { 2719 mp = TAILQ_LAST(&mountlist, mntlist); 2720 error = dounmount(mp, MNT_FORCE, td); 2721 if (error) { 2722 TAILQ_REMOVE(&mountlist, mp, mnt_list); 2723 printf("unmount of %s failed (", 2724 mp->mnt_stat.f_mntonname); 2725 if (error == EBUSY) 2726 printf("BUSY)\n"); 2727 else 2728 printf("%d)\n", error); 2729 } else { 2730 /* The unmount has removed mp from the mountlist */ 2731 } 2732 } 2733} 2734 2735/* 2736 * perform msync on all vnodes under a mount point 2737 * the mount point must be locked. 
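 *
 * A representative caller is the syncer: sync_fsync() below calls
 * vfs_msync(mp, MNT_NOWAIT) before VFS_SYNC() so that dirty pages of
 * mmap'd files are pushed back to their vnodes on each lazy sync.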
2738 */
2739void
2740vfs_msync(struct mount *mp, int flags)
2741{
2742 struct vnode *vp, *nvp;
2743 struct vm_object *obj;
2744 int tries;
2745
2746 tries = 5;
2747 MNT_ILOCK(mp);
2748loop:
2749 TAILQ_FOREACH_SAFE(vp, &mp->mnt_nvnodelist, v_nmntvnodes, nvp) {
2750 if (vp->v_mount != mp) {
2751 if (--tries > 0)
2752 goto loop;
2753 break;
2754 }
2755
2756 VI_LOCK(vp);
2757 if ((vp->v_iflag & VI_OBJDIRTY) &&
2758 (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) {
2759 MNT_IUNLOCK(mp);
2760 if (!vget(vp,
2761 LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
2762 curthread)) {
2763 if (vp->v_vflag & VV_NOSYNC) { /* unlinked */
2764 vput(vp);
2765 MNT_ILOCK(mp);
2766 continue;
2767 }
2768
2769 obj = vp->v_object;
2770 if (obj != NULL) {
2771 VM_OBJECT_LOCK(obj);
2772 vm_object_page_clean(obj, 0, 0,
2773 flags == MNT_WAIT ?
2774 OBJPC_SYNC : OBJPC_NOSYNC);
2775 VM_OBJECT_UNLOCK(obj);
2776 }
2777 vput(vp);
2778 }
2779 MNT_ILOCK(mp);
2780 if (TAILQ_NEXT(vp, v_nmntvnodes) != nvp) {
2781 if (--tries > 0)
2782 goto loop;
2783 break;
2784 }
2785 } else
2786 VI_UNLOCK(vp);
2787 }
2788 MNT_IUNLOCK(mp);
2789}
2790
2791/*
2792 * Mark a vnode as free, putting it up for recycling.
2793 */
2794static void
2795vfree(struct vnode *vp)
2796{
2797
2798 CTR1(KTR_VFS, "vfree vp %p", vp);
2799 ASSERT_VI_LOCKED(vp, "vfree");
2800 mtx_lock(&vnode_free_list_mtx);
2801 VNASSERT(vp->v_op != NULL, vp, ("vfree: vnode already reclaimed."));
2802 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free"));
2803 VNASSERT(VSHOULDFREE(vp), vp, ("vfree: freeing when we shouldn't"));
2804 VNASSERT((vp->v_iflag & VI_DOOMED) == 0, vp,
2805 ("vfree: Freeing doomed vnode"));
2806 if (vp->v_iflag & VI_AGE) {
2807 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2808 } else {
2809 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
2810 }
2811 freevnodes++;
2812 vp->v_iflag &= ~VI_AGE;
2813 vp->v_iflag |= VI_FREE;
2814 mtx_unlock(&vnode_free_list_mtx);
2815}
2816
2817/*
2818 * Opposite of vfree() - mark a vnode as in use.
2819 */
2820static void
2821vbusy(struct vnode *vp)
2822{
2823 CTR1(KTR_VFS, "vbusy vp %p", vp);
2824 ASSERT_VI_LOCKED(vp, "vbusy");
2825 VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free"));
2826 VNASSERT(vp->v_op != NULL, vp, ("vbusy: vnode already reclaimed."));
2827
2828 mtx_lock(&vnode_free_list_mtx);
2829 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2830 freevnodes--;
2831 vp->v_iflag &= ~(VI_FREE|VI_AGE);
2832 mtx_unlock(&vnode_free_list_mtx);
2833}
2834
2835/*
2836 * Initialize the per-vnode helper structure that holds poll-related state.
2837 */
2838void
2839v_addpollinfo(struct vnode *vp)
2840{
2841 struct vpollinfo *vi;
2842
2843 vi = uma_zalloc(vnodepoll_zone, M_WAITOK);
2844 if (vp->v_pollinfo != NULL) {
2845 uma_zfree(vnodepoll_zone, vi);
2846 return;
2847 }
2848 vp->v_pollinfo = vi;
2849 mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
2850 knlist_init(&vp->v_pollinfo->vpi_selinfo.si_note, vp, vfs_knllock,
2851 vfs_knlunlock, vfs_knllocked);
2852}
2853
2854/*
2855 * Record a process's interest in events which might happen to
2856 * a vnode. Because poll uses the historic select-style interface
2857 * internally, this routine serves as both the ``check for any
2858 * pending events'' and the ``record my interest in future events''
2859 * functions. (These are done together, while the lock is held,
2860 * to avoid race conditions.)
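 *
 * A minimal VOP_POLL implementation might simply do (sketch; the
 * argument names come from struct vop_poll_args):
 *
 *	return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));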
2861 */
2862int
2863vn_pollrecord(vp, td, events)
2864 struct vnode *vp;
2865 struct thread *td;
2866 short events;
2867{
2868
2869 if (vp->v_pollinfo == NULL)
2870 v_addpollinfo(vp);
2871 mtx_lock(&vp->v_pollinfo->vpi_lock);
2872 if (vp->v_pollinfo->vpi_revents & events) {
2873 /*
2874 * This leaves events we are not interested
2875 * in available for the other process
2876 * which presumably had requested them
2877 * (otherwise they would never have been
2878 * recorded).
2879 */
2880 events &= vp->v_pollinfo->vpi_revents;
2881 vp->v_pollinfo->vpi_revents &= ~events;
2882
2883 mtx_unlock(&vp->v_pollinfo->vpi_lock);
2884 return (events);
2885 }
2886 vp->v_pollinfo->vpi_events |= events;
2887 selrecord(td, &vp->v_pollinfo->vpi_selinfo);
2888 mtx_unlock(&vp->v_pollinfo->vpi_lock);
2889 return (0);
2890}
2891
2892/*
2893 * Routine to create and manage a filesystem syncer vnode.
2894 */
2895#define sync_close ((int (*)(struct vop_close_args *))nullop)
2896static int sync_fsync(struct vop_fsync_args *);
2897static int sync_inactive(struct vop_inactive_args *);
2898static int sync_reclaim(struct vop_reclaim_args *);
2899
2900static struct vop_vector sync_vnodeops = {
2901 .vop_bypass = VOP_EOPNOTSUPP,
2902 .vop_close = sync_close, /* close */
2903 .vop_fsync = sync_fsync, /* fsync */
2904 .vop_inactive = sync_inactive, /* inactive */
2905 .vop_reclaim = sync_reclaim, /* reclaim */
2906 .vop_lock = vop_stdlock, /* lock */
2907 .vop_unlock = vop_stdunlock, /* unlock */
2908 .vop_islocked = vop_stdislocked, /* islocked */
2909};
2910
2911/*
2912 * Create a new filesystem syncer vnode for the specified mount point.
2913 */
2914int
2915vfs_allocate_syncvnode(mp)
2916 struct mount *mp;
2917{
2918 struct vnode *vp;
2919 static long start, incr, next;
2920 int error;
2921
2922 /* Allocate a new vnode */
2923 if ((error = getnewvnode("syncer", mp, &sync_vnodeops, &vp)) != 0) {
2924 mp->mnt_syncer = NULL;
2925 return (error);
2926 }
2927 vp->v_type = VNON;
2928 /*
2929 * Place the vnode onto the syncer worklist. We attempt to
2930 * scatter them about on the list so that they will go off
2931 * at evenly distributed times even if all the filesystems
2932 * are mounted at once.
2933 */
2934 next += incr;
2935 if (next == 0 || next > syncer_maxdelay) {
2936 start /= 2;
2937 incr /= 2;
2938 if (start == 0) {
2939 start = syncer_maxdelay / 2;
2940 incr = syncer_maxdelay;
2941 }
2942 next = start;
2943 }
2944 VI_LOCK(vp);
2945 vn_syncer_add_to_worklist(&vp->v_bufobj,
2946 syncdelay > 0 ? next % syncdelay : 0);
2947 /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
2948 mtx_lock(&sync_mtx);
2949 sync_vnode_count++;
2950 mtx_unlock(&sync_mtx);
2951 VI_UNLOCK(vp);
2952 mp->mnt_syncer = vp;
2953 return (0);
2954}
2955
2956/*
2957 * Do a lazy sync of the filesystem.
2958 */
2959static int
2960sync_fsync(ap)
2961 struct vop_fsync_args /* {
2962 struct vnode *a_vp;
2963 struct ucred *a_cred;
2964 int a_waitfor;
2965 struct thread *a_td;
2966 } */ *ap;
2967{
2968 struct vnode *syncvp = ap->a_vp;
2969 struct mount *mp = syncvp->v_mount;
2970 struct thread *td = ap->a_td;
2971 int error, asyncflag;
2972 struct bufobj *bo;
2973
2974 /*
2975 * We only need to do something if this is a lazy evaluation.
2976 */
2977 if (ap->a_waitfor != MNT_LAZY)
2978 return (0);
2979
2980 /*
2981 * Move ourselves to the back of the sync list.
2982 */
2983 bo = &syncvp->v_bufobj;
2984 BO_LOCK(bo);
2985 vn_syncer_add_to_worklist(bo, syncdelay);
2986 BO_UNLOCK(bo);
2987
2988 /*
2989 * Walk the list of vnodes pushing all that are dirty and
2990 * not already on the sync list.
2991 */
2992 mtx_lock(&mountlist_mtx);
2993 if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) {
2994 mtx_unlock(&mountlist_mtx);
2995 return (0);
2996 }
2997 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
2998 vfs_unbusy(mp, td);
2999 return (0);
3000 }
3001 asyncflag = mp->mnt_flag & MNT_ASYNC;
3002 mp->mnt_flag &= ~MNT_ASYNC;
3003 vfs_msync(mp, MNT_NOWAIT);
3004 error = VFS_SYNC(mp, MNT_LAZY, td);
3005 if (asyncflag)
3006 mp->mnt_flag |= MNT_ASYNC;
3007 vn_finished_write(mp);
3008 vfs_unbusy(mp, td);
3009 return (error);
3010}
3011
3012/*
3013 * The syncer vnode is no longer referenced.
3014 */
3015static int
3016sync_inactive(ap)
3017 struct vop_inactive_args /* {
3018 struct vnode *a_vp;
3019 struct thread *a_td;
3020 } */ *ap;
3021{
3022
3023 vgone(ap->a_vp);
3024 return (0);
3025}
3026
3027/*
3028 * The syncer vnode is no longer needed and is being decommissioned.
3029 *
3030 * Modifications to the worklist must be protected by sync_mtx.
3031 */
3032static int
3033sync_reclaim(ap)
3034 struct vop_reclaim_args /* {
3035 struct vnode *a_vp;
3036 } */ *ap;
3037{
3038 struct vnode *vp = ap->a_vp;
3039 struct bufobj *bo;
3040
3041 VI_LOCK(vp);
3042 bo = &vp->v_bufobj;
3043 vp->v_mount->mnt_syncer = NULL;
3044 if (bo->bo_flag & BO_ONWORKLST) {
3045 mtx_lock(&sync_mtx);
3046 LIST_REMOVE(bo, bo_synclist);
3047 syncer_worklist_len--;
3048 sync_vnode_count--;
3049 mtx_unlock(&sync_mtx);
3050 bo->bo_flag &= ~BO_ONWORKLST;
3051 }
3052 VI_UNLOCK(vp);
3053
3054 return (0);
3055}
3056
3057/*
3058 * Check if the vnode represents a disk device.
3059 */
3060int
3061vn_isdisk(vp, errp)
3062 struct vnode *vp;
3063 int *errp;
3064{
3065 int error;
3066
3067 error = 0;
3068 dev_lock();
3069 if (vp->v_type != VCHR)
3070 error = ENOTBLK;
3071 else if (vp->v_rdev == NULL)
3072 error = ENXIO;
3073 else if (vp->v_rdev->si_devsw == NULL)
3074 error = ENXIO;
3075 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
3076 error = ENOTBLK;
3077 dev_unlock();
3078 if (errp != NULL)
3079 *errp = error;
3080 return (error == 0);
3081}
3082
3083/*
3084 * Common filesystem object access control check routine. Accepts a
3085 * vnode's type, "mode", uid and gid, requested access mode, credentials,
3086 * and optional call-by-reference privused argument allowing vaccess()
3087 * to indicate to the caller whether privilege was used to satisfy the
3088 * request (obsoleted). Returns 0 on success, or an errno on failure.
3089 */
3090int
3091vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
3092 enum vtype type;
3093 mode_t file_mode;
3094 uid_t file_uid;
3095 gid_t file_gid;
3096 mode_t acc_mode;
3097 struct ucred *cred;
3098 int *privused;
3099{
3100 mode_t dac_granted;
3101#ifdef CAPABILITIES
3102 mode_t cap_granted;
3103#endif
3104
3105 /*
3106 * Look for a normal, non-privileged way to access the file/directory
3107 * as requested. If it exists, go with that.
3108 */
3109
3110 if (privused != NULL)
3111 *privused = 0;
3112
3113 dac_granted = 0;
3114
3115 /* Check the owner.
*/ 3116 if (cred->cr_uid == file_uid) { 3117 dac_granted |= VADMIN; 3118 if (file_mode & S_IXUSR) 3119 dac_granted |= VEXEC; 3120 if (file_mode & S_IRUSR) 3121 dac_granted |= VREAD; 3122 if (file_mode & S_IWUSR) 3123 dac_granted |= (VWRITE | VAPPEND); 3124 3125 if ((acc_mode & dac_granted) == acc_mode) 3126 return (0); 3127 3128 goto privcheck; 3129 } 3130 3131 /* Otherwise, check the groups (first match) */ 3132 if (groupmember(file_gid, cred)) { 3133 if (file_mode & S_IXGRP) 3134 dac_granted |= VEXEC; 3135 if (file_mode & S_IRGRP) 3136 dac_granted |= VREAD; 3137 if (file_mode & S_IWGRP) 3138 dac_granted |= (VWRITE | VAPPEND); 3139 3140 if ((acc_mode & dac_granted) == acc_mode) 3141 return (0); 3142 3143 goto privcheck; 3144 } 3145 3146 /* Otherwise, check everyone else. */ 3147 if (file_mode & S_IXOTH) 3148 dac_granted |= VEXEC; 3149 if (file_mode & S_IROTH) 3150 dac_granted |= VREAD; 3151 if (file_mode & S_IWOTH) 3152 dac_granted |= (VWRITE | VAPPEND); 3153 if ((acc_mode & dac_granted) == acc_mode) 3154 return (0); 3155 3156privcheck: 3157 if (!suser_cred(cred, SUSER_ALLOWJAIL)) { 3158 /* XXX audit: privilege used */ 3159 if (privused != NULL) 3160 *privused = 1; 3161 return (0); 3162 } 3163 3164#ifdef CAPABILITIES 3165 /* 3166 * Build a capability mask to determine if the set of capabilities 3167 * satisfies the requirements when combined with the granted mask 3168 * from above. 3169 * For each capability, if the capability is required, bitwise 3170 * or the request type onto the cap_granted mask. 3171 */ 3172 cap_granted = 0; 3173 3174 if (type == VDIR) { 3175 /* 3176 * For directories, use CAP_DAC_READ_SEARCH to satisfy 3177 * VEXEC requests, instead of CAP_DAC_EXECUTE. 3178 */ 3179 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && 3180 !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL)) 3181 cap_granted |= VEXEC; 3182 } else { 3183 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && 3184 !cap_check(cred, NULL, CAP_DAC_EXECUTE, SUSER_ALLOWJAIL)) 3185 cap_granted |= VEXEC; 3186 } 3187 3188 if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) && 3189 !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL)) 3190 cap_granted |= VREAD; 3191 3192 if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) && 3193 !cap_check(cred, NULL, CAP_DAC_WRITE, SUSER_ALLOWJAIL)) 3194 cap_granted |= (VWRITE | VAPPEND); 3195 3196 if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) && 3197 !cap_check(cred, NULL, CAP_FOWNER, SUSER_ALLOWJAIL)) 3198 cap_granted |= VADMIN; 3199 3200 if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) { 3201 /* XXX audit: privilege used */ 3202 if (privused != NULL) 3203 *privused = 1; 3204 return (0); 3205 } 3206#endif 3207 3208 return ((acc_mode & VADMIN) ? EPERM : EACCES); 3209} 3210 3211/* 3212 * Credential check based on process requesting service, and per-attribute 3213 * permissions. 3214 */ 3215int 3216extattr_check_cred(struct vnode *vp, int attrnamespace, 3217 struct ucred *cred, struct thread *td, int access) 3218{ 3219 3220 /* 3221 * Kernel-invoked always succeeds. 3222 */ 3223 if (cred == NOCRED) 3224 return (0); 3225 3226 /* 3227 * Do not allow privileged processes in jail to directly 3228 * manipulate system attributes. 3229 * 3230 * XXX What capability should apply here? 3231 * Probably CAP_SYS_SETFFLAG. 
3232 */ 3233 switch (attrnamespace) { 3234 case EXTATTR_NAMESPACE_SYSTEM: 3235 /* Potentially should be: return (EPERM); */ 3236 return (suser_cred(cred, 0)); 3237 case EXTATTR_NAMESPACE_USER: 3238 return (VOP_ACCESS(vp, access, cred, td)); 3239 default: 3240 return (EPERM); 3241 } 3242} 3243 3244#ifdef DEBUG_VFS_LOCKS 3245/* 3246 * This only exists to supress warnings from unlocked specfs accesses. It is 3247 * no longer ok to have an unlocked VFS. 3248 */ 3249#define IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD) 3250 3251int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ 3252SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, ""); 3253 3254int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ 3255SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, ""); 3256 3257int vfs_badlock_print = 1; /* Print lock violations. */ 3258SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, ""); 3259 3260#ifdef KDB 3261int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ 3262SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, ""); 3263#endif 3264 3265static void 3266vfs_badlock(const char *msg, const char *str, struct vnode *vp) 3267{ 3268 3269#ifdef KDB 3270 if (vfs_badlock_backtrace) 3271 kdb_backtrace(); 3272#endif 3273 if (vfs_badlock_print) 3274 printf("%s: %p %s\n", str, (void *)vp, msg); 3275 if (vfs_badlock_ddb) 3276 kdb_enter("lock violation"); 3277} 3278 3279void 3280assert_vi_locked(struct vnode *vp, const char *str) 3281{ 3282 3283 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) 3284 vfs_badlock("interlock is not locked but should be", str, vp); 3285} 3286 3287void 3288assert_vi_unlocked(struct vnode *vp, const char *str) 3289{ 3290 3291 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) 3292 vfs_badlock("interlock is locked but should not be", str, vp); 3293} 3294 3295void 3296assert_vop_locked(struct vnode *vp, const char *str) 3297{ 3298 3299 if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, NULL) == 0) 3300 vfs_badlock("is not locked but should be", str, vp); 3301} 3302 3303void 3304assert_vop_unlocked(struct vnode *vp, const char *str) 3305{ 3306 3307 if (vp && !IGNORE_LOCK(vp) && 3308 VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE) 3309 vfs_badlock("is locked but should not be", str, vp); 3310} 3311 3312void 3313assert_vop_elocked(struct vnode *vp, const char *str) 3314{ 3315 3316 if (vp && !IGNORE_LOCK(vp) && 3317 VOP_ISLOCKED(vp, curthread) != LK_EXCLUSIVE) 3318 vfs_badlock("is not exclusive locked but should be", str, vp); 3319} 3320 3321#if 0 3322void 3323assert_vop_elocked_other(struct vnode *vp, const char *str) 3324{ 3325 3326 if (vp && !IGNORE_LOCK(vp) && 3327 VOP_ISLOCKED(vp, curthread) != LK_EXCLOTHER) 3328 vfs_badlock("is not exclusive locked by another thread", 3329 str, vp); 3330} 3331 3332void 3333assert_vop_slocked(struct vnode *vp, const char *str) 3334{ 3335 3336 if (vp && !IGNORE_LOCK(vp) && 3337 VOP_ISLOCKED(vp, curthread) != LK_SHARED) 3338 vfs_badlock("is not locked shared but should be", str, vp); 3339} 3340#endif /* 0 */ 3341#endif /* DEBUG_VFS_LOCKS */ 3342 3343void 3344vop_rename_pre(void *ap) 3345{ 3346 struct vop_rename_args *a = ap; 3347 3348#ifdef DEBUG_VFS_LOCKS 3349 if (a->a_tvp) 3350 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); 3351 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); 3352 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); 3353 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); 3354 
3355 /* Check the source (from). */
3356 if (a->a_tdvp != a->a_fdvp)
3357 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
3358 if (a->a_tvp != a->a_fvp)
3359 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
3360
3361 /* Check the target. */
3362 if (a->a_tvp)
3363 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
3364 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
3365#endif
3366 if (a->a_tdvp != a->a_fdvp)
3367 vholdl(a->a_fdvp);
3368 if (a->a_tvp != a->a_fvp)
3369 vhold(a->a_fvp);
3370 vhold(a->a_tdvp);
3371 if (a->a_tvp)
3372 vhold(a->a_tvp);
3373}
3374
3375void
3376vop_strategy_pre(void *ap)
3377{
3378#ifdef DEBUG_VFS_LOCKS
3379 struct vop_strategy_args *a;
3380 struct buf *bp;
3381
3382 a = ap;
3383 bp = a->a_bp;
3384
3385 /*
3386 * Cluster ops lock their component buffers but not the IO container.
3387 */
3388 if ((bp->b_flags & B_CLUSTER) != 0)
3389 return;
3390
3391 if (BUF_REFCNT(bp) < 1) {
3392 if (vfs_badlock_print)
3393 printf(
3394 "VOP_STRATEGY: bp is not locked but should be\n");
3395 if (vfs_badlock_ddb)
3396 kdb_enter("lock violation");
3397 }
3398#endif
3399}
3400
3401void
3402vop_lookup_pre(void *ap)
3403{
3404#ifdef DEBUG_VFS_LOCKS
3405 struct vop_lookup_args *a;
3406 struct vnode *dvp;
3407
3408 a = ap;
3409 dvp = a->a_dvp;
3410 ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
3411 ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
3412#endif
3413}
3414
3415void
3416vop_lookup_post(void *ap, int rc)
3417{
3418#ifdef DEBUG_VFS_LOCKS
3419 struct vop_lookup_args *a;
3420 struct vnode *dvp;
3421 struct vnode *vp;
3422
3423 a = ap;
3424 dvp = a->a_dvp;
3425 vp = *(a->a_vpp);
3426
3427 ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP");
3428 ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP");
3429
3430 if (!rc)
3431 ASSERT_VOP_LOCKED(vp, "VOP_LOOKUP (child)");
3432#endif
3433}
3434
3435void
3436vop_lock_pre(void *ap)
3437{
3438#ifdef DEBUG_VFS_LOCKS
3439 struct vop_lock_args *a = ap;
3440
3441 if ((a->a_flags & LK_INTERLOCK) == 0)
3442 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
3443 else
3444 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
3445#endif
3446}
3447
3448void
3449vop_lock_post(void *ap, int rc)
3450{
3451#ifdef DEBUG_VFS_LOCKS
3452 struct vop_lock_args *a = ap;
3453
3454 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
3455 if (rc == 0)
3456 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
3457#endif
3458}
3459
3460void
3461vop_unlock_pre(void *ap)
3462{
3463#ifdef DEBUG_VFS_LOCKS
3464 struct vop_unlock_args *a = ap;
3465
3466 if (a->a_flags & LK_INTERLOCK)
3467 ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
3468 ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
3469#endif
3470}
3471
3472void
3473vop_unlock_post(void *ap, int rc)
3474{
3475#ifdef DEBUG_VFS_LOCKS
3476 struct vop_unlock_args *a = ap;
3477
3478 if (a->a_flags & LK_INTERLOCK)
3479 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
3480#endif
3481}
3482
3483void
3484vop_create_post(void *ap, int rc)
3485{
3486 struct vop_create_args *a = ap;
3487
3488 if (!rc)
3489 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
3490}
3491
3492void
3493vop_link_post(void *ap, int rc)
3494{
3495 struct vop_link_args *a = ap;
3496
3497 if (!rc) {
3498 VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
3499 VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
3500 }
3501}
3502
3503void
3504vop_mkdir_post(void *ap, int rc)
3505{
3506 struct vop_mkdir_args *a = ap;
3507
3508 if (!rc)
3509 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
3510}
3511
3512void
3513vop_mknod_post(void *ap, int rc)
3514{
3515 struct vop_mknod_args *a = ap;
3516
3517 if (!rc)
3518 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
3519}
3520
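/*
 * The vop_*_post() hooks above and below are called by the generated
 * VOP_*() wrappers with the operation's return code; on success
 * (rc == 0) they post the corresponding kqueue notes. For instance, a
 * successful VOP_MKDIR() raises NOTE_WRITE | NOTE_LINK on the parent
 * directory via VFS_KNOTE_LOCKED().
 */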
3521void 3522vop_remove_post(void *ap, int rc) 3523{ 3524 struct vop_remove_args *a = ap; 3525 3526 if (!rc) { 3527 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 3528 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); 3529 } 3530} 3531 3532void 3533vop_rename_post(void *ap, int rc) 3534{ 3535 struct vop_rename_args *a = ap; 3536 3537 if (!rc) { 3538 VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE); 3539 VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE); 3540 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); 3541 if (a->a_tvp) 3542 VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); 3543 } 3544 if (a->a_tdvp != a->a_fdvp) 3545 vdrop(a->a_fdvp); 3546 if (a->a_tvp != a->a_fvp) 3547 vdrop(a->a_fvp); 3548 vdrop(a->a_tdvp); 3549 if (a->a_tvp) 3550 vdrop(a->a_tvp); 3551} 3552 3553void 3554vop_rmdir_post(void *ap, int rc) 3555{ 3556 struct vop_rmdir_args *a = ap; 3557 3558 if (!rc) { 3559 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); 3560 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); 3561 } 3562} 3563 3564void 3565vop_setattr_post(void *ap, int rc) 3566{ 3567 struct vop_setattr_args *a = ap; 3568 3569 if (!rc) 3570 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 3571} 3572 3573void 3574vop_symlink_post(void *ap, int rc) 3575{ 3576 struct vop_symlink_args *a = ap; 3577 3578 if (!rc) 3579 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 3580} 3581 3582static struct knlist fs_knlist; 3583 3584static void 3585vfs_event_init(void *arg) 3586{ 3587 knlist_init(&fs_knlist, NULL, NULL, NULL, NULL); 3588} 3589/* XXX - correct order? */ 3590SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); 3591 3592void 3593vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data __unused) 3594{ 3595 3596 KNOTE_UNLOCKED(&fs_knlist, event); 3597} 3598 3599static int filt_fsattach(struct knote *kn); 3600static void filt_fsdetach(struct knote *kn); 3601static int filt_fsevent(struct knote *kn, long hint); 3602 3603struct filterops fs_filtops = 3604 { 0, filt_fsattach, filt_fsdetach, filt_fsevent }; 3605 3606static int 3607filt_fsattach(struct knote *kn) 3608{ 3609 3610 kn->kn_flags |= EV_CLEAR; 3611 knlist_add(&fs_knlist, kn, 0); 3612 return (0); 3613} 3614 3615static void 3616filt_fsdetach(struct knote *kn) 3617{ 3618 3619 knlist_remove(&fs_knlist, kn, 0); 3620} 3621 3622static int 3623filt_fsevent(struct knote *kn, long hint) 3624{ 3625 3626 kn->kn_fflags |= hint; 3627 return (kn->kn_fflags != 0); 3628} 3629 3630static int 3631sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) 3632{ 3633 struct vfsidctl vc; 3634 int error; 3635 struct mount *mp; 3636 3637 error = SYSCTL_IN(req, &vc, sizeof(vc)); 3638 if (error) 3639 return (error); 3640 if (vc.vc_vers != VFS_CTL_VERS1) 3641 return (EINVAL); 3642 mp = vfs_getvfs(&vc.vc_fsid); 3643 if (mp == NULL) 3644 return (ENOENT); 3645 /* ensure that a specific sysctl goes to the right filesystem. */ 3646 if (strcmp(vc.vc_fstypename, "*") != 0 && 3647 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { 3648 return (EINVAL); 3649 } 3650 VCTLTOREQ(&vc, req); 3651 return (VFS_SYSCTL(mp, vc.vc_op, req)); 3652} 3653 3654SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR, 3655 NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); 3656 3657/* 3658 * Function to initialize a va_filerev field sensibly. 3659 * XXX: Wouldn't a random number make a lot more sense ?? 
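 *
 * As implemented below, the boot-relative seconds occupy the upper
 * 32 bits and the top half of the fraction the lower 32 bits, so the
 * value is monotonically non-decreasing across calls within a boot.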
3660 */ 3661u_quad_t 3662init_va_filerev(void) 3663{ 3664 struct bintime bt; 3665 3666 getbinuptime(&bt); 3667 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); 3668} 3669 3670static int filt_vfsread(struct knote *kn, long hint); 3671static int filt_vfswrite(struct knote *kn, long hint); 3672static int filt_vfsvnode(struct knote *kn, long hint); 3673static void filt_vfsdetach(struct knote *kn); 3674static struct filterops vfsread_filtops = 3675 { 1, NULL, filt_vfsdetach, filt_vfsread }; 3676static struct filterops vfswrite_filtops = 3677 { 1, NULL, filt_vfsdetach, filt_vfswrite }; 3678static struct filterops vfsvnode_filtops = 3679 { 1, NULL, filt_vfsdetach, filt_vfsvnode }; 3680 3681static void 3682vfs_knllock(void *arg) 3683{ 3684 struct vnode *vp = arg; 3685 3686 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); 3687} 3688 3689static void 3690vfs_knlunlock(void *arg) 3691{ 3692 struct vnode *vp = arg; 3693 3694 VOP_UNLOCK(vp, 0, curthread); 3695} 3696 3697static int 3698vfs_knllocked(void *arg) 3699{ 3700 struct vnode *vp = arg; 3701 3702 return (VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE); 3703} 3704 3705int 3706vfs_kqfilter(struct vop_kqfilter_args *ap) 3707{ 3708 struct vnode *vp = ap->a_vp; 3709 struct knote *kn = ap->a_kn; 3710 struct knlist *knl; 3711 3712 switch (kn->kn_filter) { 3713 case EVFILT_READ: 3714 kn->kn_fop = &vfsread_filtops; 3715 break; 3716 case EVFILT_WRITE: 3717 kn->kn_fop = &vfswrite_filtops; 3718 break; 3719 case EVFILT_VNODE: 3720 kn->kn_fop = &vfsvnode_filtops; 3721 break; 3722 default: 3723 return (1); 3724 } 3725 3726 kn->kn_hook = (caddr_t)vp; 3727 3728 if (vp->v_pollinfo == NULL) 3729 v_addpollinfo(vp); 3730 if (vp->v_pollinfo == NULL) 3731 return (ENOMEM); 3732 knl = &vp->v_pollinfo->vpi_selinfo.si_note; 3733 knlist_add(knl, kn, 0); 3734 3735 return (0); 3736} 3737 3738/* 3739 * Detach knote from vnode 3740 */ 3741static void 3742filt_vfsdetach(struct knote *kn) 3743{ 3744 struct vnode *vp = (struct vnode *)kn->kn_hook; 3745 3746 KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); 3747 knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); 3748} 3749 3750/*ARGSUSED*/ 3751static int 3752filt_vfsread(struct knote *kn, long hint) 3753{ 3754 struct vnode *vp = (struct vnode *)kn->kn_hook; 3755 struct vattr va; 3756 3757 /* 3758 * filesystem is gone, so set the EOF flag and schedule 3759 * the knote for deletion. 3760 */ 3761 if (hint == NOTE_REVOKE) { 3762 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 3763 return (1); 3764 } 3765 3766 if (VOP_GETATTR(vp, &va, curthread->td_ucred, curthread)) 3767 return (0); 3768 3769 kn->kn_data = va.va_size - kn->kn_fp->f_offset; 3770 return (kn->kn_data != 0); 3771} 3772 3773/*ARGSUSED*/ 3774static int 3775filt_vfswrite(struct knote *kn, long hint) 3776{ 3777 /* 3778 * filesystem is gone, so set the EOF flag and schedule 3779 * the knote for deletion. 3780 */ 3781 if (hint == NOTE_REVOKE) 3782 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 3783 3784 kn->kn_data = 0; 3785 return (1); 3786} 3787 3788static int 3789filt_vfsvnode(struct knote *kn, long hint) 3790{ 3791 if (kn->kn_sfflags & hint) 3792 kn->kn_fflags |= hint; 3793 if (hint == NOTE_REVOKE) { 3794 kn->kn_flags |= EV_EOF; 3795 return (1); 3796 } 3797 return (kn->kn_fflags != 0); 3798} 3799
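/*
 * Userland reaches the filters above through kqueue(2); a hypothetical
 * watcher of an open file descriptor `fd' might do:
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *
 *	EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
 *	    NOTE_WRITE | NOTE_DELETE | NOTE_RENAME, 0, NULL);
 *	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
 *		err(1, "kevent");
 *
 * after which kevent() reports events dispatched via filt_vfsvnode().
 */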