/*-
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 */

/*
 * External virtual filesystem routines
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_ddb.h"
#include "opt_watchdog.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/extattr.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lockf.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/reboot.h>
#include <sys/sched.h>
#include <sys/sleepqueue.h>
#include <sys/smp.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/watchdog.h>

#include <machine/stdarg.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>
#include <vm/uma.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#define	WI_MPSAFEQ	0
#define	WI_GIANTQ	1

static void	delmntque(struct vnode *vp);
static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
		    int slpflag, int slptimeo);
static void	syncer_shutdown(void *arg, int howto);
static int	vtryrecycle(struct vnode *vp);
static void	v_incr_usecount(struct vnode *);
static void	v_decr_usecount(struct vnode *);
static void	v_decr_useonly(struct vnode *);
static void	v_upgrade_usecount(struct vnode *);
static void	vnlru_free(int);
static void	vgonel(struct vnode *);
static void	vfs_knllock(void *arg);
static void	vfs_knlunlock(void *arg);
static void	vfs_knl_assert_locked(void *arg);
static void	vfs_knl_assert_unlocked(void *arg);
static void	destroy_vpollinfo(struct vpollinfo *vi);

/*
 * Number of vnodes in existence.  Increased whenever getnewvnode()
 * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode.
 */
static unsigned long	numvnodes;

SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
    "Number of vnodes in existence");

/*
 * Conversion tables for conversion from vnode types to inode formats
 * and back.
 */
enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[10] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
};
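
/*
 * Editorial note (not part of the original source): these tables back
 * the IFTOVT() and VTTOIF() macros in sys/vnode.h, which map between
 * inode mode bits and vnode types.  For example:
 *
 *	IFTOVT(S_IFDIR) == iftovt_tab[S_IFDIR >> 12] == iftovt_tab[4] == VDIR
 *	VTTOIF(VREG) == vttoif_tab[VREG] == S_IFREG
 */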

/*
 * List of vnodes that are ready for recycling.
 */
static TAILQ_HEAD(freelst, vnode) vnode_free_list;

/*
 * Free vnode target.  Free vnodes may simply be files which have been stat'd
 * but not read.  This is somewhat common, and a small cache of such files
 * should be kept to avoid recreation costs.
 */
static u_long wantfreevnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
/* Number of vnodes in the free list. */
static u_long freevnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0,
    "Number of vnodes in the free list");

static int vlru_allow_cache_src;
SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
    &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");

/*
 * Various variables used for debugging the new implementation of
 * reassignbuf().
 * XXX these are probably of (very) limited utility now.
 */
static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
    "Number of calls to reassignbuf");

/*
 * Cache for the mount type id assigned to NFS.  This is used for
 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
 */
int	nfs_mount_type = -1;

/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;

/*
 * Lock for any access to the following:
 *	vnode_free_list
 *	numvnodes
 *	freevnodes
 */
static struct mtx vnode_free_list_mtx;

/* Publicly exported FS */
struct nfs_public nfs_pub;

/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static uma_zone_t vnode_zone;
static uma_zone_t vnodepoll_zone;

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, metadata writes are
 * delayed only about half the time that file data is delayed.
 * Similarly, directory updates are more critical, so they are delayed
 * only about a third the time that file data is delayed.  Thus, there are
 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 * one each second (driven off the filesystem syncer process).  The
 * syncer_delayno variable indicates the next queue that is to be processed.
 * Items that need to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */
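/*
 * Editorial worked example (not part of the original source): with the
 * default SYNCER_MAXDELAY of 32, hashinit() returns a 32-bucket table
 * and syncer_mask is 31.  If syncer_delayno is currently 20, queueing a
 * bufobj with a delay of 15 places it in slot (20 + 15) & 31 == 3, so
 * the index wraps around and the item is visited 15 syncer ticks from
 * now.
 */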
static int syncer_delayno;
static long syncer_mask;
LIST_HEAD(synclist, bufobj);
static struct synclist *syncer_workitem_pending[2];
/*
 * The sync_mtx protects:
 *	bo->bo_synclist
 *	sync_vnode_count
 *	syncer_delayno
 *	syncer_state
 *	syncer_workitem_pending
 *	syncer_worklist_len
 *	rushjob
 */
static struct mtx sync_mtx;
static struct cv sync_wakeup;

#define SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
static int syncdelay = 30;		/* max time to delay syncing data */
static int filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
    "Time to delay syncing files (in seconds)");
static int dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
    "Time to delay syncing directories (in seconds)");
static int metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
    "Time to delay syncing metadata (in seconds)");
static int rushjob;		/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
    "Number of times I/O speeded up (rush requests)");

/*
 * When shutting down the syncer, run it at four times normal speed.
 */
#define SYNCER_SHUTDOWN_SPEEDUP		4
static int sync_vnode_count;
static int syncer_worklist_len;
static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
    syncer_state;

/*
 * Number of vnodes we want to exist at any one time.  This is mostly used
 * to size hash tables in vnode-related code.  It is normally not used in
 * getnewvnode(), as wantfreevnodes is normally nonzero.
 *
 * XXX desiredvnodes is historical cruft and should not exist.
 */
int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
    &desiredvnodes, 0, "Maximum number of vnodes");
SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
    &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
static int vnlru_nowhere;
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");

/*
 * Macros to control when a vnode is freed and recycled.  All require
 * the vnode interlock.
 */
#define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
#define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
#define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)

/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
static int vnsz2log;
/*
 * Initialize the vnode management data structures.
 *
 * Reevaluate the following cap on the number of vnodes after the physical
 * memory size exceeds 512GB.  In the limit, as the physical memory size
 * grows, the ratio of physical pages to vnodes approaches sixteen to one.
 */
#ifndef	MAXVNODES_MAX
#define	MAXVNODES_MAX	(512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
#endif
static void
vntblinit(void *dummy __unused)
{
	u_int i;
	int physvnodes, virtvnodes;

	/*
	 * Desiredvnodes is a function of the physical memory size and the
	 * kernel's heap size.  Generally speaking, it scales with the
	 * physical memory size.  The ratio of desiredvnodes to physical pages
	 * is one to four until desiredvnodes exceeds 98,304.  Thereafter, the
	 * marginal ratio of desiredvnodes to physical pages is one to
	 * sixteen.  However, desiredvnodes is limited by the kernel's heap
	 * size.  The memory required by desiredvnodes vnodes and vm objects
	 * may not exceed one seventh of the kernel's heap size.
	 */
	physvnodes = maxproc + cnt.v_page_count / 16 + 3 * min(98304 * 4,
	    cnt.v_page_count) / 16;
	virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
	    sizeof(struct vnode)));
	desiredvnodes = min(physvnodes, virtvnodes);
	if (desiredvnodes > MAXVNODES_MAX) {
		if (bootverbose)
			printf("Reducing kern.maxvnodes %d -> %d\n",
			    desiredvnodes, MAXVNODES_MAX);
		desiredvnodes = MAXVNODES_MAX;
	}
	wantfreevnodes = desiredvnodes / 4;
	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
	TAILQ_INIT(&vnode_free_list);
	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending[WI_MPSAFEQ] = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_workitem_pending[WI_GIANTQ] = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
	cv_init(&sync_wakeup, "syncer");
	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
		vnsz2log++;
	vnsz2log--;
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
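
/*
 * Editorial worked example (not part of the original source): on a
 * machine with 4GB of RAM and 4KB pages, cnt.v_page_count is about
 * 1048576, so physvnodes is maxproc + 1048576 / 16 + 3 * 393216 / 16,
 * i.e. roughly maxproc + 139264 vnodes; virtvnodes (one seventh of the
 * kernel heap divided by the per-vnode footprint) caps this further on
 * kernels with a small heap.
 */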

/*
 * Mark a mount point as busy.  Used to synchronize access and to delay
 * unmounting.  Note that mountlist_mtx is not released on failure.
 *
 * vfs_busy() is a custom lock, it can block the caller.
 * vfs_busy() only sleeps if the unmount is active on the mount point.
 * For a mountpoint mp, vfs_busy-enforced lock is before lock of any
 * vnode belonging to mp.
 *
 * Lookup uses vfs_busy() to traverse mount points.
 * root fs			var fs
 * / vnode lock		A	/ vnode lock (/var)		D
 * /var vnode lock	B	/log vnode lock(/var/log)	E
 * vfs_busy lock	C	vfs_busy lock			F
 *
 * Within each file system, the lock order is C->A->B and F->D->E.
 *
 * When traversing across mounts, the system follows that lock order:
 *
 *	C->A->B
 *	      |
 *	      +->F->D->E
 *
 * The lookup() process for namei("/var") illustrates the process:
 *  VOP_LOOKUP() obtains B while A is held
 *  vfs_busy() obtains a shared lock on F while A and B are held
 *  vput() releases lock on B
 *  vput() releases lock on A
 *  VFS_ROOT() obtains lock on D while shared lock on F is held
 *  vfs_unbusy() releases shared lock on F
 *  vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
 *    Attempt to lock A (instead of vp_crossmp) while D is held would
 *    violate the global order, causing deadlocks.
 *
 * dounmount() locks B while F is drained.
 */
int
vfs_busy(struct mount *mp, int flags)
{

	MPASS((flags & ~MBF_MASK) == 0);
	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);

	MNT_ILOCK(mp);
	MNT_REF(mp);
	/*
	 * If the mount point is currently being unmounted, sleep until the
	 * mount point's fate is decided.  If the thread doing the unmount
	 * fails, it will clear the MNTK_UNMOUNT flag before waking us up,
	 * indicating that this mount point has survived the unmount attempt
	 * and vfs_busy should retry.  Otherwise the unmounting thread will
	 * set the MNTK_REFEXPIRE flag in addition to MNTK_UNMOUNT, indicating
	 * that the mount point is about to be really destroyed.  In that
	 * case vfs_busy needs to release its reference on the mount point
	 * and return ENOENT, telling the caller that the mount it tried to
	 * busy is no longer valid.
	 */
	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
			MNT_REL(mp);
			MNT_IUNLOCK(mp);
			CTR1(KTR_VFS, "%s: failed busying before sleeping",
			    __func__);
			return (ENOENT);
		}
		if (flags & MBF_MNTLSTLOCK)
			mtx_unlock(&mountlist_mtx);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
		if (flags & MBF_MNTLSTLOCK)
			mtx_lock(&mountlist_mtx);
		MNT_ILOCK(mp);
	}
	if (flags & MBF_MNTLSTLOCK)
		mtx_unlock(&mountlist_mtx);
	mp->mnt_lockref++;
	MNT_IUNLOCK(mp);
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(struct mount *mp)
{

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
	MNT_ILOCK(mp);
	MNT_REL(mp);
	KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
	mp->mnt_lockref--;
	if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
		mp->mnt_kern_flag &= ~MNTK_DRAINING;
		wakeup(&mp->mnt_lockref);
	}
	MNT_IUNLOCK(mp);
}
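
/*
 * Editorial usage sketch (not part of the original source): the usual
 * way to walk the mount list is to busy each mount with MBF_NOWAIT |
 * MBF_MNTLSTLOCK, mirroring vnlru_proc() below.  On success vfs_busy()
 * drops mountlist_mtx, so it must be retaken before stepping to the
 * next mount:
 *
 *	mtx_lock(&mountlist_mtx);
 *	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 *		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
 *			nmp = TAILQ_NEXT(mp, mnt_list);
 *			continue;
 *		}
 *		(operate on the busied mount point here)
 *		mtx_lock(&mountlist_mtx);
 *		nmp = TAILQ_NEXT(mp, mnt_list);
 *		vfs_unbusy(mp);
 *	}
 *	mtx_unlock(&mountlist_mtx);
 */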

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid_t *fsid)
{
	struct mount *mp;

	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			vfs_ref(mp);
			mtx_unlock(&mountlist_mtx);
			return (mp);
		}
	}
	mtx_unlock(&mountlist_mtx);
	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
	return ((struct mount *) 0);
}

/*
 * Lookup a mount point by filesystem identifier, busying it before
 * returning.
 */
struct mount *
vfs_busyfs(fsid_t *fsid)
{
	struct mount *mp;
	int error;

	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			error = vfs_busy(mp, MBF_MNTLSTLOCK);
			if (error) {
				mtx_unlock(&mountlist_mtx);
				return (NULL);
			}
			return (mp);
		}
	}
	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
	mtx_unlock(&mountlist_mtx);
	return ((struct mount *) 0);
}

/*
 * Check if a user can access privileged mount options.
 */
int
vfs_suser(struct mount *mp, struct thread *td)
{
	int error;

	/*
	 * If the thread is jailed, but this is not a jail-friendly file
	 * system, deny immediately.
	 */
	if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
		return (EPERM);

	/*
	 * If the file system was mounted outside the jail of the calling
	 * thread, deny immediately.
	 */
	if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
		return (EPERM);

	/*
	 * If the file system supports delegated administration, we don't
	 * check for the PRIV_VFS_MOUNT_OWNER privilege - it will be better
	 * verified by the file system itself.
	 * If this is not the user that did the original mount, we check for
	 * the PRIV_VFS_MOUNT_OWNER privilege.
	 */
	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
			return (error);
	}
	return (0);
}

/*
 * Get a new unique fsid.  Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat().  Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators
 * only support 16-bit device numbers.  We end up with unique val[0]'s for
 * the first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
 *
 * Keep in mind that several mounts may be running in parallel.  Starting
 * the search one past where the previous search terminated is both a
 * micro-optimization and a defense against returning the same fsid to
 * different mounts.
 */
void
vfs_getnewfsid(struct mount *mp)
{
	static uint16_t mntid_base;
	struct mount *nmp;
	fsid_t tfsid;
	int mtype;

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
	mtx_lock(&mntid_mtx);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		tfsid.val[0] = makedev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
			break;
		vfs_rel(nmp);
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	mtx_unlock(&mntid_mtx);
}
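
/*
 * Editorial note (not part of the original source): the minor number
 * passed to makedev() above packs the filesystem type and the 16-bit
 * mntid_base counter as
 *
 *	bits 31-24: vfc_typenum & 0xFF
 *	bits 23-16: high byte of mntid_base
 *	bits  7-0:  low byte of mntid_base
 *
 * so, for instance, type 5 with mntid_base 0x1234 yields the minor
 * number 0x05120034 (bits 15-8 stay zero).
 */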

/*
 * Knob to control the precision of file timestamps:
 *
 * 0 = seconds only; nanoseconds zeroed.
 * 1 = seconds and nanoseconds, accurate within 1/HZ.
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_SEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "File timestamp precision (0: seconds, "
    "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
    "3+: sec + ns (max. precision))");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(struct timespec *tsp)
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(struct vattr *vap)
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_birthtime.tv_sec = VNOVAL;
	vap->va_birthtime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}
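
/*
 * Editorial usage sketch (not part of the original source): callers
 * normally clear a vattr and then set only the fields they intend to
 * change, so a filesystem's VOP_SETATTR() can treat VNOVAL as "leave
 * this attribute alone":
 *
 *	struct vattr va;
 *
 *	vattr_null(&va);	(or the VATTR_NULL() macro wrapper)
 *	va.va_size = 0;
 *	error = VOP_SETATTR(vp, &va, td->td_ucred);
 */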

/*
 * This routine is called when we have too many vnodes.  It attempts
 * to free <count> vnodes and will potentially free vnodes that still
 * have VM backing store (VM backing store is typically the cause
 * of a vnode blowout so we want to do this).  Therefore, this operation
 * is not considered cheap.
 *
 * A number of conditions may prevent a vnode from being reclaimed.
 * The buffer cache may have references on the vnode, a directory
 * vnode may still have references due to the namei cache representing
 * underlying files, or the vnode may be in active use.  It is not
 * desirable to reuse such vnodes.  These conditions may cause the
 * number of vnodes to reach some minimum value regardless of what
 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
 */
static int
vlrureclaim(struct mount *mp)
{
	struct vnode *vp;
	int done;
	int trigger;
	int usevnodes;
	int count;

	/*
	 * Calculate the trigger point, don't allow user
	 * screwups to blow us up.  This prevents us from
	 * recycling vnodes with lots of resident pages.  We
	 * aren't trying to free memory, we are trying to
	 * free vnodes.
	 */
	usevnodes = desiredvnodes;
	if (usevnodes <= 0)
		usevnodes = 1;
	trigger = cnt.v_page_count * 2 / usevnodes;
	done = 0;
	vn_start_write(NULL, &mp, V_WAIT);
	MNT_ILOCK(mp);
	count = mp->mnt_nvnodelistsize / 10 + 1;
	while (count != 0) {
		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
		while (vp != NULL && vp->v_type == VMARKER)
			vp = TAILQ_NEXT(vp, v_nmntvnodes);
		if (vp == NULL)
			break;
		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
		--count;
		if (!VI_TRYLOCK(vp))
			goto next_iter;
		/*
		 * If it's been deconstructed already, it's still
		 * referenced, or it exceeds the trigger, skip it.
		 */
		if (vp->v_usecount ||
		    (!vlru_allow_cache_src &&
		    !LIST_EMPTY(&(vp)->v_cache_src)) ||
		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
		    vp->v_object->resident_page_count > trigger)) {
			VI_UNLOCK(vp);
			goto next_iter;
		}
		MNT_IUNLOCK(mp);
		vholdl(vp);
		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
			vdrop(vp);
			goto next_iter_mntunlocked;
		}
		VI_LOCK(vp);
		/*
		 * v_usecount may have been bumped after VOP_LOCK() dropped
		 * the vnode interlock and before it was locked again.
		 *
		 * It is not necessary to recheck VI_DOOMED because it can
		 * only be set by another thread that holds both the vnode
		 * lock and vnode interlock.  If another thread has the
		 * vnode lock before we get to VOP_LOCK() and obtains the
		 * vnode interlock after VOP_LOCK() drops the vnode
		 * interlock, the other thread will be unable to drop the
		 * vnode lock before our VOP_LOCK() call fails.
		 */
		if (vp->v_usecount ||
		    (!vlru_allow_cache_src &&
		    !LIST_EMPTY(&(vp)->v_cache_src)) ||
		    (vp->v_object != NULL &&
		    vp->v_object->resident_page_count > trigger)) {
			VOP_UNLOCK(vp, LK_INTERLOCK);
			goto next_iter_mntunlocked;
		}
		KASSERT((vp->v_iflag & VI_DOOMED) == 0,
		    ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
		vgonel(vp);
		VOP_UNLOCK(vp, 0);
		vdropl(vp);
		done++;
next_iter_mntunlocked:
		if (!should_yield())
			goto relock_mnt;
		goto yield;
next_iter:
		if (!should_yield())
			continue;
		MNT_IUNLOCK(mp);
yield:
		kern_yield(PRI_UNCHANGED);
relock_mnt:
		MNT_ILOCK(mp);
	}
	MNT_IUNLOCK(mp);
	vn_finished_write(mp);
	return done;
}
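
/*
 * Editorial worked example (not part of the original source): with
 * 1048576 physical pages and desiredvnodes of, say, 140000, the trigger
 * above is 1048576 * 2 / 140000 == 14, so vnodes caching more than 14
 * resident pages are skipped and one pass of vlrureclaim() cannot throw
 * away large page caches.
 */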

/*
 * Attempt to keep the free list at wantfreevnodes length.
 */
static void
vnlru_free(int count)
{
	struct vnode *vp;
	int vfslocked;

	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
	for (; count > 0; count--) {
		vp = TAILQ_FIRST(&vnode_free_list);
		/*
		 * The list can be modified while the free_list_mtx
		 * has been dropped and vp could be NULL here.
		 */
		if (!vp)
			break;
		VNASSERT(vp->v_op != NULL, vp,
		    ("vnlru_free: vnode already reclaimed."));
		KASSERT((vp->v_iflag & VI_FREE) != 0,
		    ("Removing vnode not on freelist"));
		KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
		    ("Mangling active vnode"));
		TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
		/*
		 * Don't recycle if we can't get the interlock.
		 */
		if (!VI_TRYLOCK(vp)) {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
			continue;
		}
		VNASSERT(VCANRECYCLE(vp), vp,
		    ("vp inconsistent on freelist"));
		freevnodes--;
		vp->v_iflag &= ~VI_FREE;
		vholdl(vp);
		mtx_unlock(&vnode_free_list_mtx);
		VI_UNLOCK(vp);
		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
		vtryrecycle(vp);
		VFS_UNLOCK_GIANT(vfslocked);
		/*
		 * If the recycle succeeded, this vdrop will actually free
		 * the vnode.  If not it will simply place it back on
		 * the free list.
		 */
		vdrop(vp);
		mtx_lock(&vnode_free_list_mtx);
	}
}
/*
 * Attempt to recycle vnodes in a context that is always safe to block.
 * Calling vlrureclaim() from the bowels of filesystem code has some
 * interesting deadlock problems.
 */
static struct proc *vnlruproc;
static int vnlruproc_sig;

static void
vnlru_proc(void)
{
	struct mount *mp, *nmp;
	int done, vfslocked;
	struct proc *p = vnlruproc;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
	    SHUTDOWN_PRI_FIRST);

	for (;;) {
		kproc_suspend_check(p);
		mtx_lock(&vnode_free_list_mtx);
		if (freevnodes > wantfreevnodes)
			vnlru_free(freevnodes - wantfreevnodes);
		if (numvnodes <= desiredvnodes * 9 / 10) {
			vnlruproc_sig = 0;
			wakeup(&vnlruproc_sig);
			msleep(vnlruproc, &vnode_free_list_mtx,
			    PVFS|PDROP, "vlruwt", hz);
			continue;
		}
		mtx_unlock(&vnode_free_list_mtx);
		done = 0;
		mtx_lock(&mountlist_mtx);
		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
				nmp = TAILQ_NEXT(mp, mnt_list);
				continue;
			}
			vfslocked = VFS_LOCK_GIANT(mp);
			done += vlrureclaim(mp);
			VFS_UNLOCK_GIANT(vfslocked);
			mtx_lock(&mountlist_mtx);
			nmp = TAILQ_NEXT(mp, mnt_list);
			vfs_unbusy(mp);
		}
		mtx_unlock(&mountlist_mtx);
		if (done == 0) {
#if 0
			/* These messages are temporary debugging aids */
			if (vnlru_nowhere < 5)
				printf("vnlru process getting nowhere..\n");
			else if (vnlru_nowhere == 5)
				printf("vnlru process messages stopped.\n");
#endif
			vnlru_nowhere++;
			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
		} else
			kern_yield(PRI_UNCHANGED);
	}
}

static struct kproc_desc vnlru_kp = {
	"vnlru",
	vnlru_proc,
	&vnlruproc
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
    &vnlru_kp);

/*
 * Routines having to do with the management of the vnode table.
 */

/*
 * Try to recycle a freed vnode.  We abort if anyone picks up a reference
 * before we actually vgone().  This function must be called with the vnode
 * held to prevent the vnode from being returned to the free list midway
 * through vgone().
 */
static int
vtryrecycle(struct vnode *vp)
{
	struct mount *vnmp;

	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
	VNASSERT(vp->v_holdcnt, vp,
	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
	/*
	 * This vnode may be found and locked via some other list, if so we
	 * can't recycle it yet.
	 */
	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, vp %p lock is already held",
		    __func__, vp);
		return (EWOULDBLOCK);
	}
	/*
	 * Don't recycle if its filesystem is being suspended.
	 */
	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
		VOP_UNLOCK(vp, 0);
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, cannot start the write for %p",
		    __func__, vp);
		return (EBUSY);
	}
	/*
	 * If we got this far, we need to acquire the interlock and see if
	 * anyone picked up this vnode from another list.  If not, we will
	 * mark it with DOOMED via vgonel() so that anyone who does find it
	 * will skip over it.
	 */
	VI_LOCK(vp);
	if (vp->v_usecount) {
		VOP_UNLOCK(vp, LK_INTERLOCK);
		vn_finished_write(vnmp);
		CTR2(KTR_VFS,
		    "%s: impossible to recycle, %p is already referenced",
		    __func__, vp);
		return (EBUSY);
	}
	if ((vp->v_iflag & VI_DOOMED) == 0)
		vgonel(vp);
	VOP_UNLOCK(vp, LK_INTERLOCK);
	vn_finished_write(vnmp);
	return (0);
}

/*
 * Wait for available vnodes.
 */
static int
getnewvnode_wait(int suspended)
{

	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
	if (numvnodes > desiredvnodes) {
		if (suspended) {
			/*
			 * The file system is being suspended; we cannot
			 * risk a deadlock here, so allocate a new vnode
			 * anyway.
			 */
			if (freevnodes > wantfreevnodes)
				vnlru_free(freevnodes - wantfreevnodes);
			return (0);
		}
		if (vnlruproc_sig == 0) {
			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
			wakeup(vnlruproc);
		}
		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
		    "vlruwk", hz);
	}
	return (numvnodes > desiredvnodes ? ENFILE : 0);
}

void
getnewvnode_reserve(u_int count)
{
	struct thread *td;

	td = curthread;
	mtx_lock(&vnode_free_list_mtx);
	while (count > 0) {
		if (getnewvnode_wait(0) == 0) {
			count--;
			td->td_vp_reserv++;
			numvnodes++;
		}
	}
	mtx_unlock(&vnode_free_list_mtx);
}

void
getnewvnode_drop_reserve(void)
{
	struct thread *td;

	td = curthread;
	mtx_lock(&vnode_free_list_mtx);
	KASSERT(numvnodes >= td->td_vp_reserv, ("reserve too large"));
	numvnodes -= td->td_vp_reserv;
	mtx_unlock(&vnode_free_list_mtx);
	td->td_vp_reserv = 0;
}
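
/*
 * Editorial usage sketch (not part of the original source): a caller
 * that must not sleep in getnewvnode() while holding filesystem-internal
 * locks can pre-reserve vnodes and later return whatever it did not
 * consume; "myfs" and myfs_vnodeops are hypothetical names:
 *
 *	getnewvnode_reserve(1);
 *	(acquire filesystem-internal locks)
 *	error = getnewvnode("myfs", mp, &myfs_vnodeops, &vp);
 *	(release filesystem-internal locks)
 *	getnewvnode_drop_reserve();
 */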

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
    struct vnode **vpp)
{
	struct vnode *vp;
	struct bufobj *bo;
	struct thread *td;
	int error;

	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
	vp = NULL;
	td = curthread;
	if (td->td_vp_reserv > 0) {
		td->td_vp_reserv -= 1;
		goto alloc;
	}
	mtx_lock(&vnode_free_list_mtx);
	/*
	 * Lend our context to reclaim vnodes if they've exceeded the max.
	 */
	if (freevnodes > wantfreevnodes)
		vnlru_free(1);
	error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
	    MNTK_SUSPEND));
#if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
	if (error != 0) {
		mtx_unlock(&vnode_free_list_mtx);
		return (error);
	}
#endif
	numvnodes++;
	mtx_unlock(&vnode_free_list_mtx);
alloc:
	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
	/*
	 * Setup locks.
	 */
	vp->v_vnlock = &vp->v_lock;
	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
	/*
	 * By default, don't allow shared locks unless filesystems
	 * opt-in.
	 */
	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE);
	/*
	 * Initialize bufobj.
	 */
	bo = &vp->v_bufobj;
	bo->__bo_vnode = vp;
	mtx_init(BO_MTX(bo), "bufobj interlock", NULL, MTX_DEF);
	bo->bo_ops = &buf_ops_bio;
	bo->bo_private = vp;
	TAILQ_INIT(&bo->bo_clean.bv_hd);
	TAILQ_INIT(&bo->bo_dirty.bv_hd);
	/*
	 * Initialize namecache.
	 */
	LIST_INIT(&vp->v_cache_src);
	TAILQ_INIT(&vp->v_cache_dst);
	/*
	 * Finalize various vnode identity bits.
	 */
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	v_incr_usecount(vp);
	vp->v_data = 0;
#ifdef MAC
	mac_vnode_init(vp);
	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
		mac_vnode_associate_singlelabel(mp, vp);
	else if (mp == NULL && vops != &dead_vnodeops)
		printf("NULL mp in getnewvnode()\n");
#endif
	if (mp != NULL) {
		bo->bo_bsize = mp->mnt_stat.f_iosize;
		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
			vp->v_vflag |= VV_NOKNOTE;
	}
	rangelock_init(&vp->v_rl);

	/*
	 * For the filesystems which do not use vfs_hash_insert(),
	 * still initialize v_hash to have vfs_hash_index() useful.
	 * E.g., nullfs uses vfs_hash_index() on the lower vnode for
	 * its own hashing.
	 */
	vp->v_hash = (uintptr_t)vp >> vnsz2log;

	*vpp = vp;
	return (0);
}

/*
 * Delete from old mount point vnode list, if on one.
 */
static void
delmntque(struct vnode *vp)
{
	struct mount *mp;
	int active;

	mp = vp->v_mount;
	if (mp == NULL)
		return;
	MNT_ILOCK(mp);
	VI_LOCK(vp);
	KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
	    ("Active vnode list size %d > Vnode list size %d",
	     mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
	active = vp->v_iflag & VI_ACTIVE;
	vp->v_iflag &= ~VI_ACTIVE;
	if (active) {
		mtx_lock(&vnode_free_list_mtx);
		TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
		mp->mnt_activevnodelistsize--;
		mtx_unlock(&vnode_free_list_mtx);
	}
	vp->v_mount = NULL;
	VI_UNLOCK(vp);
	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
	    ("bad mount point vnode list size"));
	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	mp->mnt_nvnodelistsize--;
	MNT_REL(mp);
	MNT_IUNLOCK(mp);
}

static void
insmntque_stddtr(struct vnode *vp, void *dtr_arg)
{

	vp->v_data = NULL;
	vp->v_op = &dead_vnodeops;
	/* XXX non mp-safe fs may still call insmntque with vnode unlocked */
	if (!VOP_ISLOCKED(vp))
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	vgone(vp);
	vput(vp);
}

/*
 * Insert into list of vnodes for the new mount point, if available.
 */
int
insmntque1(struct vnode *vp, struct mount *mp,
    void (*dtr)(struct vnode *, void *), void *dtr_arg)
{
	int locked;

	KASSERT(vp->v_mount == NULL,
	    ("insmntque: vnode already on per mount vnode list"));
	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
#ifdef DEBUG_VFS_LOCKS
	if (!VFS_NEEDSGIANT(mp))
		ASSERT_VOP_ELOCKED(vp,
		    "insmntque: mp-safe fs and non-locked vp");
#endif
	/*
	 * We acquire the vnode interlock early to ensure that the
	 * vnode cannot be recycled by another process releasing a
	 * holdcnt on it before we get it on both the vnode list
	 * and the active vnode list.  The mount mutex protects only
	 * manipulation of the vnode list and the vnode freelist
	 * mutex protects only manipulation of the active vnode list.
	 * Hence the need to hold the vnode interlock throughout.
	 */
	MNT_ILOCK(mp);
	VI_LOCK(vp);
	if ((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
	    ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
	     mp->mnt_nvnodelistsize == 0)) {
		locked = VOP_ISLOCKED(vp);
		if (!locked || (locked == LK_EXCLUSIVE &&
		    (vp->v_vflag & VV_FORCEINSMQ) == 0)) {
			VI_UNLOCK(vp);
			MNT_IUNLOCK(mp);
			if (dtr != NULL)
				dtr(vp, dtr_arg);
			return (EBUSY);
		}
	}
	vp->v_mount = mp;
	MNT_REF(mp);
	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
	    ("neg mount point vnode list size"));
	mp->mnt_nvnodelistsize++;
	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
	    ("Activating already active vnode"));
	vp->v_iflag |= VI_ACTIVE;
	mtx_lock(&vnode_free_list_mtx);
	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
	mp->mnt_activevnodelistsize++;
	mtx_unlock(&vnode_free_list_mtx);
	VI_UNLOCK(vp);
	MNT_IUNLOCK(mp);
	return (0);
}

int
insmntque(struct vnode *vp, struct mount *mp)
{

	return (insmntque1(vp, mp, insmntque_stddtr, NULL));
}
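
/*
 * Editorial usage sketch (not part of the original source): the
 * canonical allocation sequence in a filesystem's VFS_VGET() pairs
 * getnewvnode() with insmntque(); on failure the standard destructor
 * has already disposed of the half-constructed vnode.  "myfs" and
 * myfs_vnodeops are hypothetical names:
 *
 *	error = getnewvnode("myfs", mp, &myfs_vnodeops, &vp);
 *	if (error != 0)
 *		return (error);
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	error = insmntque(vp, mp);
 *	if (error != 0)
 *		return (error);
 */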

/*
 * Flush out and invalidate all buffers associated with a bufobj.
 * Called with the underlying object locked.
 */
int
bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
{
	int error;

	BO_LOCK(bo);
	if (flags & V_SAVE) {
		error = bufobj_wwait(bo, slpflag, slptimeo);
		if (error) {
			BO_UNLOCK(bo);
			return (error);
		}
		if (bo->bo_dirty.bv_cnt > 0) {
			BO_UNLOCK(bo);
			if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
				return (error);
			/*
			 * XXX We could save a lock/unlock if this was only
			 * enabled under INVARIANTS
			 */
			BO_LOCK(bo);
			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
				panic("vinvalbuf: dirty bufs");
		}
	}
	/*
	 * If you alter this loop please notice that interlock is dropped and
	 * reacquired in flushbuflist.  Special care is needed to ensure that
	 * no race conditions occur from this.
	 */
	do {
		error = flushbuflist(&bo->bo_clean,
		    flags, bo, slpflag, slptimeo);
		if (error == 0 && !(flags & V_CLEANONLY))
			error = flushbuflist(&bo->bo_dirty,
			    flags, bo, slpflag, slptimeo);
		if (error != 0 && error != EAGAIN) {
			BO_UNLOCK(bo);
			return (error);
		}
	} while (error != 0);

	/*
	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
	 * have write I/O in-progress but if there is a VM object then the
	 * VM object can also have read-I/O in-progress.
	 */
	do {
		bufobj_wwait(bo, 0, 0);
		BO_UNLOCK(bo);
		if (bo->bo_object != NULL) {
			VM_OBJECT_LOCK(bo->bo_object);
			vm_object_pip_wait(bo->bo_object, "bovlbx");
			VM_OBJECT_UNLOCK(bo->bo_object);
		}
		BO_LOCK(bo);
	} while (bo->bo_numoutput > 0);
	BO_UNLOCK(bo);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	if (bo->bo_object != NULL &&
	    (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) {
		VM_OBJECT_LOCK(bo->bo_object);
		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
		    OBJPR_CLEANONLY : 0);
		VM_OBJECT_UNLOCK(bo->bo_object);
	}

#ifdef INVARIANTS
	BO_LOCK(bo);
	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 &&
	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
		panic("vinvalbuf: flush failed");
	BO_UNLOCK(bo);
#endif
	return (0);
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
{

	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
	if (vp->v_object != NULL && vp->v_object->handle != vp)
		return (0);
	return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
}
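
/*
 * Editorial note (not part of the original source): V_SAVE asks for
 * dirty data to be written out before invalidation, which is what
 * reclamation paths such as vgonel() want:
 *
 *	error = vinvalbuf(vp, V_SAVE, 0, 0);
 *
 * whereas passing 0 for flags discards the buffers outright.
 */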
1408 */ 1409 trunclbn = (length + blksize - 1) / blksize; 1410 1411 ASSERT_VOP_LOCKED(vp, "vtruncbuf"); 1412restart: 1413 bo = &vp->v_bufobj; 1414 BO_LOCK(bo); 1415 anyfreed = 1; 1416 for (;anyfreed;) { 1417 anyfreed = 0; 1418 TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { 1419 if (bp->b_lblkno < trunclbn) 1420 continue; 1421 if (BUF_LOCK(bp, 1422 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 1423 BO_MTX(bo)) == ENOLCK) 1424 goto restart; 1425 1426 BO_LOCK(bo); 1427 bremfree(bp); 1428 BO_UNLOCK(bo); 1429 bp->b_flags |= (B_INVAL | B_RELBUF); 1430 bp->b_flags &= ~B_ASYNC; 1431 brelse(bp); 1432 anyfreed = 1; 1433 1434 BO_LOCK(bo); 1435 if (nbp != NULL && 1436 (((nbp->b_xflags & BX_VNCLEAN) == 0) || 1437 (nbp->b_vp != vp) || 1438 (nbp->b_flags & B_DELWRI))) { 1439 BO_UNLOCK(bo); 1440 goto restart; 1441 } 1442 } 1443 1444 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 1445 if (bp->b_lblkno < trunclbn) 1446 continue; 1447 if (BUF_LOCK(bp, 1448 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 1449 BO_MTX(bo)) == ENOLCK) 1450 goto restart; 1451 BO_LOCK(bo); 1452 bremfree(bp); 1453 BO_UNLOCK(bo); 1454 bp->b_flags |= (B_INVAL | B_RELBUF); 1455 bp->b_flags &= ~B_ASYNC; 1456 brelse(bp); 1457 anyfreed = 1; 1458 1459 BO_LOCK(bo); 1460 if (nbp != NULL && 1461 (((nbp->b_xflags & BX_VNDIRTY) == 0) || 1462 (nbp->b_vp != vp) || 1463 (nbp->b_flags & B_DELWRI) == 0)) { 1464 BO_UNLOCK(bo); 1465 goto restart; 1466 } 1467 } 1468 } 1469 1470 if (length > 0) { 1471restartsync: 1472 TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { 1473 if (bp->b_lblkno > 0) 1474 continue; 1475 /* 1476 * Since we hold the vnode lock this should only 1477 * fail if we're racing with the buf daemon. 1478 */ 1479 if (BUF_LOCK(bp, 1480 LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, 1481 BO_MTX(bo)) == ENOLCK) { 1482 goto restart; 1483 } 1484 VNASSERT((bp->b_flags & B_DELWRI), vp, 1485 ("buf(%p) on dirty queue without DELWRI", bp)); 1486 1487 BO_LOCK(bo); 1488 bremfree(bp); 1489 BO_UNLOCK(bo); 1490 bawrite(bp); 1491 BO_LOCK(bo); 1492 goto restartsync; 1493 } 1494 } 1495 1496 bufobj_wwait(bo, 0, 0); 1497 BO_UNLOCK(bo); 1498 vnode_pager_setsize(vp, length); 1499 1500 return (0); 1501} 1502 1503/* 1504 * buf_splay() - splay tree core for the clean/dirty list of buffers in 1505 * a vnode. 1506 * 1507 * NOTE: We have to deal with the special case of a background bitmap 1508 * buffer, a situation where two buffers will have the same logical 1509 * block offset. We want (1) only the foreground buffer to be accessed 1510 * in a lookup and (2) must differentiate between the foreground and 1511 * background buffer in the splay tree algorithm because the splay 1512 * tree cannot normally handle multiple entities with the same 'index'. 1513 * We accomplish this by adding differentiating flags to the splay tree's 1514 * numerical domain. 1515 */ 1516static 1517struct buf * 1518buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root) 1519{ 1520 struct buf dummy; 1521 struct buf *lefttreemax, *righttreemin, *y; 1522 1523 if (root == NULL) 1524 return (NULL); 1525 lefttreemax = righttreemin = &dummy; 1526 for (;;) { 1527 if (lblkno < root->b_lblkno || 1528 (lblkno == root->b_lblkno && 1529 (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) { 1530 if ((y = root->b_left) == NULL) 1531 break; 1532 if (lblkno < y->b_lblkno) { 1533 /* Rotate right. 

/*
 * buf_splay() - splay tree core for the clean/dirty list of buffers in
 *		 a vnode.
 *
 * NOTE: We have to deal with the special case of a background bitmap
 * buffer, a situation where two buffers will have the same logical
 * block offset.  We want (1) only the foreground buffer to be accessed
 * in a lookup and (2) must differentiate between the foreground and
 * background buffer in the splay tree algorithm because the splay
 * tree cannot normally handle multiple entities with the same 'index'.
 * We accomplish this by adding differentiating flags to the splay tree's
 * numerical domain.
 */
static
struct buf *
buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
{
	struct buf dummy;
	struct buf *lefttreemax, *righttreemin, *y;

	if (root == NULL)
		return (NULL);
	lefttreemax = righttreemin = &dummy;
	for (;;) {
		if (lblkno < root->b_lblkno ||
		    (lblkno == root->b_lblkno &&
		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
			if ((y = root->b_left) == NULL)
				break;
			if (lblkno < y->b_lblkno) {
				/* Rotate right. */
				root->b_left = y->b_right;
				y->b_right = root;
				root = y;
				if ((y = root->b_left) == NULL)
					break;
			}
			/* Link into the new root's right tree. */
			righttreemin->b_left = root;
			righttreemin = root;
		} else if (lblkno > root->b_lblkno ||
		    (lblkno == root->b_lblkno &&
		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
			if ((y = root->b_right) == NULL)
				break;
			if (lblkno > y->b_lblkno) {
				/* Rotate left. */
				root->b_right = y->b_left;
				y->b_left = root;
				root = y;
				if ((y = root->b_right) == NULL)
					break;
			}
			/* Link into the new root's left tree. */
			lefttreemax->b_right = root;
			lefttreemax = root;
		} else {
			break;
		}
		root = y;
	}
	/* Assemble the new root. */
	lefttreemax->b_right = root->b_left;
	righttreemin->b_left = root->b_right;
	root->b_left = dummy.b_right;
	root->b_right = dummy.b_left;
	return (root);
}

static void
buf_vlist_remove(struct buf *bp)
{
	struct buf *root;
	struct bufv *bv;

	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
	ASSERT_BO_LOCKED(bp->b_bufobj);
	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
	    (BX_VNDIRTY|BX_VNCLEAN),
	    ("buf_vlist_remove: Buf %p is on two lists", bp));
	if (bp->b_xflags & BX_VNDIRTY)
		bv = &bp->b_bufobj->bo_dirty;
	else
		bv = &bp->b_bufobj->bo_clean;
	if (bp != bv->bv_root) {
		root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
		KASSERT(root == bp, ("splay lookup failed in remove"));
	}
	if (bp->b_left == NULL) {
		root = bp->b_right;
	} else {
		root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
		root->b_right = bp->b_right;
	}
	bv->bv_root = root;
	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
	bv->bv_cnt--;
	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
}

/*
 * Add the buffer to the sorted clean or dirty block list using a
 * splay tree algorithm.
 *
 * NOTE: xflags is passed as a constant, optimizing this inline function!
 */
static void
buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
{
	struct buf *root;
	struct bufv *bv;

	ASSERT_BO_LOCKED(bo);
	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
	bp->b_xflags |= xflags;
	if (xflags & BX_VNDIRTY)
		bv = &bo->bo_dirty;
	else
		bv = &bo->bo_clean;

	root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
	if (root == NULL) {
		bp->b_left = NULL;
		bp->b_right = NULL;
		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
	} else if (bp->b_lblkno < root->b_lblkno ||
	    (bp->b_lblkno == root->b_lblkno &&
	    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
		bp->b_left = root->b_left;
		bp->b_right = root;
		root->b_left = NULL;
		TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
	} else {
		bp->b_right = root->b_right;
		bp->b_left = root;
		root->b_right = NULL;
		TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs);
	}
	bv->bv_cnt++;
	bv->bv_root = bp;
}

/*
 * Lookup a buffer using the splay tree.  Note that we specifically avoid
 * shadow buffers used in background bitmap writes.
 *
 * This code isn't quite as efficient as it could be because we are
 * maintaining two sorted lists and do not know which list the block
 * resides in.
 *
 * During a "make buildworld" the desired buffer is found at one of
 * the roots more than 60% of the time.  Thus, checking both roots
 * before performing either splay eliminates unnecessary splays on the
 * first tree splayed.
 */
struct buf *
gbincore(struct bufobj *bo, daddr_t lblkno)
{
	struct buf *bp;

	ASSERT_BO_LOCKED(bo);
	if ((bp = bo->bo_clean.bv_root) != NULL &&
	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
		return (bp);
	if ((bp = bo->bo_dirty.bv_root) != NULL &&
	    bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
		return (bp);
	if ((bp = bo->bo_clean.bv_root) != NULL) {
		bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp);
		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
			return (bp);
	}
	if ((bp = bo->bo_dirty.bv_root) != NULL) {
		bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp);
		if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
			return (bp);
	}
	return (NULL);
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(struct vnode *vp, struct buf *bp)
{
	struct bufobj *bo;

	bo = &vp->v_bufobj;
	ASSERT_BO_LOCKED(bo);
	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));

	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
	    ("bgetvp: bp already attached! %p", bp));

	vhold(vp);
	if (VFS_NEEDSGIANT(vp->v_mount) || bo->bo_flag & BO_NEEDSGIANT)
		bp->b_flags |= B_NEEDSGIANT;
	bp->b_vp = vp;
	bp->b_bufobj = bo;
	/*
	 * Insert onto list for new vnode.
	 */
	buf_vlist_add(bp, bo, BX_VNCLEAN);
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(struct buf *bp)
{
	struct bufobj *bo;
	struct vnode *vp;

	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;		/* XXX */
	bo = bp->b_bufobj;
	BO_LOCK(bo);
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
		buf_vlist_remove(bp);
	else
		panic("brelvp: Buffer %p not on queue.", bp);
	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
		bo->bo_flag &= ~BO_ONWORKLST;
		mtx_lock(&sync_mtx);
		LIST_REMOVE(bo, bo_synclist);
		syncer_worklist_len--;
		mtx_unlock(&sync_mtx);
	}
	bp->b_flags &= ~B_NEEDSGIANT;
	bp->b_vp = NULL;
	bp->b_bufobj = NULL;
	BO_UNLOCK(bo);
	vdrop(vp);
}

/*
 * Add an item to the syncer work queue.
 */
static void
vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
{
	int queue, slot;

	ASSERT_BO_LOCKED(bo);

	mtx_lock(&sync_mtx);
	if (bo->bo_flag & BO_ONWORKLST)
		LIST_REMOVE(bo, bo_synclist);
	else {
		bo->bo_flag |= BO_ONWORKLST;
		syncer_worklist_len++;
	}

	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	slot = (syncer_delayno + delay) & syncer_mask;

	queue = VFS_NEEDSGIANT(bo->__bo_vnode->v_mount) ? WI_GIANTQ :
	    WI_MPSAFEQ;
	LIST_INSERT_HEAD(&syncer_workitem_pending[queue][slot], bo,
	    bo_synclist);
	mtx_unlock(&sync_mtx);
}
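
/*
 * Editorial note (not part of the original source): bgetvp() and
 * brelvp() are normally reached through the buffer cache rather than
 * called by filesystems directly; getblk() associates a freshly
 * allocated buffer with its vnode, and the association is dropped when
 * the buffer is recycled.  The worklist maintained by
 * vn_syncer_add_to_worklist() is drained by sched_sync() below.
 */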
static int
sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
{
	int error, len;

	mtx_lock(&sync_mtx);
	len = syncer_worklist_len - sync_vnode_count;
	mtx_unlock(&sync_mtx);
	error = SYSCTL_OUT(req, &len, sizeof(len));
	return (error);
}

SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
    sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");

static struct proc *updateproc;
static void sched_sync(void);
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);

static int
sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
{
	struct vnode *vp;
	struct mount *mp;

	*bo = LIST_FIRST(slp);
	if (*bo == NULL)
		return (0);
	vp = (*bo)->__bo_vnode;	/* XXX */
	if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
		return (1);
	/*
	 * We use vhold in case the vnode does not
	 * successfully sync.  vhold prevents the vnode from
	 * going away when we unlock the sync_mtx so that
	 * we can acquire the vnode interlock.
	 */
	vholdl(vp);
	mtx_unlock(&sync_mtx);
	VI_UNLOCK(vp);
	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
		vdrop(vp);
		mtx_lock(&sync_mtx);
		return (*bo == LIST_FIRST(slp));
	}
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	(void) VOP_FSYNC(vp, MNT_LAZY, td);
	VOP_UNLOCK(vp, 0);
	vn_finished_write(mp);
	BO_LOCK(*bo);
	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
		/*
		 * Put us back on the worklist.  The worklist
		 * routine will remove us from our current
		 * position and then add us back in at a later
		 * position.
		 */
		vn_syncer_add_to_worklist(*bo, syncdelay);
	}
	BO_UNLOCK(*bo);
	vdrop(vp);
	mtx_lock(&sync_mtx);
	return (0);
}
1896 */ 1897 do { 1898 slp = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno]; 1899 gslp = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno]; 1900 syncer_delayno += 1; 1901 if (syncer_delayno == syncer_maxdelay) 1902 syncer_delayno = 0; 1903 next = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno]; 1904 gnext = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno]; 1905 /* 1906 * If the worklist has wrapped since the 1907 * it was emptied of all but syncer vnodes, 1908 * switch to the FINAL_DELAY state and run 1909 * for one more second. 1910 */ 1911 if (syncer_state == SYNCER_SHUTTING_DOWN && 1912 net_worklist_len == 0 && 1913 last_work_seen == syncer_delayno) { 1914 syncer_state = SYNCER_FINAL_DELAY; 1915 syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; 1916 } 1917 } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && 1918 LIST_EMPTY(gslp) && syncer_worklist_len > 0); 1919 1920 /* 1921 * Keep track of the last time there was anything 1922 * on the worklist other than syncer vnodes. 1923 * Return to the SHUTTING_DOWN state if any 1924 * new work appears. 1925 */ 1926 if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) 1927 last_work_seen = syncer_delayno; 1928 if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) 1929 syncer_state = SYNCER_SHUTTING_DOWN; 1930 while (!LIST_EMPTY(slp)) { 1931 error = sync_vnode(slp, &bo, td); 1932 if (error == 1) { 1933 LIST_REMOVE(bo, bo_synclist); 1934 LIST_INSERT_HEAD(next, bo, bo_synclist); 1935 continue; 1936 } 1937 1938 if (first_printf == 0) 1939 wdog_kern_pat(WD_LASTVAL); 1940 1941 } 1942 if (!LIST_EMPTY(gslp)) { 1943 mtx_unlock(&sync_mtx); 1944 mtx_lock(&Giant); 1945 mtx_lock(&sync_mtx); 1946 while (!LIST_EMPTY(gslp)) { 1947 error = sync_vnode(gslp, &bo, td); 1948 if (error == 1) { 1949 LIST_REMOVE(bo, bo_synclist); 1950 LIST_INSERT_HEAD(gnext, bo, 1951 bo_synclist); 1952 continue; 1953 } 1954 } 1955 mtx_unlock(&Giant); 1956 } 1957 if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) 1958 syncer_final_iter--; 1959 /* 1960 * The variable rushjob allows the kernel to speed up the 1961 * processing of the filesystem syncer process. A rushjob 1962 * value of N tells the filesystem syncer to process the next 1963 * N seconds worth of work on its queue ASAP. Currently rushjob 1964 * is used by the soft update code to speed up the filesystem 1965 * syncer process when the incore state is getting so far 1966 * ahead of the disk that the kernel memory pool is being 1967 * threatened with exhaustion. 1968 */ 1969 if (rushjob > 0) { 1970 rushjob -= 1; 1971 continue; 1972 } 1973 /* 1974 * Just sleep for a short period of time between 1975 * iterations when shutting down to allow some I/O 1976 * to happen. 1977 * 1978 * If it has taken us less than a second to process the 1979 * current work, then wait. Otherwise start right over 1980 * again. We can still lose time if any single round 1981 * takes more than two seconds, but it does not really 1982 * matter as we are just trying to generally pace the 1983 * filesystem activity. 1984 */ 1985 if (syncer_state != SYNCER_RUNNING || 1986 time_uptime == starttime) { 1987 thread_lock(td); 1988 sched_prio(td, PPAUSE); 1989 thread_unlock(td); 1990 } 1991 if (syncer_state != SYNCER_RUNNING) 1992 cv_timedwait(&sync_wakeup, &sync_mtx, 1993 hz / SYNCER_SHUTDOWN_SPEEDUP); 1994 else if (time_uptime == starttime) 1995 cv_timedwait(&sync_wakeup, &sync_mtx, hz); 1996 } 1997} 1998 1999/* 2000 * Request the syncer daemon to speed up its work. 
* We never push it to speed up more than half of its 2002 * normal turn time, otherwise it could take over the cpu. 2003 */ 2004int 2005speedup_syncer(void) 2006{ 2007 int ret = 0; 2008 2009 mtx_lock(&sync_mtx); 2010 if (rushjob < syncdelay / 2) { 2011 rushjob += 1; 2012 stat_rush_requests += 1; 2013 ret = 1; 2014 } 2015 mtx_unlock(&sync_mtx); 2016 cv_broadcast(&sync_wakeup); 2017 return (ret); 2018} 2019 2020/* 2021 * Tell the syncer to speed up its work and run through its work 2022 * list several times, then tell it to shut down. 2023 */ 2024static void 2025syncer_shutdown(void *arg, int howto) 2026{ 2027 2028 if (howto & RB_NOSYNC) 2029 return; 2030 mtx_lock(&sync_mtx); 2031 syncer_state = SYNCER_SHUTTING_DOWN; 2032 rushjob = 0; 2033 mtx_unlock(&sync_mtx); 2034 cv_broadcast(&sync_wakeup); 2035 kproc_shutdown(arg, howto); 2036} 2037 2038/* 2039 * Reassign a buffer from one vnode to another. 2040 * Used to assign file specific control information 2041 * (indirect blocks) to the vnode to which they belong. 2042 */ 2043void 2044reassignbuf(struct buf *bp) 2045{ 2046 struct vnode *vp; 2047 struct bufobj *bo; 2048 int delay; 2049#ifdef INVARIANTS 2050 struct bufv *bv; 2051#endif 2052 2053 vp = bp->b_vp; 2054 bo = bp->b_bufobj; 2055 ++reassignbufcalls; 2056 2057 CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", 2058 bp, bp->b_vp, bp->b_flags); 2059 /* 2060 * B_PAGING flagged buffers cannot be reassigned because their vp 2061 * is not fully linked in. 2062 */ 2063 if (bp->b_flags & B_PAGING) 2064 panic("cannot reassign paging buffer"); 2065 2066 /* 2067 * Delete from old vnode list, if on one. 2068 */ 2069 BO_LOCK(bo); 2070 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) 2071 buf_vlist_remove(bp); 2072 else 2073 panic("reassignbuf: Buffer %p not on queue.", bp); 2074 /* 2075 * If dirty, put on list of dirty buffers; otherwise insert onto list 2076 * of clean buffers. 2077 */ 2078 if (bp->b_flags & B_DELWRI) { 2079 if ((bo->bo_flag & BO_ONWORKLST) == 0) { 2080 switch (vp->v_type) { 2081 case VDIR: 2082 delay = dirdelay; 2083 break; 2084 case VCHR: 2085 delay = metadelay; 2086 break; 2087 default: 2088 delay = filedelay; 2089 } 2090 vn_syncer_add_to_worklist(bo, delay); 2091 } 2092 buf_vlist_add(bp, bo, BX_VNDIRTY); 2093 } else { 2094 buf_vlist_add(bp, bo, BX_VNCLEAN); 2095 2096 if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { 2097 mtx_lock(&sync_mtx); 2098 LIST_REMOVE(bo, bo_synclist); 2099 syncer_worklist_len--; 2100 mtx_unlock(&sync_mtx); 2101 bo->bo_flag &= ~BO_ONWORKLST; 2102 } 2103 } 2104#ifdef INVARIANTS 2105 bv = &bo->bo_clean; 2106 bp = TAILQ_FIRST(&bv->bv_hd); 2107 KASSERT(bp == NULL || bp->b_bufobj == bo, 2108 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2109 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2110 KASSERT(bp == NULL || bp->b_bufobj == bo, 2111 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2112 bv = &bo->bo_dirty; 2113 bp = TAILQ_FIRST(&bv->bv_hd); 2114 KASSERT(bp == NULL || bp->b_bufobj == bo, 2115 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2116 bp = TAILQ_LAST(&bv->bv_hd, buflists); 2117 KASSERT(bp == NULL || bp->b_bufobj == bo, 2118 ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); 2119#endif 2120 BO_UNLOCK(bo); 2121} 2122 2123/* 2124 * Increment the use and hold counts on the vnode, taking care to reference 2125 * the driver's usecount if this is a chardev. The vholdl() will remove 2126 * the vnode from the free list if it is presently free.
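 *
 * As a rule of thumb (a sketch, not an exhaustive list): callers that
 * only need the vnode memory to stay valid pair vhold()/vdrop(),
 * while callers that need a usable, non-recyclable vnode take a full
 * reference with vref() or vget() and release it with vrele() or
 * vput().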
Requires the 2127 * vnode interlock and returns with it held. 2128 */ 2129static void 2130v_incr_usecount(struct vnode *vp) 2131{ 2132 2133 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2134 vp->v_usecount++; 2135 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2136 dev_lock(); 2137 vp->v_rdev->si_usecount++; 2138 dev_unlock(); 2139 } 2140 vholdl(vp); 2141} 2142 2143/* 2144 * Turn a holdcnt into a use+holdcnt such that only one call to 2145 * v_decr_usecount is needed. 2146 */ 2147static void 2148v_upgrade_usecount(struct vnode *vp) 2149{ 2150 2151 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2152 vp->v_usecount++; 2153 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2154 dev_lock(); 2155 vp->v_rdev->si_usecount++; 2156 dev_unlock(); 2157 } 2158} 2159 2160/* 2161 * Decrement the vnode use and hold count along with the driver's usecount 2162 * if this is a chardev. The vdropl() below releases the vnode interlock 2163 * as it may free the vnode. 2164 */ 2165static void 2166v_decr_usecount(struct vnode *vp) 2167{ 2168 2169 ASSERT_VI_LOCKED(vp, __FUNCTION__); 2170 VNASSERT(vp->v_usecount > 0, vp, 2171 ("v_decr_usecount: negative usecount")); 2172 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2173 vp->v_usecount--; 2174 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2175 dev_lock(); 2176 vp->v_rdev->si_usecount--; 2177 dev_unlock(); 2178 } 2179 vdropl(vp); 2180} 2181 2182/* 2183 * Decrement only the use count and driver use count. This is intended to 2184 * be paired with a follow on vdropl() to release the remaining hold count. 2185 * In this way we may vgone() a vnode with a 0 usecount without risk of 2186 * having it end up on a free list because the hold count is kept above 0. 2187 */ 2188static void 2189v_decr_useonly(struct vnode *vp) 2190{ 2191 2192 ASSERT_VI_LOCKED(vp, __FUNCTION__); 2193 VNASSERT(vp->v_usecount > 0, vp, 2194 ("v_decr_useonly: negative usecount")); 2195 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2196 vp->v_usecount--; 2197 if (vp->v_type == VCHR && vp->v_rdev != NULL) { 2198 dev_lock(); 2199 vp->v_rdev->si_usecount--; 2200 dev_unlock(); 2201 } 2202} 2203 2204/* 2205 * Grab a particular vnode from the free list, increment its 2206 * reference count and lock it. VI_DOOMED is set if the vnode 2207 * is being destroyed. Only callers who specify LK_RETRY will 2208 * see doomed vnodes. If inactive processing was delayed in 2209 * vput try to do it here. 2210 */ 2211int 2212vget(struct vnode *vp, int flags, struct thread *td) 2213{ 2214 int error; 2215 2216 error = 0; 2217 VFS_ASSERT_GIANT(vp->v_mount); 2218 VNASSERT((flags & LK_TYPE_MASK) != 0, vp, 2219 ("vget: invalid lock operation")); 2220 CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); 2221 2222 if ((flags & LK_INTERLOCK) == 0) 2223 VI_LOCK(vp); 2224 vholdl(vp); 2225 if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) { 2226 vdrop(vp); 2227 CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, 2228 vp); 2229 return (error); 2230 } 2231 if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) 2232 panic("vget: vn_lock failed to return ENOENT\n"); 2233 VI_LOCK(vp); 2234 /* Upgrade our holdcnt to a usecount. */ 2235 v_upgrade_usecount(vp); 2236 /* 2237 * We don't guarantee that any particular close will 2238 * trigger inactive processing so just make a best effort 2239 * here at preventing a reference to a removed file. If 2240 * we don't succeed no harm is done. 
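 *
 * A typical caller pattern looks like (illustrative sketch):
 *
 *	VI_LOCK(vp);
 *	error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread);
 *	if (error == 0) {
 *		... operate on the locked, referenced vnode ...
 *		vput(vp);
 *	}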
2241 */ 2242 if (vp->v_iflag & VI_OWEINACT) { 2243 if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE && 2244 (flags & LK_NOWAIT) == 0) 2245 vinactive(vp, td); 2246 vp->v_iflag &= ~VI_OWEINACT; 2247 } 2248 VI_UNLOCK(vp); 2249 return (0); 2250} 2251 2252/* 2253 * Increase the reference count of a vnode. 2254 */ 2255void 2256vref(struct vnode *vp) 2257{ 2258 2259 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2260 VI_LOCK(vp); 2261 v_incr_usecount(vp); 2262 VI_UNLOCK(vp); 2263} 2264 2265/* 2266 * Return reference count of a vnode. 2267 * 2268 * The results of this call are only guaranteed when some mechanism other 2269 * than the VI lock is used to stop other processes from gaining references 2270 * to the vnode. This may be the case if the caller holds the only reference. 2271 * This is also useful when stale data is acceptable as race conditions may 2272 * be accounted for by some other means. 2273 */ 2274int 2275vrefcnt(struct vnode *vp) 2276{ 2277 int usecnt; 2278 2279 VI_LOCK(vp); 2280 usecnt = vp->v_usecount; 2281 VI_UNLOCK(vp); 2282 2283 return (usecnt); 2284} 2285 2286#define VPUTX_VRELE 1 2287#define VPUTX_VPUT 2 2288#define VPUTX_VUNREF 3 2289 2290static void 2291vputx(struct vnode *vp, int func) 2292{ 2293 int error; 2294 2295 KASSERT(vp != NULL, ("vputx: null vp")); 2296 if (func == VPUTX_VUNREF) 2297 ASSERT_VOP_LOCKED(vp, "vunref"); 2298 else if (func == VPUTX_VPUT) 2299 ASSERT_VOP_LOCKED(vp, "vput"); 2300 else 2301 KASSERT(func == VPUTX_VRELE, ("vputx: wrong func")); 2302 VFS_ASSERT_GIANT(vp->v_mount); 2303 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2304 VI_LOCK(vp); 2305 2306 /* Skip this v_writecount check if we're going to panic below. */ 2307 VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp, 2308 ("vputx: missed vn_close")); 2309 error = 0; 2310 2311 if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) && 2312 vp->v_usecount == 1)) { 2313 if (func == VPUTX_VPUT) 2314 VOP_UNLOCK(vp, 0); 2315 v_decr_usecount(vp); 2316 return; 2317 } 2318 2319 if (vp->v_usecount != 1) { 2320 vprint("vputx: negative ref count", vp); 2321 panic("vputx: negative ref cnt"); 2322 } 2323 CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp); 2324 /* 2325 * We want to hold the vnode until the inactive finishes to 2326 * prevent vgone() races. We drop the use count here and the 2327 * hold count below when we're done. 2328 */ 2329 v_decr_useonly(vp); 2330 /* 2331 * We must call VOP_INACTIVE with the node locked. Mark 2332 * as VI_DOINGINACT to avoid recursion. 2333 */ 2334 vp->v_iflag |= VI_OWEINACT; 2335 switch (func) { 2336 case VPUTX_VRELE: 2337 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); 2338 VI_LOCK(vp); 2339 break; 2340 case VPUTX_VPUT: 2341 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 2342 error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | 2343 LK_NOWAIT); 2344 VI_LOCK(vp); 2345 } 2346 break; 2347 case VPUTX_VUNREF: 2348 if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { 2349 error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); 2350 VI_LOCK(vp); 2351 } 2352 break; 2353 } 2354 if (vp->v_usecount > 0) 2355 vp->v_iflag &= ~VI_OWEINACT; 2356 if (error == 0) { 2357 if (vp->v_iflag & VI_OWEINACT) 2358 vinactive(vp, curthread); 2359 if (func != VPUTX_VUNREF) 2360 VOP_UNLOCK(vp, 0); 2361 } 2362 vdropl(vp); 2363} 2364 2365/* 2366 * Vnode put/release. 2367 * If count drops to zero, call inactive routine and return to freelist. 2368 */ 2369void 2370vrele(struct vnode *vp) 2371{ 2372 2373 vputx(vp, VPUTX_VRELE); 2374} 2375 2376/* 2377 * Release an already locked vnode. 
This gives the same effect as 2378 * unlock+vrele(), but takes less time and avoids releasing and 2379 * re-acquiring the lock (as vrele() acquires the lock internally.) 2380 */ 2381void 2382vput(struct vnode *vp) 2383{ 2384 2385 vputx(vp, VPUTX_VPUT); 2386} 2387 2388/* 2389 * Release an exclusively locked vnode. Do not unlock the vnode lock. 2390 */ 2391void 2392vunref(struct vnode *vp) 2393{ 2394 2395 vputx(vp, VPUTX_VUNREF); 2396} 2397 2398/* 2399 * Somebody doesn't want the vnode recycled. 2400 */ 2401void 2402vhold(struct vnode *vp) 2403{ 2404 2405 VI_LOCK(vp); 2406 vholdl(vp); 2407 VI_UNLOCK(vp); 2408} 2409 2410/* 2411 * Increase the hold count and activate if this is the first reference. 2412 */ 2413void 2414vholdl(struct vnode *vp) 2415{ 2416 struct mount *mp; 2417 2418 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2419 vp->v_holdcnt++; 2420 if (!VSHOULDBUSY(vp)) 2421 return; 2422 ASSERT_VI_LOCKED(vp, "vholdl"); 2423 VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free")); 2424 VNASSERT(vp->v_op != NULL, vp, ("vholdl: vnode already reclaimed.")); 2425 /* 2426 * Remove a vnode from the free list, mark it as in use, 2427 * and put it on the active list. 2428 */ 2429 mtx_lock(&vnode_free_list_mtx); 2430 TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); 2431 freevnodes--; 2432 vp->v_iflag &= ~(VI_FREE|VI_AGE); 2433 KASSERT((vp->v_iflag & VI_ACTIVE) == 0, 2434 ("Activating already active vnode")); 2435 vp->v_iflag |= VI_ACTIVE; 2436 mp = vp->v_mount; 2437 TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist); 2438 mp->mnt_activevnodelistsize++; 2439 mtx_unlock(&vnode_free_list_mtx); 2440} 2441 2442/* 2443 * Note that there is one fewer holder who cares about this vnode. 2444 * vdrop() is the opposite of vhold(). 2445 */ 2446void 2447vdrop(struct vnode *vp) 2448{ 2449 2450 VI_LOCK(vp); 2451 vdropl(vp); 2452} 2453 2454/* 2455 * Drop the hold count of the vnode. If this is the last reference to 2456 * the vnode we place it on the free list unless it has been vgone'd 2457 * (marked VI_DOOMED) in which case we will free it. 2458 */ 2459void 2460vdropl(struct vnode *vp) 2461{ 2462 struct bufobj *bo; 2463 struct mount *mp; 2464 int active; 2465 2466 ASSERT_VI_LOCKED(vp, "vdropl"); 2467 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2468 if (vp->v_holdcnt <= 0) 2469 panic("vdrop: holdcnt %d", vp->v_holdcnt); 2470 vp->v_holdcnt--; 2471 if (vp->v_holdcnt > 0) { 2472 VI_UNLOCK(vp); 2473 return; 2474 } 2475 if ((vp->v_iflag & VI_DOOMED) == 0) { 2476 /* 2477 * Mark a vnode as free: remove it from its active list 2478 * and put it up for recycling on the freelist. 2479 */ 2480 VNASSERT(vp->v_op != NULL, vp, 2481 ("vdropl: vnode already reclaimed.")); 2482 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, 2483 ("vnode already free")); 2484 VNASSERT(VSHOULDFREE(vp), vp, 2485 ("vdropl: freeing when we shouldn't")); 2486 active = vp->v_iflag & VI_ACTIVE; 2487 vp->v_iflag &= ~VI_ACTIVE; 2488 mp = vp->v_mount; 2489 mtx_lock(&vnode_free_list_mtx); 2490 if (active) { 2491 TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, 2492 v_actfreelist); 2493 mp->mnt_activevnodelistsize--; 2494 } 2495 if (vp->v_iflag & VI_AGE) { 2496 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_actfreelist); 2497 } else { 2498 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist); 2499 } 2500 freevnodes++; 2501 vp->v_iflag &= ~VI_AGE; 2502 vp->v_iflag |= VI_FREE; 2503 mtx_unlock(&vnode_free_list_mtx); 2504 VI_UNLOCK(vp); 2505 return; 2506 } 2507 /* 2508 * The vnode has been marked for destruction, so free it.
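 * By this point vgonel() has already run: v_op points at
 * dead_vnodeops and the buffers, namecache entries and VM object are
 * gone. The assertions below verify as much before the memory is
 * returned to vnode_zone.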
2509 */ 2510 CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); 2511 mtx_lock(&vnode_free_list_mtx); 2512 numvnodes--; 2513 mtx_unlock(&vnode_free_list_mtx); 2514 bo = &vp->v_bufobj; 2515 VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, 2516 ("cleaned vnode still on the free list.")); 2517 VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); 2518 VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count")); 2519 VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); 2520 VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); 2521 VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); 2522 VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); 2523 VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL")); 2524 VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); 2525 VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL")); 2526 VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); 2527 VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); 2528 VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); 2529 VI_UNLOCK(vp); 2530#ifdef MAC 2531 mac_vnode_destroy(vp); 2532#endif 2533 if (vp->v_pollinfo != NULL) 2534 destroy_vpollinfo(vp->v_pollinfo); 2535#ifdef INVARIANTS 2536 /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */ 2537 vp->v_op = NULL; 2538#endif 2539 rangelock_destroy(&vp->v_rl); 2540 lockdestroy(vp->v_vnlock); 2541 mtx_destroy(&vp->v_interlock); 2542 mtx_destroy(BO_MTX(bo)); 2543 uma_zfree(vnode_zone, vp); 2544} 2545 2546/* 2547 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT 2548 * flags. DOINGINACT prevents us from recursing in calls to vinactive. 2549 * OWEINACT tracks whether a vnode missed a call to inactive due to a 2550 * failed lock upgrade. 2551 */ 2552void 2553vinactive(struct vnode *vp, struct thread *td) 2554{ 2555 struct vm_object *obj; 2556 2557 ASSERT_VOP_ELOCKED(vp, "vinactive"); 2558 ASSERT_VI_LOCKED(vp, "vinactive"); 2559 VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, 2560 ("vinactive: recursed on VI_DOINGINACT")); 2561 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2562 vp->v_iflag |= VI_DOINGINACT; 2563 vp->v_iflag &= ~VI_OWEINACT; 2564 VI_UNLOCK(vp); 2565 /* 2566 * Before moving off the active list, we must be sure that any 2567 * modified pages are on the vnode's dirty list since these will 2568 * no longer be checked once the vnode is on the inactive list. 2569 * Because the vnode vm object keeps a hold reference on the vnode 2570 * if there is at least one resident non-cached page, the vnode 2571 * cannot leave the active list without the page cleanup done. 2572 */ 2573 obj = vp->v_object; 2574 if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) { 2575 VM_OBJECT_LOCK(obj); 2576 vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC); 2577 VM_OBJECT_UNLOCK(obj); 2578 } 2579 VOP_INACTIVE(vp, td); 2580 VI_LOCK(vp); 2581 VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, 2582 ("vinactive: lost VI_DOINGINACT")); 2583 vp->v_iflag &= ~VI_DOINGINACT; 2584} 2585 2586/* 2587 * Remove any vnodes in the vnode table belonging to mount point mp. 2588 * 2589 * If FORCECLOSE is not specified, there should not be any active ones, 2590 * return error if any are found (nb: this is a user error, not a 2591 * system error). If FORCECLOSE is specified, detach any active vnodes 2592 * that are found. 2593 * 2594 * If WRITECLOSE is set, only flush out regular file vnodes open for 2595 * writing. 
2596 * 2597 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. 2598 * 2599 * `rootrefs' specifies the base reference count for the root vnode 2600 * of this filesystem. The root vnode is considered busy if its 2601 * v_usecount exceeds this value. On a successful return, vflush(, td) 2602 * will call vrele() on the root vnode exactly rootrefs times. 2603 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must 2604 * be zero. 2605 */ 2606#ifdef DIAGNOSTIC 2607static int busyprt = 0; /* print out busy vnodes */ 2608SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); 2609#endif 2610 2611int 2612vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) 2613{ 2614 struct vnode *vp, *mvp, *rootvp = NULL; 2615 struct vattr vattr; 2616 int busy = 0, error; 2617 2618 CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, 2619 rootrefs, flags); 2620 if (rootrefs > 0) { 2621 KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, 2622 ("vflush: bad args")); 2623 /* 2624 * Get the filesystem root vnode. We can vput() it 2625 * immediately, since with rootrefs > 0, it won't go away. 2626 */ 2627 if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { 2628 CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", 2629 __func__, error); 2630 return (error); 2631 } 2632 vput(rootvp); 2633 } 2634loop: 2635 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2636 vholdl(vp); 2637 error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); 2638 if (error) { 2639 vdrop(vp); 2640 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 2641 goto loop; 2642 } 2643 /* 2644 * Skip over a vnodes marked VV_SYSTEM. 2645 */ 2646 if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { 2647 VOP_UNLOCK(vp, 0); 2648 vdrop(vp); 2649 continue; 2650 } 2651 /* 2652 * If WRITECLOSE is set, flush out unlinked but still open 2653 * files (even if open only for reading) and regular file 2654 * vnodes open for writing. 2655 */ 2656 if (flags & WRITECLOSE) { 2657 if (vp->v_object != NULL) { 2658 VM_OBJECT_LOCK(vp->v_object); 2659 vm_object_page_clean(vp->v_object, 0, 0, 0); 2660 VM_OBJECT_UNLOCK(vp->v_object); 2661 } 2662 error = VOP_FSYNC(vp, MNT_WAIT, td); 2663 if (error != 0) { 2664 VOP_UNLOCK(vp, 0); 2665 vdrop(vp); 2666 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 2667 return (error); 2668 } 2669 error = VOP_GETATTR(vp, &vattr, td->td_ucred); 2670 VI_LOCK(vp); 2671 2672 if ((vp->v_type == VNON || 2673 (error == 0 && vattr.va_nlink > 0)) && 2674 (vp->v_writecount == 0 || vp->v_type != VREG)) { 2675 VOP_UNLOCK(vp, 0); 2676 vdropl(vp); 2677 continue; 2678 } 2679 } else 2680 VI_LOCK(vp); 2681 /* 2682 * With v_usecount == 0, all we need to do is clear out the 2683 * vnode data structures and we are done. 2684 * 2685 * If FORCECLOSE is set, forcibly close the vnode. 2686 */ 2687 if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { 2688 VNASSERT(vp->v_usecount == 0 || 2689 (vp->v_type != VCHR && vp->v_type != VBLK), vp, 2690 ("device VNODE %p is FORCECLOSED", vp)); 2691 vgonel(vp); 2692 } else { 2693 busy++; 2694#ifdef DIAGNOSTIC 2695 if (busyprt) 2696 vprint("vflush: busy vnode", vp); 2697#endif 2698 } 2699 VOP_UNLOCK(vp, 0); 2700 vdropl(vp); 2701 } 2702 if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { 2703 /* 2704 * If just the root vnode is busy, and if its refcount 2705 * is equal to `rootrefs', then go ahead and kill it. 
2706 */ 2707 VI_LOCK(rootvp); 2708 KASSERT(busy > 0, ("vflush: not busy")); 2709 VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, 2710 ("vflush: usecount %d < rootrefs %d", 2711 rootvp->v_usecount, rootrefs)); 2712 if (busy == 1 && rootvp->v_usecount == rootrefs) { 2713 VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); 2714 vgone(rootvp); 2715 VOP_UNLOCK(rootvp, 0); 2716 busy = 0; 2717 } else 2718 VI_UNLOCK(rootvp); 2719 } 2720 if (busy) { 2721 CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, 2722 busy); 2723 return (EBUSY); 2724 } 2725 for (; rootrefs > 0; rootrefs--) 2726 vrele(rootvp); 2727 return (0); 2728} 2729 2730/* 2731 * Recycle an unused vnode to the front of the free list. 2732 */ 2733int 2734vrecycle(struct vnode *vp, struct thread *td) 2735{ 2736 int recycled; 2737 2738 ASSERT_VOP_ELOCKED(vp, "vrecycle"); 2739 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2740 recycled = 0; 2741 VI_LOCK(vp); 2742 if (vp->v_usecount == 0) { 2743 recycled = 1; 2744 vgonel(vp); 2745 } 2746 VI_UNLOCK(vp); 2747 return (recycled); 2748} 2749 2750/* 2751 * Eliminate all activity associated with a vnode 2752 * in preparation for reuse. 2753 */ 2754void 2755vgone(struct vnode *vp) 2756{ 2757 VI_LOCK(vp); 2758 vgonel(vp); 2759 VI_UNLOCK(vp); 2760} 2761 2762static void 2763notify_lowervp_vfs_dummy(struct mount *mp __unused, 2764 struct vnode *lowervp __unused) 2765{ 2766} 2767 2768/* 2769 * Notify upper mounts about reclaimed or unlinked vnode. 2770 */ 2771void 2772vfs_notify_upper(struct vnode *vp, int event) 2773{ 2774 static struct vfsops vgonel_vfsops = { 2775 .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, 2776 .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, 2777 }; 2778 struct mount *mp, *ump, *mmp; 2779 2780 mp = vp->v_mount; 2781 if (mp == NULL) 2782 return; 2783 2784 MNT_ILOCK(mp); 2785 if (TAILQ_EMPTY(&mp->mnt_uppers)) 2786 goto unlock; 2787 MNT_IUNLOCK(mp); 2788 mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); 2789 mmp->mnt_op = &vgonel_vfsops; 2790 mmp->mnt_kern_flag |= MNTK_MARKER; 2791 MNT_ILOCK(mp); 2792 mp->mnt_kern_flag |= MNTK_VGONE_UPPER; 2793 for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { 2794 if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { 2795 ump = TAILQ_NEXT(ump, mnt_upper_link); 2796 continue; 2797 } 2798 TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); 2799 MNT_IUNLOCK(mp); 2800 switch (event) { 2801 case VFS_NOTIFY_UPPER_RECLAIM: 2802 VFS_RECLAIM_LOWERVP(ump, vp); 2803 break; 2804 case VFS_NOTIFY_UPPER_UNLINK: 2805 VFS_UNLINK_LOWERVP(ump, vp); 2806 break; 2807 default: 2808 KASSERT(0, ("invalid event %d", event)); 2809 break; 2810 } 2811 MNT_ILOCK(mp); 2812 ump = TAILQ_NEXT(mmp, mnt_upper_link); 2813 TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); 2814 } 2815 free(mmp, M_TEMP); 2816 mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; 2817 if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { 2818 mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; 2819 wakeup(&mp->mnt_uppers); 2820 } 2821unlock: 2822 MNT_IUNLOCK(mp); 2823} 2824 2825/* 2826 * vgone, with the vp interlock held. 2827 */ 2828void 2829vgonel(struct vnode *vp) 2830{ 2831 struct thread *td; 2832 int oweinact; 2833 int active; 2834 struct mount *mp; 2835 2836 ASSERT_VOP_ELOCKED(vp, "vgonel"); 2837 ASSERT_VI_LOCKED(vp, "vgonel"); 2838 VNASSERT(vp->v_holdcnt, vp, 2839 ("vgonel: vp %p has no reference.", vp)); 2840 CTR2(KTR_VFS, "%s: vp %p", __func__, vp); 2841 td = curthread; 2842 2843 /* 2844 * Don't vgonel if we're already doomed. 
2845 */ 2846 if (vp->v_iflag & VI_DOOMED) 2847 return; 2848 vp->v_iflag |= VI_DOOMED; 2849 2850 /* 2851 * Check to see if the vnode is in use. If so, we have to call 2852 * VOP_CLOSE() and VOP_INACTIVE(). 2853 */ 2854 active = vp->v_usecount; 2855 oweinact = (vp->v_iflag & VI_OWEINACT); 2856 VI_UNLOCK(vp); 2857 vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); 2858 2859 /* 2860 * Clean out any buffers associated with the vnode. 2861 * If the flush fails, just toss the buffers. 2862 */ 2863 mp = NULL; 2864 if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) 2865 (void) vn_start_secondary_write(vp, &mp, V_WAIT); 2866 if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) 2867 vinvalbuf(vp, 0, 0, 0); 2868 2869 /* 2870 * If purging an active vnode, it must be closed and 2871 * deactivated before being reclaimed. 2872 */ 2873 if (active) 2874 VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); 2875 if (oweinact || active) { 2876 VI_LOCK(vp); 2877 if ((vp->v_iflag & VI_DOINGINACT) == 0) 2878 vinactive(vp, td); 2879 VI_UNLOCK(vp); 2880 } 2881 if (vp->v_type == VSOCK) 2882 vfs_unp_reclaim(vp); 2883 /* 2884 * Reclaim the vnode. 2885 */ 2886 if (VOP_RECLAIM(vp, td)) 2887 panic("vgone: cannot reclaim"); 2888 if (mp != NULL) 2889 vn_finished_secondary_write(mp); 2890 VNASSERT(vp->v_object == NULL, vp, 2891 ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); 2892 /* 2893 * Clear the advisory locks and wake up waiting threads. 2894 */ 2895 (void)VOP_ADVLOCKPURGE(vp); 2896 /* 2897 * Delete from old mount point vnode list. 2898 */ 2899 delmntque(vp); 2900 cache_purge(vp); 2901 /* 2902 * Done with purge, reset to the standard lock and invalidate 2903 * the vnode. 2904 */ 2905 VI_LOCK(vp); 2906 vp->v_vnlock = &vp->v_lock; 2907 vp->v_op = &dead_vnodeops; 2908 vp->v_tag = "none"; 2909 vp->v_type = VBAD; 2910} 2911 2912/* 2913 * Calculate the total number of references to a special device. 2914 */ 2915int 2916vcount(struct vnode *vp) 2917{ 2918 int count; 2919 2920 dev_lock(); 2921 count = vp->v_rdev->si_usecount; 2922 dev_unlock(); 2923 return (count); 2924} 2925 2926/* 2927 * Same as above, but using the struct cdev *as argument 2928 */ 2929int 2930count_dev(struct cdev *dev) 2931{ 2932 int count; 2933 2934 dev_lock(); 2935 count = dev->si_usecount; 2936 dev_unlock(); 2937 return(count); 2938} 2939 2940/* 2941 * Print out a description of a vnode. 2942 */ 2943static char *typename[] = 2944{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", 2945 "VMARKER"}; 2946 2947void 2948vn_printf(struct vnode *vp, const char *fmt, ...) 
2949{ 2950 va_list ap; 2951 char buf[256], buf2[16]; 2952 u_long flags; 2953 2954 va_start(ap, fmt); 2955 vprintf(fmt, ap); 2956 va_end(ap); 2957 printf("%p: ", (void *)vp); 2958 printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); 2959 printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n", 2960 vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere); 2961 buf[0] = '\0'; 2962 buf[1] = '\0'; 2963 if (vp->v_vflag & VV_ROOT) 2964 strlcat(buf, "|VV_ROOT", sizeof(buf)); 2965 if (vp->v_vflag & VV_ISTTY) 2966 strlcat(buf, "|VV_ISTTY", sizeof(buf)); 2967 if (vp->v_vflag & VV_NOSYNC) 2968 strlcat(buf, "|VV_NOSYNC", sizeof(buf)); 2969 if (vp->v_vflag & VV_ETERNALDEV) 2970 strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); 2971 if (vp->v_vflag & VV_CACHEDLABEL) 2972 strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); 2973 if (vp->v_vflag & VV_TEXT) 2974 strlcat(buf, "|VV_TEXT", sizeof(buf)); 2975 if (vp->v_vflag & VV_COPYONWRITE) 2976 strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); 2977 if (vp->v_vflag & VV_SYSTEM) 2978 strlcat(buf, "|VV_SYSTEM", sizeof(buf)); 2979 if (vp->v_vflag & VV_PROCDEP) 2980 strlcat(buf, "|VV_PROCDEP", sizeof(buf)); 2981 if (vp->v_vflag & VV_NOKNOTE) 2982 strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); 2983 if (vp->v_vflag & VV_DELETED) 2984 strlcat(buf, "|VV_DELETED", sizeof(buf)); 2985 if (vp->v_vflag & VV_MD) 2986 strlcat(buf, "|VV_MD", sizeof(buf)); 2987 if (vp->v_vflag & VV_FORCEINSMQ) 2988 strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); 2989 flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | 2990 VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | 2991 VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ); 2992 if (flags != 0) { 2993 snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); 2994 strlcat(buf, buf2, sizeof(buf)); 2995 } 2996 if (vp->v_iflag & VI_MOUNT) 2997 strlcat(buf, "|VI_MOUNT", sizeof(buf)); 2998 if (vp->v_iflag & VI_AGE) 2999 strlcat(buf, "|VI_AGE", sizeof(buf)); 3000 if (vp->v_iflag & VI_DOOMED) 3001 strlcat(buf, "|VI_DOOMED", sizeof(buf)); 3002 if (vp->v_iflag & VI_FREE) 3003 strlcat(buf, "|VI_FREE", sizeof(buf)); 3004 if (vp->v_iflag & VI_ACTIVE) 3005 strlcat(buf, "|VI_ACTIVE", sizeof(buf)); 3006 if (vp->v_iflag & VI_DOINGINACT) 3007 strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); 3008 if (vp->v_iflag & VI_OWEINACT) 3009 strlcat(buf, "|VI_OWEINACT", sizeof(buf)); 3010 flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE | 3011 VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT); 3012 if (flags != 0) { 3013 snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); 3014 strlcat(buf, buf2, sizeof(buf)); 3015 } 3016 printf(" flags (%s)\n", buf + 1); 3017 if (mtx_owned(VI_MTX(vp))) 3018 printf(" VI_LOCKed"); 3019 if (vp->v_object != NULL) 3020 printf(" v_object %p ref %d pages %d " 3021 "cleanbuf %d dirtybuf %d\n", 3022 vp->v_object, vp->v_object->ref_count, 3023 vp->v_object->resident_page_count, 3024 vp->v_bufobj.bo_dirty.bv_cnt, 3025 vp->v_bufobj.bo_clean.bv_cnt); 3026 printf(" "); 3027 lockmgr_printinfo(vp->v_vnlock); 3028 if (vp->v_data != NULL) 3029 VOP_PRINT(vp); 3030} 3031 3032#ifdef DDB 3033/* 3034 * List all of the locked vnodes in the system. 3035 * Called when debugging the kernel. 3036 */ 3037DB_SHOW_COMMAND(lockedvnods, lockedvnodes) 3038{ 3039 struct mount *mp, *nmp; 3040 struct vnode *vp; 3041 3042 /* 3043 * Note: because this is DDB, we can't obey the locking semantics 3044 * for these structures, which means we could catch an inconsistent 3045 * state and dereference a nasty pointer. 
Not much to be done 3046 * about that. 3047 */ 3048 db_printf("Locked vnodes\n"); 3049 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 3050 nmp = TAILQ_NEXT(mp, mnt_list); 3051 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 3052 if (vp->v_type != VMARKER && 3053 VOP_ISLOCKED(vp)) 3054 vprint("", vp); 3055 } 3056 nmp = TAILQ_NEXT(mp, mnt_list); 3057 } 3058} 3059 3060/* 3061 * Show details about the given vnode. 3062 */ 3063DB_SHOW_COMMAND(vnode, db_show_vnode) 3064{ 3065 struct vnode *vp; 3066 3067 if (!have_addr) 3068 return; 3069 vp = (struct vnode *)addr; 3070 vn_printf(vp, "vnode "); 3071} 3072 3073/* 3074 * Show details about the given mount point. 3075 */ 3076DB_SHOW_COMMAND(mount, db_show_mount) 3077{ 3078 struct mount *mp; 3079 struct vfsopt *opt; 3080 struct statfs *sp; 3081 struct vnode *vp; 3082 char buf[512]; 3083 uint64_t mflags; 3084 u_int flags; 3085 3086 if (!have_addr) { 3087 /* No address given, print short info about all mount points. */ 3088 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3089 db_printf("%p %s on %s (%s)\n", mp, 3090 mp->mnt_stat.f_mntfromname, 3091 mp->mnt_stat.f_mntonname, 3092 mp->mnt_stat.f_fstypename); 3093 if (db_pager_quit) 3094 break; 3095 } 3096 db_printf("\nMore info: show mount <addr>\n"); 3097 return; 3098 } 3099 3100 mp = (struct mount *)addr; 3101 db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, 3102 mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); 3103 3104 buf[0] = '\0'; 3105 mflags = mp->mnt_flag; 3106#define MNT_FLAG(flag) do { \ 3107 if (mflags & (flag)) { \ 3108 if (buf[0] != '\0') \ 3109 strlcat(buf, ", ", sizeof(buf)); \ 3110 strlcat(buf, (#flag) + 4, sizeof(buf)); \ 3111 mflags &= ~(flag); \ 3112 } \ 3113} while (0) 3114 MNT_FLAG(MNT_RDONLY); 3115 MNT_FLAG(MNT_SYNCHRONOUS); 3116 MNT_FLAG(MNT_NOEXEC); 3117 MNT_FLAG(MNT_NOSUID); 3118 MNT_FLAG(MNT_NFS4ACLS); 3119 MNT_FLAG(MNT_UNION); 3120 MNT_FLAG(MNT_ASYNC); 3121 MNT_FLAG(MNT_SUIDDIR); 3122 MNT_FLAG(MNT_SOFTDEP); 3123 MNT_FLAG(MNT_NOSYMFOLLOW); 3124 MNT_FLAG(MNT_GJOURNAL); 3125 MNT_FLAG(MNT_MULTILABEL); 3126 MNT_FLAG(MNT_ACLS); 3127 MNT_FLAG(MNT_NOATIME); 3128 MNT_FLAG(MNT_NOCLUSTERR); 3129 MNT_FLAG(MNT_NOCLUSTERW); 3130 MNT_FLAG(MNT_SUJ); 3131 MNT_FLAG(MNT_EXRDONLY); 3132 MNT_FLAG(MNT_EXPORTED); 3133 MNT_FLAG(MNT_DEFEXPORTED); 3134 MNT_FLAG(MNT_EXPORTANON); 3135 MNT_FLAG(MNT_EXKERB); 3136 MNT_FLAG(MNT_EXPUBLIC); 3137 MNT_FLAG(MNT_LOCAL); 3138 MNT_FLAG(MNT_QUOTA); 3139 MNT_FLAG(MNT_ROOTFS); 3140 MNT_FLAG(MNT_USER); 3141 MNT_FLAG(MNT_IGNORE); 3142 MNT_FLAG(MNT_UPDATE); 3143 MNT_FLAG(MNT_DELEXPORT); 3144 MNT_FLAG(MNT_RELOAD); 3145 MNT_FLAG(MNT_FORCE); 3146 MNT_FLAG(MNT_SNAPSHOT); 3147 MNT_FLAG(MNT_BYFSID); 3148#undef MNT_FLAG 3149 if (mflags != 0) { 3150 if (buf[0] != '\0') 3151 strlcat(buf, ", ", sizeof(buf)); 3152 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 3153 "0x%016jx", mflags); 3154 } 3155 db_printf(" mnt_flag = %s\n", buf); 3156 3157 buf[0] = '\0'; 3158 flags = mp->mnt_kern_flag; 3159#define MNT_KERN_FLAG(flag) do { \ 3160 if (flags & (flag)) { \ 3161 if (buf[0] != '\0') \ 3162 strlcat(buf, ", ", sizeof(buf)); \ 3163 strlcat(buf, (#flag) + 5, sizeof(buf)); \ 3164 flags &= ~(flag); \ 3165 } \ 3166} while (0) 3167 MNT_KERN_FLAG(MNTK_UNMOUNTF); 3168 MNT_KERN_FLAG(MNTK_ASYNC); 3169 MNT_KERN_FLAG(MNTK_SOFTDEP); 3170 MNT_KERN_FLAG(MNTK_NOINSMNTQ); 3171 MNT_KERN_FLAG(MNTK_DRAINING); 3172 MNT_KERN_FLAG(MNTK_REFEXPIRE); 3173 MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); 3174 MNT_KERN_FLAG(MNTK_SHARED_WRITES); 3175 MNT_KERN_FLAG(MNTK_NO_IOPF); 3176 
MNT_KERN_FLAG(MNTK_VGONE_UPPER); 3177 MNT_KERN_FLAG(MNTK_VGONE_WAITER); 3178 MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT); 3179 MNT_KERN_FLAG(MNTK_MARKER); 3180 MNT_KERN_FLAG(MNTK_NOASYNC); 3181 MNT_KERN_FLAG(MNTK_UNMOUNT); 3182 MNT_KERN_FLAG(MNTK_MWAIT); 3183 MNT_KERN_FLAG(MNTK_SUSPEND); 3184 MNT_KERN_FLAG(MNTK_SUSPEND2); 3185 MNT_KERN_FLAG(MNTK_SUSPENDED); 3186 MNT_KERN_FLAG(MNTK_MPSAFE); 3187 MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); 3188 MNT_KERN_FLAG(MNTK_NOKNOTE); 3189#undef MNT_KERN_FLAG 3190 if (flags != 0) { 3191 if (buf[0] != '\0') 3192 strlcat(buf, ", ", sizeof(buf)); 3193 snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), 3194 "0x%08x", flags); 3195 } 3196 db_printf(" mnt_kern_flag = %s\n", buf); 3197 3198 db_printf(" mnt_opt = "); 3199 opt = TAILQ_FIRST(mp->mnt_opt); 3200 if (opt != NULL) { 3201 db_printf("%s", opt->name); 3202 opt = TAILQ_NEXT(opt, link); 3203 while (opt != NULL) { 3204 db_printf(", %s", opt->name); 3205 opt = TAILQ_NEXT(opt, link); 3206 } 3207 } 3208 db_printf("\n"); 3209 3210 sp = &mp->mnt_stat; 3211 db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " 3212 "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " 3213 "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " 3214 "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", 3215 (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, 3216 (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, 3217 (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, 3218 (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, 3219 (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, 3220 (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, 3221 (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, 3222 (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); 3223 3224 db_printf(" mnt_cred = { uid=%u ruid=%u", 3225 (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); 3226 if (jailed(mp->mnt_cred)) 3227 db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); 3228 db_printf(" }\n"); 3229 db_printf(" mnt_ref = %d\n", mp->mnt_ref); 3230 db_printf(" mnt_gen = %d\n", mp->mnt_gen); 3231 db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); 3232 db_printf(" mnt_activevnodelistsize = %d\n", 3233 mp->mnt_activevnodelistsize); 3234 db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount); 3235 db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); 3236 db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); 3237 db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); 3238 db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); 3239 db_printf(" mnt_secondary_accwrites = %d\n", 3240 mp->mnt_secondary_accwrites); 3241 db_printf(" mnt_gjprovider = %s\n", 3242 mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); 3243 3244 db_printf("\n\nList of active vnodes\n"); 3245 TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) { 3246 if (vp->v_type != VMARKER) { 3247 vn_printf(vp, "vnode "); 3248 if (db_pager_quit) 3249 break; 3250 } 3251 } 3252 db_printf("\n\nList of inactive vnodes\n"); 3253 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 3254 if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) { 3255 vn_printf(vp, "vnode "); 3256 if (db_pager_quit) 3257 break; 3258 } 3259 } 3260} 3261#endif /* DDB */ 3262 3263/* 3264 * Fill in a struct xvfsconf based on a struct vfsconf. 
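 *
 * Userland reads these records via the vfs.conflist sysctl declared
 * below; a minimal consumer might look like (illustrative, error
 * handling omitted):
 *
 *	size_t i, len;
 *	sysctlbyname("vfs.conflist", NULL, &len, NULL, 0);
 *	struct xvfsconf *xvfsp = malloc(len);
 *	sysctlbyname("vfs.conflist", xvfsp, &len, NULL, 0);
 *	for (i = 0; i < len / sizeof(*xvfsp); i++)
 *		printf("%s\n", xvfsp[i].vfc_name);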
3265 */ 3266static int 3267vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) 3268{ 3269 struct xvfsconf xvfsp; 3270 3271 bzero(&xvfsp, sizeof(xvfsp)); 3272 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 3273 xvfsp.vfc_typenum = vfsp->vfc_typenum; 3274 xvfsp.vfc_refcount = vfsp->vfc_refcount; 3275 xvfsp.vfc_flags = vfsp->vfc_flags; 3276 /* 3277 * These are unused in userland, we keep them 3278 * to not break binary compatibility. 3279 */ 3280 xvfsp.vfc_vfsops = NULL; 3281 xvfsp.vfc_next = NULL; 3282 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 3283} 3284 3285#ifdef COMPAT_FREEBSD32 3286struct xvfsconf32 { 3287 uint32_t vfc_vfsops; 3288 char vfc_name[MFSNAMELEN]; 3289 int32_t vfc_typenum; 3290 int32_t vfc_refcount; 3291 int32_t vfc_flags; 3292 uint32_t vfc_next; 3293}; 3294 3295static int 3296vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) 3297{ 3298 struct xvfsconf32 xvfsp; 3299 3300 strcpy(xvfsp.vfc_name, vfsp->vfc_name); 3301 xvfsp.vfc_typenum = vfsp->vfc_typenum; 3302 xvfsp.vfc_refcount = vfsp->vfc_refcount; 3303 xvfsp.vfc_flags = vfsp->vfc_flags; 3304 xvfsp.vfc_vfsops = 0; 3305 xvfsp.vfc_next = 0; 3306 return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); 3307} 3308#endif 3309 3310/* 3311 * Top level filesystem related information gathering. 3312 */ 3313static int 3314sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) 3315{ 3316 struct vfsconf *vfsp; 3317 int error; 3318 3319 error = 0; 3320 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 3321#ifdef COMPAT_FREEBSD32 3322 if (req->flags & SCTL_MASK32) 3323 error = vfsconf2x32(req, vfsp); 3324 else 3325#endif 3326 error = vfsconf2x(req, vfsp); 3327 if (error) 3328 break; 3329 } 3330 return (error); 3331} 3332 3333SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD, 3334 NULL, 0, sysctl_vfs_conflist, 3335 "S,xvfsconf", "List of all configured filesystems"); 3336 3337#ifndef BURN_BRIDGES 3338static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); 3339 3340static int 3341vfs_sysctl(SYSCTL_HANDLER_ARGS) 3342{ 3343 int *name = (int *)arg1 - 1; /* XXX */ 3344 u_int namelen = arg2 + 1; /* XXX */ 3345 struct vfsconf *vfsp; 3346 3347 printf("WARNING: userland calling deprecated sysctl, " 3348 "please rebuild world\n"); 3349 3350#if 1 || defined(COMPAT_PRELITE2) 3351 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ 3352 if (namelen == 1) 3353 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 3354#endif 3355 3356 switch (name[1]) { 3357 case VFS_MAXTYPENUM: 3358 if (namelen != 2) 3359 return (ENOTDIR); 3360 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 3361 case VFS_CONF: 3362 if (namelen != 3) 3363 return (ENOTDIR); /* overloaded */ 3364 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) 3365 if (vfsp->vfc_typenum == name[2]) 3366 break; 3367 if (vfsp == NULL) 3368 return (EOPNOTSUPP); 3369#ifdef COMPAT_FREEBSD32 3370 if (req->flags & SCTL_MASK32) 3371 return (vfsconf2x32(req, vfsp)); 3372 else 3373#endif 3374 return (vfsconf2x(req, vfsp)); 3375 } 3376 return (EOPNOTSUPP); 3377} 3378 3379static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, 3380 vfs_sysctl, "Generic filesystem"); 3381 3382#if 1 || defined(COMPAT_PRELITE2) 3383 3384static int 3385sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) 3386{ 3387 int error; 3388 struct vfsconf *vfsp; 3389 struct ovfsconf ovfs; 3390 3391 TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { 3392 bzero(&ovfs, sizeof(ovfs)); 3393 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 3394 strcpy(ovfs.vfc_name, vfsp->vfc_name); 3395 ovfs.vfc_index = vfsp->vfc_typenum; 3396 ovfs.vfc_refcount = vfsp->vfc_refcount; 3397 ovfs.vfc_flags = vfsp->vfc_flags; 3398 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 3399 if (error) 3400 return error; 3401 } 3402 return 0; 3403} 3404 3405#endif /* 1 || COMPAT_PRELITE2 */ 3406#endif /* !BURN_BRIDGES */ 3407 3408#define KINFO_VNODESLOP 10 3409#ifdef notyet 3410/* 3411 * Dump vnode list (via sysctl). 3412 */ 3413/* ARGSUSED */ 3414static int 3415sysctl_vnode(SYSCTL_HANDLER_ARGS) 3416{ 3417 struct xvnode *xvn; 3418 struct mount *mp; 3419 struct vnode *vp; 3420 int error, len, n; 3421 3422 /* 3423 * Stale numvnodes access is not fatal here. 3424 */ 3425 req->lock = 0; 3426 len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; 3427 if (!req->oldptr) 3428 /* Make an estimate */ 3429 return (SYSCTL_OUT(req, 0, len)); 3430 3431 error = sysctl_wire_old_buffer(req, 0); 3432 if (error != 0) 3433 return (error); 3434 xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); 3435 n = 0; 3436 mtx_lock(&mountlist_mtx); 3437 TAILQ_FOREACH(mp, &mountlist, mnt_list) { 3438 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) 3439 continue; 3440 MNT_ILOCK(mp); 3441 TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { 3442 if (n == len) 3443 break; 3444 vref(vp); 3445 xvn[n].xv_size = sizeof *xvn; 3446 xvn[n].xv_vnode = vp; 3447 xvn[n].xv_id = 0; /* XXX compat */ 3448#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field 3449 XV_COPY(usecount); 3450 XV_COPY(writecount); 3451 XV_COPY(holdcnt); 3452 XV_COPY(mount); 3453 XV_COPY(numoutput); 3454 XV_COPY(type); 3455#undef XV_COPY 3456 xvn[n].xv_flag = vp->v_vflag; 3457 3458 switch (vp->v_type) { 3459 case VREG: 3460 case VDIR: 3461 case VLNK: 3462 break; 3463 case VBLK: 3464 case VCHR: 3465 if (vp->v_rdev == NULL) { 3466 vrele(vp); 3467 continue; 3468 } 3469 xvn[n].xv_dev = dev2udev(vp->v_rdev); 3470 break; 3471 case VSOCK: 3472 xvn[n].xv_socket = vp->v_socket; 3473 break; 3474 case VFIFO: 3475 xvn[n].xv_fifo = vp->v_fifoinfo; 3476 break; 3477 case VNON: 3478 case VBAD: 3479 default: 3480 /* shouldn't happen? 
*/ 3481 vrele(vp); 3482 continue; 3483 } 3484 vrele(vp); 3485 ++n; 3486 } 3487 MNT_IUNLOCK(mp); 3488 mtx_lock(&mountlist_mtx); 3489 vfs_unbusy(mp); 3490 if (n == len) 3491 break; 3492 } 3493 mtx_unlock(&mountlist_mtx); 3494 3495 error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); 3496 free(xvn, M_TEMP); 3497 return (error); 3498} 3499 3500SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 3501 0, 0, sysctl_vnode, "S,xvnode", ""); 3502#endif 3503 3504/* 3505 * Unmount all filesystems. The list is traversed in reverse order 3506 * of mounting to avoid dependencies. 3507 */ 3508void 3509vfs_unmountall(void) 3510{ 3511 struct mount *mp; 3512 struct thread *td; 3513 int error; 3514 3515 CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); 3516 td = curthread; 3517 3518 /* 3519 * Since this only runs when rebooting, it is not interlocked. 3520 */ 3521 while (!TAILQ_EMPTY(&mountlist)) { 3522 mp = TAILQ_LAST(&mountlist, mntlist); 3523 error = dounmount(mp, MNT_FORCE, td); 3524 if (error) { 3525 TAILQ_REMOVE(&mountlist, mp, mnt_list); 3526 /* 3527 * XXX: Due to the way in which we mount the root 3528 * file system off of devfs, devfs will generate a 3529 * "busy" warning when we try to unmount it before 3530 * the root. Don't print a warning as a result in 3531 * order to avoid false positive errors that may 3532 * cause needless upset. 3533 */ 3534 if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) { 3535 printf("unmount of %s failed (", 3536 mp->mnt_stat.f_mntonname); 3537 if (error == EBUSY) 3538 printf("BUSY)\n"); 3539 else 3540 printf("%d)\n", error); 3541 } 3542 } else { 3543 /* The unmount has removed mp from the mountlist */ 3544 } 3545 } 3546} 3547 3548/* 3549 * Perform msync on all vnodes under a mount point; 3550 * the mount point must be locked. 3551 */ 3552void 3553vfs_msync(struct mount *mp, int flags) 3554{ 3555 struct vnode *vp, *mvp; 3556 struct vm_object *obj; 3557 3558 CTR2(KTR_VFS, "%s: mp %p", __func__, mp); 3559 MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) { 3560 obj = vp->v_object; 3561 if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 && 3562 (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) { 3563 if (!vget(vp, 3564 LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, 3565 curthread)) { 3566 if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ 3567 vput(vp); 3568 continue; 3569 } 3570 3571 obj = vp->v_object; 3572 if (obj != NULL) { 3573 VM_OBJECT_LOCK(obj); 3574 vm_object_page_clean(obj, 0, 0, 3575 flags == MNT_WAIT ? 3576 OBJPC_SYNC : OBJPC_NOSYNC); 3577 VM_OBJECT_UNLOCK(obj); 3578 } 3579 vput(vp); 3580 } 3581 } else 3582 VI_UNLOCK(vp); 3583 } 3584} 3585 3586static void 3587destroy_vpollinfo_free(struct vpollinfo *vi) 3588{ 3589 3590 knlist_destroy(&vi->vpi_selinfo.si_note); 3591 mtx_destroy(&vi->vpi_lock); 3592 uma_zfree(vnodepoll_zone, vi); 3593} 3594 3595static void 3596destroy_vpollinfo(struct vpollinfo *vi) 3597{ 3598 3599 knlist_clear(&vi->vpi_selinfo.si_note, 1); 3600 seldrain(&vi->vpi_selinfo); 3601 destroy_vpollinfo_free(vi); 3602} 3603 3604/* 3605 * Initialize per-vnode helper structure to hold poll-related state.
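 *
 * The structure is allocated before the vnode interlock is taken and
 * then rechecked under it, so two racing callers may both allocate
 * but only the first installs its copy; the loser quietly frees.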
3606 */ 3607void 3608v_addpollinfo(struct vnode *vp) 3609{ 3610 struct vpollinfo *vi; 3611 3612 if (vp->v_pollinfo != NULL) 3613 return; 3614 vi = uma_zalloc(vnodepoll_zone, M_WAITOK); 3615 mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); 3616 knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, 3617 vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked); 3618 VI_LOCK(vp); 3619 if (vp->v_pollinfo != NULL) { 3620 VI_UNLOCK(vp); 3621 destroy_vpollinfo_free(vi); 3622 return; 3623 } 3624 vp->v_pollinfo = vi; 3625 VI_UNLOCK(vp); 3626} 3627 3628/* 3629 * Record a process's interest in events which might happen to 3630 * a vnode. Because poll uses the historic select-style interface 3631 * internally, this routine serves as both the ``check for any 3632 * pending events'' and the ``record my interest in future events'' 3633 * functions. (These are done together, while the lock is held, 3634 * to avoid race conditions.) 3635 */ 3636int 3637vn_pollrecord(struct vnode *vp, struct thread *td, int events) 3638{ 3639 3640 v_addpollinfo(vp); 3641 mtx_lock(&vp->v_pollinfo->vpi_lock); 3642 if (vp->v_pollinfo->vpi_revents & events) { 3643 /* 3644 * This leaves events we are not interested 3645 * in available for the other process which 3646 * which presumably had requested them 3647 * (otherwise they would never have been 3648 * recorded). 3649 */ 3650 events &= vp->v_pollinfo->vpi_revents; 3651 vp->v_pollinfo->vpi_revents &= ~events; 3652 3653 mtx_unlock(&vp->v_pollinfo->vpi_lock); 3654 return (events); 3655 } 3656 vp->v_pollinfo->vpi_events |= events; 3657 selrecord(td, &vp->v_pollinfo->vpi_selinfo); 3658 mtx_unlock(&vp->v_pollinfo->vpi_lock); 3659 return (0); 3660} 3661 3662/* 3663 * Routine to create and manage a filesystem syncer vnode. 3664 */ 3665#define sync_close ((int (*)(struct vop_close_args *))nullop) 3666static int sync_fsync(struct vop_fsync_args *); 3667static int sync_inactive(struct vop_inactive_args *); 3668static int sync_reclaim(struct vop_reclaim_args *); 3669 3670static struct vop_vector sync_vnodeops = { 3671 .vop_bypass = VOP_EOPNOTSUPP, 3672 .vop_close = sync_close, /* close */ 3673 .vop_fsync = sync_fsync, /* fsync */ 3674 .vop_inactive = sync_inactive, /* inactive */ 3675 .vop_reclaim = sync_reclaim, /* reclaim */ 3676 .vop_lock1 = vop_stdlock, /* lock */ 3677 .vop_unlock = vop_stdunlock, /* unlock */ 3678 .vop_islocked = vop_stdislocked, /* islocked */ 3679}; 3680 3681/* 3682 * Create a new filesystem syncer vnode for the specified mount point. 3683 */ 3684void 3685vfs_allocate_syncvnode(struct mount *mp) 3686{ 3687 struct vnode *vp; 3688 struct bufobj *bo; 3689 static long start, incr, next; 3690 int error; 3691 3692 /* Allocate a new vnode */ 3693 error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); 3694 if (error != 0) 3695 panic("vfs_allocate_syncvnode: getnewvnode() failed"); 3696 vp->v_type = VNON; 3697 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 3698 vp->v_vflag |= VV_FORCEINSMQ; 3699 error = insmntque(vp, mp); 3700 if (error != 0) 3701 panic("vfs_allocate_syncvnode: insmntque() failed"); 3702 vp->v_vflag &= ~VV_FORCEINSMQ; 3703 VOP_UNLOCK(vp, 0); 3704 /* 3705 * Place the vnode onto the syncer worklist. We attempt to 3706 * scatter them about on the list so that they will go off 3707 * at evenly distributed times even if all the filesystems 3708 * are mounted at once. 
3709 */ 3710 next += incr; 3711 if (next == 0 || next > syncer_maxdelay) { 3712 start /= 2; 3713 incr /= 2; 3714 if (start == 0) { 3715 start = syncer_maxdelay / 2; 3716 incr = syncer_maxdelay; 3717 } 3718 next = start; 3719 } 3720 bo = &vp->v_bufobj; 3721 BO_LOCK(bo); 3722 vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); 3723 /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ 3724 mtx_lock(&sync_mtx); 3725 sync_vnode_count++; 3726 if (mp->mnt_syncer == NULL) { 3727 mp->mnt_syncer = vp; 3728 vp = NULL; 3729 } 3730 mtx_unlock(&sync_mtx); 3731 BO_UNLOCK(bo); 3732 if (vp != NULL) { 3733 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 3734 vgone(vp); 3735 vput(vp); 3736 } 3737} 3738 3739void 3740vfs_deallocate_syncvnode(struct mount *mp) 3741{ 3742 struct vnode *vp; 3743 3744 mtx_lock(&sync_mtx); 3745 vp = mp->mnt_syncer; 3746 if (vp != NULL) 3747 mp->mnt_syncer = NULL; 3748 mtx_unlock(&sync_mtx); 3749 if (vp != NULL) 3750 vrele(vp); 3751} 3752 3753/* 3754 * Do a lazy sync of the filesystem. 3755 */ 3756static int 3757sync_fsync(struct vop_fsync_args *ap) 3758{ 3759 struct vnode *syncvp = ap->a_vp; 3760 struct mount *mp = syncvp->v_mount; 3761 int error, save; 3762 struct bufobj *bo; 3763 3764 /* 3765 * We only need to do something if this is a lazy evaluation. 3766 */ 3767 if (ap->a_waitfor != MNT_LAZY) 3768 return (0); 3769 3770 /* 3771 * Move ourselves to the back of the sync list. 3772 */ 3773 bo = &syncvp->v_bufobj; 3774 BO_LOCK(bo); 3775 vn_syncer_add_to_worklist(bo, syncdelay); 3776 BO_UNLOCK(bo); 3777 3778 /* 3779 * Walk the list of vnodes pushing all that are dirty and 3780 * not already on the sync list. 3781 */ 3782 mtx_lock(&mountlist_mtx); 3783 if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) { 3784 mtx_unlock(&mountlist_mtx); 3785 return (0); 3786 } 3787 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { 3788 vfs_unbusy(mp); 3789 return (0); 3790 } 3791 save = curthread_pflags_set(TDP_SYNCIO); 3792 vfs_msync(mp, MNT_NOWAIT); 3793 error = VFS_SYNC(mp, MNT_LAZY); 3794 curthread_pflags_restore(save); 3795 vn_finished_write(mp); 3796 vfs_unbusy(mp); 3797 return (error); 3798} 3799 3800/* 3801 * The syncer vnode is no referenced. 3802 */ 3803static int 3804sync_inactive(struct vop_inactive_args *ap) 3805{ 3806 3807 vgone(ap->a_vp); 3808 return (0); 3809} 3810 3811/* 3812 * The syncer vnode is no longer needed and is being decommissioned. 3813 * 3814 * Modifications to the worklist must be protected by sync_mtx. 
3815 */ 3816static int 3817sync_reclaim(struct vop_reclaim_args *ap) 3818{ 3819 struct vnode *vp = ap->a_vp; 3820 struct bufobj *bo; 3821 3822 bo = &vp->v_bufobj; 3823 BO_LOCK(bo); 3824 mtx_lock(&sync_mtx); 3825 if (vp->v_mount->mnt_syncer == vp) 3826 vp->v_mount->mnt_syncer = NULL; 3827 if (bo->bo_flag & BO_ONWORKLST) { 3828 LIST_REMOVE(bo, bo_synclist); 3829 syncer_worklist_len--; 3830 sync_vnode_count--; 3831 bo->bo_flag &= ~BO_ONWORKLST; 3832 } 3833 mtx_unlock(&sync_mtx); 3834 BO_UNLOCK(bo); 3835 3836 return (0); 3837} 3838 3839/* 3840 * Check if vnode represents a disk device 3841 */ 3842int 3843vn_isdisk(struct vnode *vp, int *errp) 3844{ 3845 int error; 3846 3847 error = 0; 3848 dev_lock(); 3849 if (vp->v_type != VCHR) 3850 error = ENOTBLK; 3851 else if (vp->v_rdev == NULL) 3852 error = ENXIO; 3853 else if (vp->v_rdev->si_devsw == NULL) 3854 error = ENXIO; 3855 else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) 3856 error = ENOTBLK; 3857 dev_unlock(); 3858 if (errp != NULL) 3859 *errp = error; 3860 return (error == 0); 3861} 3862 3863/* 3864 * Common filesystem object access control check routine. Accepts a 3865 * vnode's type, "mode", uid and gid, requested access mode, credentials, 3866 * and optional call-by-reference privused argument allowing vaccess() 3867 * to indicate to the caller whether privilege was used to satisfy the 3868 * request (obsoleted). Returns 0 on success, or an errno on failure. 3869 */ 3870int 3871vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, 3872 accmode_t accmode, struct ucred *cred, int *privused) 3873{ 3874 accmode_t dac_granted; 3875 accmode_t priv_granted; 3876 3877 KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, 3878 ("invalid bit in accmode")); 3879 KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), 3880 ("VAPPEND without VWRITE")); 3881 3882 /* 3883 * Look for a normal, non-privileged way to access the file/directory 3884 * as requested. If it exists, go with that. 3885 */ 3886 3887 if (privused != NULL) 3888 *privused = 0; 3889 3890 dac_granted = 0; 3891 3892 /* Check the owner. */ 3893 if (cred->cr_uid == file_uid) { 3894 dac_granted |= VADMIN; 3895 if (file_mode & S_IXUSR) 3896 dac_granted |= VEXEC; 3897 if (file_mode & S_IRUSR) 3898 dac_granted |= VREAD; 3899 if (file_mode & S_IWUSR) 3900 dac_granted |= (VWRITE | VAPPEND); 3901 3902 if ((accmode & dac_granted) == accmode) 3903 return (0); 3904 3905 goto privcheck; 3906 } 3907 3908 /* Otherwise, check the groups (first match) */ 3909 if (groupmember(file_gid, cred)) { 3910 if (file_mode & S_IXGRP) 3911 dac_granted |= VEXEC; 3912 if (file_mode & S_IRGRP) 3913 dac_granted |= VREAD; 3914 if (file_mode & S_IWGRP) 3915 dac_granted |= (VWRITE | VAPPEND); 3916 3917 if ((accmode & dac_granted) == accmode) 3918 return (0); 3919 3920 goto privcheck; 3921 } 3922 3923 /* Otherwise, check everyone else. */ 3924 if (file_mode & S_IXOTH) 3925 dac_granted |= VEXEC; 3926 if (file_mode & S_IROTH) 3927 dac_granted |= VREAD; 3928 if (file_mode & S_IWOTH) 3929 dac_granted |= (VWRITE | VAPPEND); 3930 if ((accmode & dac_granted) == accmode) 3931 return (0); 3932 3933privcheck: 3934 /* 3935 * Build a privilege mask to determine if the set of privileges 3936 * satisfies the requirements when combined with the granted mask 3937 * from above. For each privilege, if the privilege is required, 3938 * bitwise or the request type onto the priv_granted mask. 
3939	 */ 3940	priv_granted = 0; 3941 3942	if (type == VDIR) { 3943		/* 3944		 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC 3945		 * requests, instead of PRIV_VFS_EXEC. 3946		 */ 3947		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 3948		    !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0)) 3949			priv_granted |= VEXEC; 3950	} else { 3951		/* 3952		 * Ensure that at least one execute bit is on.  Otherwise, 3953		 * a privileged user will always succeed, and we don't want 3954		 * this to happen unless the file really is executable. 3955		 */ 3956		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && 3957		    (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && 3958		    !priv_check_cred(cred, PRIV_VFS_EXEC, 0)) 3959			priv_granted |= VEXEC; 3960	} 3961 3962	if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && 3963	    !priv_check_cred(cred, PRIV_VFS_READ, 0)) 3964		priv_granted |= VREAD; 3965 3966	if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && 3967	    !priv_check_cred(cred, PRIV_VFS_WRITE, 0)) 3968		priv_granted |= (VWRITE | VAPPEND); 3969 3970	if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && 3971	    !priv_check_cred(cred, PRIV_VFS_ADMIN, 0)) 3972		priv_granted |= VADMIN; 3973 3974	if ((accmode & (priv_granted | dac_granted)) == accmode) { 3975		/* XXX audit: privilege used */ 3976		if (privused != NULL) 3977			*privused = 1; 3978		return (0); 3979	} 3980 3981	return ((accmode & VADMIN) ? EPERM : EACCES); 3982} 3983 3984/* 3985 * Credential check based on process requesting service, and per-attribute 3986 * permissions. 3987 */ 3988int 3989extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, 3990    struct thread *td, accmode_t accmode) 3991{ 3992 3993	/* 3994	 * Kernel-invoked requests always succeed. 3995	 */ 3996	if (cred == NOCRED) 3997		return (0); 3998 3999	/* 4000	 * Do not allow privileged processes in jail to directly manipulate 4001	 * system attributes. 4002	 */ 4003	switch (attrnamespace) { 4004	case EXTATTR_NAMESPACE_SYSTEM: 4005		/* Potentially should be: return (EPERM); */ 4006		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0)); 4007	case EXTATTR_NAMESPACE_USER: 4008		return (VOP_ACCESS(vp, accmode, cred, td)); 4009	default: 4010		return (EPERM); 4011	} 4012} 4013 4014#ifdef DEBUG_VFS_LOCKS 4015/* 4016 * This only exists to suppress warnings from unlocked specfs accesses.  It is 4017 * no longer ok to have an unlocked VFS. 4018 */ 4019#define	IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL ||	\ 4020	(vp)->v_type == VCHR || (vp)->v_type == VBAD) 4021 4022int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */ 4023SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, 4024    "Drop into debugger on lock violation"); 4025 4026int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */ 4027SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 4028    0, "Check for interlock across VOPs"); 4029 4030int vfs_badlock_print = 1;	/* Print lock violations. */ 4031SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 4032    0, "Print lock violations"); 4033 4034#ifdef KDB 4035int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations.
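				 * Like the other vfs_badlock knobs above,
				 * tunable at runtime via its debug.* sysctl.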
*/ 4036SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, 4037 &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); 4038#endif 4039 4040static void 4041vfs_badlock(const char *msg, const char *str, struct vnode *vp) 4042{ 4043 4044#ifdef KDB 4045 if (vfs_badlock_backtrace) 4046 kdb_backtrace(); 4047#endif 4048 if (vfs_badlock_print) 4049 printf("%s: %p %s\n", str, (void *)vp, msg); 4050 if (vfs_badlock_ddb) 4051 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 4052} 4053 4054void 4055assert_vi_locked(struct vnode *vp, const char *str) 4056{ 4057 4058 if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) 4059 vfs_badlock("interlock is not locked but should be", str, vp); 4060} 4061 4062void 4063assert_vi_unlocked(struct vnode *vp, const char *str) 4064{ 4065 4066 if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) 4067 vfs_badlock("interlock is locked but should not be", str, vp); 4068} 4069 4070void 4071assert_vop_locked(struct vnode *vp, const char *str) 4072{ 4073 int locked; 4074 4075 if (!IGNORE_LOCK(vp)) { 4076 locked = VOP_ISLOCKED(vp); 4077 if (locked == 0 || locked == LK_EXCLOTHER) 4078 vfs_badlock("is not locked but should be", str, vp); 4079 } 4080} 4081 4082void 4083assert_vop_unlocked(struct vnode *vp, const char *str) 4084{ 4085 4086 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) 4087 vfs_badlock("is locked but should not be", str, vp); 4088} 4089 4090void 4091assert_vop_elocked(struct vnode *vp, const char *str) 4092{ 4093 4094 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) 4095 vfs_badlock("is not exclusive locked but should be", str, vp); 4096} 4097 4098#if 0 4099void 4100assert_vop_elocked_other(struct vnode *vp, const char *str) 4101{ 4102 4103 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER) 4104 vfs_badlock("is not exclusive locked by another thread", 4105 str, vp); 4106} 4107 4108void 4109assert_vop_slocked(struct vnode *vp, const char *str) 4110{ 4111 4112 if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED) 4113 vfs_badlock("is not locked shared but should be", str, vp); 4114} 4115#endif /* 0 */ 4116#endif /* DEBUG_VFS_LOCKS */ 4117 4118void 4119vop_rename_fail(struct vop_rename_args *ap) 4120{ 4121 4122 if (ap->a_tvp != NULL) 4123 vput(ap->a_tvp); 4124 if (ap->a_tdvp == ap->a_tvp) 4125 vrele(ap->a_tdvp); 4126 else 4127 vput(ap->a_tdvp); 4128 vrele(ap->a_fdvp); 4129 vrele(ap->a_fvp); 4130} 4131 4132void 4133vop_rename_pre(void *ap) 4134{ 4135 struct vop_rename_args *a = ap; 4136 4137#ifdef DEBUG_VFS_LOCKS 4138 if (a->a_tvp) 4139 ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); 4140 ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); 4141 ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); 4142 ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); 4143 4144 /* Check the source (from). */ 4145 if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && 4146 (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) 4147 ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); 4148 if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) 4149 ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); 4150 4151 /* Check the target. 
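	 * VOP_RENAME expects the target directory, and the target vnode
	 * if one exists, to be locked on entry; the assertions below
	 * verify that.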
*/ 4152 if (a->a_tvp) 4153 ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); 4154 ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); 4155#endif 4156 if (a->a_tdvp != a->a_fdvp) 4157 vhold(a->a_fdvp); 4158 if (a->a_tvp != a->a_fvp) 4159 vhold(a->a_fvp); 4160 vhold(a->a_tdvp); 4161 if (a->a_tvp) 4162 vhold(a->a_tvp); 4163} 4164 4165void 4166vop_strategy_pre(void *ap) 4167{ 4168#ifdef DEBUG_VFS_LOCKS 4169 struct vop_strategy_args *a; 4170 struct buf *bp; 4171 4172 a = ap; 4173 bp = a->a_bp; 4174 4175 /* 4176 * Cluster ops lock their component buffers but not the IO container. 4177 */ 4178 if ((bp->b_flags & B_CLUSTER) != 0) 4179 return; 4180 4181 if (panicstr == NULL && !BUF_ISLOCKED(bp)) { 4182 if (vfs_badlock_print) 4183 printf( 4184 "VOP_STRATEGY: bp is not locked but should be\n"); 4185 if (vfs_badlock_ddb) 4186 kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); 4187 } 4188#endif 4189} 4190 4191void 4192vop_lock_pre(void *ap) 4193{ 4194#ifdef DEBUG_VFS_LOCKS 4195 struct vop_lock1_args *a = ap; 4196 4197 if ((a->a_flags & LK_INTERLOCK) == 0) 4198 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 4199 else 4200 ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); 4201#endif 4202} 4203 4204void 4205vop_lock_post(void *ap, int rc) 4206{ 4207#ifdef DEBUG_VFS_LOCKS 4208 struct vop_lock1_args *a = ap; 4209 4210 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); 4211 if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) 4212 ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); 4213#endif 4214} 4215 4216void 4217vop_unlock_pre(void *ap) 4218{ 4219#ifdef DEBUG_VFS_LOCKS 4220 struct vop_unlock_args *a = ap; 4221 4222 if (a->a_flags & LK_INTERLOCK) 4223 ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK"); 4224 ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); 4225#endif 4226} 4227 4228void 4229vop_unlock_post(void *ap, int rc) 4230{ 4231#ifdef DEBUG_VFS_LOCKS 4232 struct vop_unlock_args *a = ap; 4233 4234 if (a->a_flags & LK_INTERLOCK) 4235 ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK"); 4236#endif 4237} 4238 4239void 4240vop_create_post(void *ap, int rc) 4241{ 4242 struct vop_create_args *a = ap; 4243 4244 if (!rc) 4245 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4246} 4247 4248void 4249vop_deleteextattr_post(void *ap, int rc) 4250{ 4251 struct vop_deleteextattr_args *a = ap; 4252 4253 if (!rc) 4254 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 4255} 4256 4257void 4258vop_link_post(void *ap, int rc) 4259{ 4260 struct vop_link_args *a = ap; 4261 4262 if (!rc) { 4263 VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); 4264 VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE); 4265 } 4266} 4267 4268void 4269vop_mkdir_post(void *ap, int rc) 4270{ 4271 struct vop_mkdir_args *a = ap; 4272 4273 if (!rc) 4274 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); 4275} 4276 4277void 4278vop_mknod_post(void *ap, int rc) 4279{ 4280 struct vop_mknod_args *a = ap; 4281 4282 if (!rc) 4283 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4284} 4285 4286void 4287vop_remove_post(void *ap, int rc) 4288{ 4289 struct vop_remove_args *a = ap; 4290 4291 if (!rc) { 4292 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4293 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); 4294 } 4295} 4296 4297void 4298vop_rename_post(void *ap, int rc) 4299{ 4300 struct vop_rename_args *a = ap; 4301 4302 if (!rc) { 4303 VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE); 4304 VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE); 4305 VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); 4306 if (a->a_tvp) 4307 VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); 4308 } 4309 if (a->a_tdvp != a->a_fdvp) 4310 vdrop(a->a_fdvp); 4311 if (a->a_tvp != a->a_fvp) 4312 vdrop(a->a_fvp); 4313 vdrop(a->a_tdvp); 4314 
if (a->a_tvp) 4315 vdrop(a->a_tvp); 4316} 4317 4318void 4319vop_rmdir_post(void *ap, int rc) 4320{ 4321 struct vop_rmdir_args *a = ap; 4322 4323 if (!rc) { 4324 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); 4325 VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); 4326 } 4327} 4328 4329void 4330vop_setattr_post(void *ap, int rc) 4331{ 4332 struct vop_setattr_args *a = ap; 4333 4334 if (!rc) 4335 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 4336} 4337 4338void 4339vop_setextattr_post(void *ap, int rc) 4340{ 4341 struct vop_setextattr_args *a = ap; 4342 4343 if (!rc) 4344 VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); 4345} 4346 4347void 4348vop_symlink_post(void *ap, int rc) 4349{ 4350 struct vop_symlink_args *a = ap; 4351 4352 if (!rc) 4353 VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 4354} 4355 4356static struct knlist fs_knlist; 4357 4358static void 4359vfs_event_init(void *arg) 4360{ 4361 knlist_init_mtx(&fs_knlist, NULL); 4362} 4363/* XXX - correct order? */ 4364SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); 4365 4366void 4367vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) 4368{ 4369 4370 KNOTE_UNLOCKED(&fs_knlist, event); 4371} 4372 4373static int filt_fsattach(struct knote *kn); 4374static void filt_fsdetach(struct knote *kn); 4375static int filt_fsevent(struct knote *kn, long hint); 4376 4377struct filterops fs_filtops = { 4378 .f_isfd = 0, 4379 .f_attach = filt_fsattach, 4380 .f_detach = filt_fsdetach, 4381 .f_event = filt_fsevent 4382}; 4383 4384static int 4385filt_fsattach(struct knote *kn) 4386{ 4387 4388 kn->kn_flags |= EV_CLEAR; 4389 knlist_add(&fs_knlist, kn, 0); 4390 return (0); 4391} 4392 4393static void 4394filt_fsdetach(struct knote *kn) 4395{ 4396 4397 knlist_remove(&fs_knlist, kn, 0); 4398} 4399 4400static int 4401filt_fsevent(struct knote *kn, long hint) 4402{ 4403 4404 kn->kn_fflags |= hint; 4405 return (kn->kn_fflags != 0); 4406} 4407 4408static int 4409sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) 4410{ 4411 struct vfsidctl vc; 4412 int error; 4413 struct mount *mp; 4414 4415 error = SYSCTL_IN(req, &vc, sizeof(vc)); 4416 if (error) 4417 return (error); 4418 if (vc.vc_vers != VFS_CTL_VERS1) 4419 return (EINVAL); 4420 mp = vfs_getvfs(&vc.vc_fsid); 4421 if (mp == NULL) 4422 return (ENOENT); 4423 /* ensure that a specific sysctl goes to the right filesystem. */ 4424 if (strcmp(vc.vc_fstypename, "*") != 0 && 4425 strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { 4426 vfs_rel(mp); 4427 return (EINVAL); 4428 } 4429 VCTLTOREQ(&vc, req); 4430 error = VFS_SYSCTL(mp, vc.vc_op, req); 4431 vfs_rel(mp); 4432 return (error); 4433} 4434 4435SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR, 4436 NULL, 0, sysctl_vfs_ctl, "", 4437 "Sysctl by fsid"); 4438 4439/* 4440 * Function to initialize a va_filerev field sensibly. 4441 * XXX: Wouldn't a random number make a lot more sense ?? 
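 * As implemented below, the high 32 bits hold the seconds of boot uptime
 * and the low 32 bits hold the top half of the fractional part, so values
 * returned within a single boot never decrease.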
4442 */ 4443u_quad_t 4444init_va_filerev(void) 4445{ 4446 struct bintime bt; 4447 4448 getbinuptime(&bt); 4449 return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); 4450} 4451 4452static int filt_vfsread(struct knote *kn, long hint); 4453static int filt_vfswrite(struct knote *kn, long hint); 4454static int filt_vfsvnode(struct knote *kn, long hint); 4455static void filt_vfsdetach(struct knote *kn); 4456static struct filterops vfsread_filtops = { 4457 .f_isfd = 1, 4458 .f_detach = filt_vfsdetach, 4459 .f_event = filt_vfsread 4460}; 4461static struct filterops vfswrite_filtops = { 4462 .f_isfd = 1, 4463 .f_detach = filt_vfsdetach, 4464 .f_event = filt_vfswrite 4465}; 4466static struct filterops vfsvnode_filtops = { 4467 .f_isfd = 1, 4468 .f_detach = filt_vfsdetach, 4469 .f_event = filt_vfsvnode 4470}; 4471 4472static void 4473vfs_knllock(void *arg) 4474{ 4475 struct vnode *vp = arg; 4476 4477 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 4478} 4479 4480static void 4481vfs_knlunlock(void *arg) 4482{ 4483 struct vnode *vp = arg; 4484 4485 VOP_UNLOCK(vp, 0); 4486} 4487 4488static void 4489vfs_knl_assert_locked(void *arg) 4490{ 4491#ifdef DEBUG_VFS_LOCKS 4492 struct vnode *vp = arg; 4493 4494 ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); 4495#endif 4496} 4497 4498static void 4499vfs_knl_assert_unlocked(void *arg) 4500{ 4501#ifdef DEBUG_VFS_LOCKS 4502 struct vnode *vp = arg; 4503 4504 ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); 4505#endif 4506} 4507 4508int 4509vfs_kqfilter(struct vop_kqfilter_args *ap) 4510{ 4511 struct vnode *vp = ap->a_vp; 4512 struct knote *kn = ap->a_kn; 4513 struct knlist *knl; 4514 4515 switch (kn->kn_filter) { 4516 case EVFILT_READ: 4517 kn->kn_fop = &vfsread_filtops; 4518 break; 4519 case EVFILT_WRITE: 4520 kn->kn_fop = &vfswrite_filtops; 4521 break; 4522 case EVFILT_VNODE: 4523 kn->kn_fop = &vfsvnode_filtops; 4524 break; 4525 default: 4526 return (EINVAL); 4527 } 4528 4529 kn->kn_hook = (caddr_t)vp; 4530 4531 v_addpollinfo(vp); 4532 if (vp->v_pollinfo == NULL) 4533 return (ENOMEM); 4534 knl = &vp->v_pollinfo->vpi_selinfo.si_note; 4535 vhold(vp); 4536 knlist_add(knl, kn, 0); 4537 4538 return (0); 4539} 4540 4541/* 4542 * Detach knote from vnode 4543 */ 4544static void 4545filt_vfsdetach(struct knote *kn) 4546{ 4547 struct vnode *vp = (struct vnode *)kn->kn_hook; 4548 4549 KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); 4550 knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); 4551 vdrop(vp); 4552} 4553 4554/*ARGSUSED*/ 4555static int 4556filt_vfsread(struct knote *kn, long hint) 4557{ 4558 struct vnode *vp = (struct vnode *)kn->kn_hook; 4559 struct vattr va; 4560 int res; 4561 4562 /* 4563 * filesystem is gone, so set the EOF flag and schedule 4564 * the knote for deletion. 4565 */ 4566 if (hint == NOTE_REVOKE) { 4567 VI_LOCK(vp); 4568 kn->kn_flags |= (EV_EOF | EV_ONESHOT); 4569 VI_UNLOCK(vp); 4570 return (1); 4571 } 4572 4573 if (VOP_GETATTR(vp, &va, curthread->td_ucred)) 4574 return (0); 4575 4576 VI_LOCK(vp); 4577 kn->kn_data = va.va_size - kn->kn_fp->f_offset; 4578 res = (kn->kn_data != 0); 4579 VI_UNLOCK(vp); 4580 return (res); 4581} 4582 4583/*ARGSUSED*/ 4584static int 4585filt_vfswrite(struct knote *kn, long hint) 4586{ 4587 struct vnode *vp = (struct vnode *)kn->kn_hook; 4588 4589 VI_LOCK(vp); 4590 4591 /* 4592 * filesystem is gone, so set the EOF flag and schedule 4593 * the knote for deletion. 
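	 * Otherwise a vnode is treated as unconditionally writable:
	 * kn_data is cleared and the filter always returns true.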
4594	 */ 4595	if (hint == NOTE_REVOKE) 4596		kn->kn_flags |= (EV_EOF | EV_ONESHOT); 4597 4598	kn->kn_data = 0; 4599	VI_UNLOCK(vp); 4600	return (1); 4601} 4602 4603static int 4604filt_vfsvnode(struct knote *kn, long hint) 4605{ 4606	struct vnode *vp = (struct vnode *)kn->kn_hook; 4607	int res; 4608 4609	VI_LOCK(vp); 4610	if (kn->kn_sfflags & hint) 4611		kn->kn_fflags |= hint; 4612	if (hint == NOTE_REVOKE) { 4613		kn->kn_flags |= EV_EOF; 4614		VI_UNLOCK(vp); 4615		return (1); 4616	} 4617	res = (kn->kn_fflags != 0); 4618	VI_UNLOCK(vp); 4619	return (res); 4620} 4621 4622int 4623vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) 4624{ 4625	int error; 4626 4627	if (dp->d_reclen > ap->a_uio->uio_resid) 4628		return (ENAMETOOLONG); 4629	error = uiomove(dp, dp->d_reclen, ap->a_uio); 4630	if (error) { 4631		if (ap->a_ncookies != NULL) { 4632			if (ap->a_cookies != NULL) 4633				free(ap->a_cookies, M_TEMP); 4634			ap->a_cookies = NULL; 4635			*ap->a_ncookies = 0; 4636		} 4637		return (error); 4638	} 4639	if (ap->a_ncookies == NULL) 4640		return (0); 4641 4642	KASSERT(ap->a_cookies, 4643	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); 4644 4645	*ap->a_cookies = realloc(*ap->a_cookies, 4646	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); 4647	(*ap->a_cookies)[*ap->a_ncookies] = off; 4648	return (0); 4649} 4650 4651/* 4652 * Mark the access time of the file for update if the filesystem 4653 * supports VOP_MARKATIME.  This functionality is used by execve and 4654 * mmap, so we want to avoid the I/O implied by directly setting 4655 * va_atime for the sake of efficiency. 4656 */ 4657void 4658vfs_mark_atime(struct vnode *vp, struct ucred *cred) 4659{ 4660	struct mount *mp; 4661 4662	mp = vp->v_mount; 4663	VFS_ASSERT_GIANT(mp); 4664	ASSERT_VOP_LOCKED(vp, "vfs_mark_atime"); 4665	if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) 4666		(void)VOP_MARKATIME(vp); 4667} 4668 4669/* 4670 * The purpose of this routine is to remove granularity from accmode_t, 4671 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, 4672 * VADMIN and VAPPEND. 4673 * 4674 * If it returns 0, the caller is supposed to continue with the usual 4675 * access checks using 'accmode' as modified by this routine.  If it 4676 * returns a nonzero value, the caller is supposed to return that value 4677 * as errno. 4678 * 4679 * Note that after this routine runs, accmode may be zero. 4680 */ 4681int 4682vfs_unixify_accmode(accmode_t *accmode) 4683{ 4684	/* 4685	 * There is no way to specify an explicit "deny" rule using 4686	 * file mode or POSIX.1e ACLs. 4687	 */ 4688	if (*accmode & VEXPLICIT_DENY) { 4689		*accmode = 0; 4690		return (0); 4691	} 4692 4693	/* 4694	 * None of these can be translated into usual access bits. 4695	 * Also, the common case for NFSv4 ACLs is to not contain 4696	 * either of these bits.  Caller should check for VWRITE 4697	 * on the containing directory instead. 4698	 */ 4699	if (*accmode & (VDELETE_CHILD | VDELETE)) 4700		return (EPERM); 4701 4702	if (*accmode & VADMIN_PERMS) { 4703		*accmode &= ~VADMIN_PERMS; 4704		*accmode |= VADMIN; 4705	} 4706 4707	/* 4708	 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL 4709	 * or VSYNCHRONIZE using file mode or POSIX.1e ACLs. 4710	 */ 4711	*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); 4712 4713	return (0); 4714} 4715 4716/* 4717 * These are helper functions for filesystems to traverse all 4718 * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h. 4719 * 4720 * This interface replaces MNT_VNODE_FOREACH.
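 *
 * A sketch of typical use, assuming the MNT_VNODE_FOREACH_ALL() macro
 * shape referenced above (the iterator hands back each vnode with its
 * interlock held, so the caller must VI_UNLOCK() it; leaving the loop
 * early requires MNT_VNODE_FOREACH_ALL_ABORT() so the marker vnode is
 * unhooked and freed):
 *
 *	struct vnode *mvp, *vp;
 *
 *	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 *		if (vp->v_type != VREG) {
 *			VI_UNLOCK(vp);
 *			continue;
 *		}
 *		... examine or vget() vp ...
 *		VI_UNLOCK(vp);
 *	}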
4721 */ 4722 4723MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); 4724 4725struct vnode * 4726__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) 4727{ 4728 struct vnode *vp; 4729 4730 if (should_yield()) 4731 kern_yield(PRI_UNCHANGED); 4732 MNT_ILOCK(mp); 4733 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 4734 vp = TAILQ_NEXT(*mvp, v_nmntvnodes); 4735 while (vp != NULL && (vp->v_type == VMARKER || 4736 (vp->v_iflag & VI_DOOMED) != 0)) 4737 vp = TAILQ_NEXT(vp, v_nmntvnodes); 4738 4739 /* Check if we are done */ 4740 if (vp == NULL) { 4741 __mnt_vnode_markerfree_all(mvp, mp); 4742 /* MNT_IUNLOCK(mp); -- done in above function */ 4743 mtx_assert(MNT_MTX(mp), MA_NOTOWNED); 4744 return (NULL); 4745 } 4746 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 4747 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 4748 VI_LOCK(vp); 4749 MNT_IUNLOCK(mp); 4750 return (vp); 4751} 4752 4753struct vnode * 4754__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) 4755{ 4756 struct vnode *vp; 4757 4758 *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); 4759 MNT_ILOCK(mp); 4760 MNT_REF(mp); 4761 (*mvp)->v_type = VMARKER; 4762 4763 vp = TAILQ_FIRST(&mp->mnt_nvnodelist); 4764 while (vp != NULL && (vp->v_type == VMARKER || 4765 (vp->v_iflag & VI_DOOMED) != 0)) 4766 vp = TAILQ_NEXT(vp, v_nmntvnodes); 4767 4768 /* Check if we are done */ 4769 if (vp == NULL) { 4770 MNT_REL(mp); 4771 MNT_IUNLOCK(mp); 4772 free(*mvp, M_VNODE_MARKER); 4773 *mvp = NULL; 4774 return (NULL); 4775 } 4776 (*mvp)->v_mount = mp; 4777 TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); 4778 VI_LOCK(vp); 4779 MNT_IUNLOCK(mp); 4780 return (vp); 4781} 4782 4783 4784void 4785__mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) 4786{ 4787 4788 if (*mvp == NULL) { 4789 MNT_IUNLOCK(mp); 4790 return; 4791 } 4792 4793 mtx_assert(MNT_MTX(mp), MA_OWNED); 4794 4795 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 4796 TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); 4797 MNT_REL(mp); 4798 MNT_IUNLOCK(mp); 4799 free(*mvp, M_VNODE_MARKER); 4800 *mvp = NULL; 4801} 4802 4803/* 4804 * These are helper functions for filesystems to traverse their 4805 * active vnodes. 
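 * The usage pattern mirrors the sketch above for the full-list iterator:
 * each vnode is returned with its interlock held, and the caller must
 * VI_UNLOCK() it before moving on.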
See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h 4806 */ 4807static void 4808mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) 4809{ 4810 4811 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 4812 4813 MNT_ILOCK(mp); 4814 MNT_REL(mp); 4815 MNT_IUNLOCK(mp); 4816 free(*mvp, M_VNODE_MARKER); 4817 *mvp = NULL; 4818} 4819 4820#ifdef SMP 4821#define ALWAYS_YIELD (mp_ncpus == 1) 4822#else 4823#define ALWAYS_YIELD 1 4824#endif 4825 4826static struct vnode * 4827mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) 4828{ 4829 struct vnode *vp, *nvp; 4830 4831 mtx_assert(&vnode_free_list_mtx, MA_OWNED); 4832 KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); 4833restart: 4834 vp = TAILQ_NEXT(*mvp, v_actfreelist); 4835 TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); 4836 while (vp != NULL) { 4837 if (vp->v_type == VMARKER) { 4838 vp = TAILQ_NEXT(vp, v_actfreelist); 4839 continue; 4840 } 4841 if (!VI_TRYLOCK(vp)) { 4842 if (ALWAYS_YIELD || should_yield()) { 4843 TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist); 4844 mtx_unlock(&vnode_free_list_mtx); 4845 pause("vnacti", 1); 4846 mtx_lock(&vnode_free_list_mtx); 4847 goto restart; 4848 } 4849 continue; 4850 } 4851 KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); 4852 KASSERT(vp->v_mount == mp || vp->v_mount == NULL, 4853 ("alien vnode on the active list %p %p", vp, mp)); 4854 if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0) 4855 break; 4856 nvp = TAILQ_NEXT(vp, v_actfreelist); 4857 VI_UNLOCK(vp); 4858 vp = nvp; 4859 } 4860 4861 /* Check if we are done */ 4862 if (vp == NULL) { 4863 mtx_unlock(&vnode_free_list_mtx); 4864 mnt_vnode_markerfree_active(mvp, mp); 4865 return (NULL); 4866 } 4867 TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist); 4868 mtx_unlock(&vnode_free_list_mtx); 4869 ASSERT_VI_LOCKED(vp, "active iter"); 4870 KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp)); 4871 return (vp); 4872} 4873#undef ALWAYS_YIELD 4874 4875struct vnode * 4876__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp) 4877{ 4878 4879 if (should_yield()) 4880 kern_yield(PRI_UNCHANGED); 4881 mtx_lock(&vnode_free_list_mtx); 4882 return (mnt_vnode_next_active(mvp, mp)); 4883} 4884 4885struct vnode * 4886__mnt_vnode_first_active(struct vnode **mvp, struct mount *mp) 4887{ 4888 struct vnode *vp; 4889 4890 *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); 4891 MNT_ILOCK(mp); 4892 MNT_REF(mp); 4893 MNT_IUNLOCK(mp); 4894 (*mvp)->v_type = VMARKER; 4895 (*mvp)->v_mount = mp; 4896 4897 mtx_lock(&vnode_free_list_mtx); 4898 vp = TAILQ_FIRST(&mp->mnt_activevnodelist); 4899 if (vp == NULL) { 4900 mtx_unlock(&vnode_free_list_mtx); 4901 mnt_vnode_markerfree_active(mvp, mp); 4902 return (NULL); 4903 } 4904 TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist); 4905 return (mnt_vnode_next_active(mvp, mp)); 4906} 4907 4908void 4909__mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp) 4910{ 4911 4912 if (*mvp == NULL) 4913 return; 4914 4915 mtx_lock(&vnode_free_list_mtx); 4916 TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist); 4917 mtx_unlock(&vnode_free_list_mtx); 4918 mnt_vnode_markerfree_active(mvp, mp); 4919} 4920