1/* $NetBSD: vfs_vnode.c,v 1.15 2011/12/20 16:49:37 hannken Exp $ */ 2 3/*- 4 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 23 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 * POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33/* 34 * Copyright (c) 1989, 1993 35 * The Regents of the University of California. All rights reserved. 36 * (c) UNIX System Laboratories, Inc. 37 * All or some portions of this file are derived from material licensed 38 * to the University of California by American Telephone and Telegraph 39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 40 * the permission of UNIX System Laboratories, Inc. 41 * 42 * Redistribution and use in source and binary forms, with or without 43 * modification, are permitted provided that the following conditions 44 * are met: 45 * 1. Redistributions of source code must retain the above copyright 46 * notice, this list of conditions and the following disclaimer. 47 * 2. Redistributions in binary form must reproduce the above copyright 48 * notice, this list of conditions and the following disclaimer in the 49 * documentation and/or other materials provided with the distribution. 50 * 3. Neither the name of the University nor the names of its contributors 51 * may be used to endorse or promote products derived from this software 52 * without specific prior written permission. 53 * 54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 57 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 64 * SUCH DAMAGE. 65 * 66 * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94 67 */ 68 69/* 70 * The vnode cache subsystem. 71 * 72 * Life-cycle 73 * 74 * Normally, there are two points where new vnodes are created: 75 * VOP_CREATE(9) and VOP_LOOKUP(9). The life-cycle of a vnode 76 * starts in one of the following ways: 77 * 78 * - Allocation, via getnewvnode(9) and/or vnalloc(9). 79 * - Reclamation of inactive vnode, via vget(9). 80 * 81 * The life-cycle ends when the last reference is dropped, usually 82 * in VOP_REMOVE(9). In such case, VOP_INACTIVE(9) is called to inform 83 * the file system that vnode is inactive. Via this call, file system 84 * indicates whether vnode should be recycled (usually, count of links 85 * is checked i.e. whether file was removed). 86 * 87 * Depending on indication, vnode can be put into a free list (cache), 88 * or cleaned via vclean(9), which calls VOP_RECLAIM(9) to disassociate 89 * underlying file system from the vnode, and finally destroyed. 90 * 91 * Reference counting 92 * 93 * Vnode is considered active, if reference count (vnode_t::v_usecount) 94 * is non-zero. It is maintained using: vref(9) and vrele(9), as well 95 * as vput(9), routines. Common points holding references are e.g. 96 * file openings, current working directory, mount points, etc. 97 * 98 * Note on v_usecount and its locking 99 * 100 * At nearly all points it is known that v_usecount could be zero, 101 * the vnode_t::v_interlock will be held. To change v_usecount away 102 * from zero, the interlock must be held. To change from a non-zero 103 * value to zero, again the interlock must be held. 104 * 105 * There is a flag bit, VC_XLOCK, embedded in v_usecount. To raise 106 * v_usecount, if the VC_XLOCK bit is set in it, the interlock must 107 * be held. To modify the VC_XLOCK bit, the interlock must be held. 108 * We always keep the usecount (v_usecount & VC_MASK) non-zero while 109 * the VC_XLOCK bit is set. 110 * 111 * Unless the VC_XLOCK bit is set, changing the usecount from a non-zero 112 * value to a non-zero value can safely be done using atomic operations, 113 * without the interlock held. 114 * 115 * Even if the VC_XLOCK bit is set, decreasing the usecount to a non-zero 116 * value can be done using atomic operations, without the interlock held. 117 * 118 * Note: if VI_CLEAN is set, vnode_t::v_interlock will be released while 119 * mntvnode_lock is still held. 120 */ 121 122#include <sys/cdefs.h> 123__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.15 2011/12/20 16:49:37 hannken Exp $"); 124 125#include <sys/param.h> 126#include <sys/kernel.h> 127 128#include <sys/atomic.h> 129#include <sys/buf.h> 130#include <sys/conf.h> 131#include <sys/device.h> 132#include <sys/kauth.h> 133#include <sys/kmem.h> 134#include <sys/kthread.h> 135#include <sys/module.h> 136#include <sys/mount.h> 137#include <sys/namei.h> 138#include <sys/syscallargs.h> 139#include <sys/sysctl.h> 140#include <sys/systm.h> 141#include <sys/vnode.h> 142#include <sys/wapbl.h> 143 144#include <uvm/uvm.h> 145#include <uvm/uvm_readahead.h> 146 147u_int numvnodes __cacheline_aligned; 148 149static pool_cache_t vnode_cache __read_mostly; 150static kmutex_t vnode_free_list_lock __cacheline_aligned; 151 152static vnodelst_t vnode_free_list __cacheline_aligned; 153static vnodelst_t vnode_hold_list __cacheline_aligned; 154static vnodelst_t vrele_list __cacheline_aligned; 155 156static kmutex_t vrele_lock __cacheline_aligned; 157static kcondvar_t vrele_cv __cacheline_aligned; 158static lwp_t * vrele_lwp __cacheline_aligned; 159static int vrele_pending __cacheline_aligned; 160static int vrele_gen __cacheline_aligned; 161static kcondvar_t vdrain_cv __cacheline_aligned; 162 163static int cleanvnode(void); 164static void vdrain_thread(void *); 165static void vrele_thread(void *); 166static void vnpanic(vnode_t *, const char *, ...) 167 __attribute__((__format__(__printf__, 2, 3))); 168 169/* Routines having to do with the management of the vnode table. */ 170extern int (**dead_vnodeop_p)(void *); 171 172void 173vfs_vnode_sysinit(void) 174{ 175 int error; 176 177 vnode_cache = pool_cache_init(sizeof(vnode_t), 0, 0, 0, "vnodepl", 178 NULL, IPL_NONE, NULL, NULL, NULL); 179 KASSERT(vnode_cache != NULL); 180 181 mutex_init(&vnode_free_list_lock, MUTEX_DEFAULT, IPL_NONE); 182 TAILQ_INIT(&vnode_free_list); 183 TAILQ_INIT(&vnode_hold_list); 184 TAILQ_INIT(&vrele_list); 185 186 mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE); 187 cv_init(&vdrain_cv, "vdrain"); 188 cv_init(&vrele_cv, "vrele"); 189 error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vdrain_thread, 190 NULL, NULL, "vdrain"); 191 KASSERT(error == 0); 192 error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread, 193 NULL, &vrele_lwp, "vrele"); 194 KASSERT(error == 0); 195} 196 197/* 198 * Allocate a new, uninitialized vnode. If 'mp' is non-NULL, this is a 199 * marker vnode. 200 */ 201vnode_t * 202vnalloc(struct mount *mp) 203{ 204 vnode_t *vp; 205 206 vp = pool_cache_get(vnode_cache, PR_WAITOK); 207 KASSERT(vp != NULL); 208 209 memset(vp, 0, sizeof(*vp)); 210 uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 0); 211 cv_init(&vp->v_cv, "vnode"); 212 /* 213 * Done by memset() above. 214 * LIST_INIT(&vp->v_nclist); 215 * LIST_INIT(&vp->v_dnclist); 216 */ 217 218 if (mp != NULL) { 219 vp->v_mount = mp; 220 vp->v_type = VBAD; 221 vp->v_iflag = VI_MARKER; 222 } else { 223 rw_init(&vp->v_lock); 224 } 225 226 return vp; 227} 228 229/* 230 * Free an unused, unreferenced vnode. 231 */ 232void 233vnfree(vnode_t *vp) 234{ 235 236 KASSERT(vp->v_usecount == 0); 237 238 if ((vp->v_iflag & VI_MARKER) == 0) { 239 rw_destroy(&vp->v_lock); 240 mutex_enter(&vnode_free_list_lock); 241 numvnodes--; 242 mutex_exit(&vnode_free_list_lock); 243 } 244 245 /* 246 * Note: the vnode interlock will either be freed, of reference 247 * dropped (if VI_LOCKSHARE was in use). 248 */ 249 uvm_obj_destroy(&vp->v_uobj, true); 250 cv_destroy(&vp->v_cv); 251 pool_cache_put(vnode_cache, vp); 252} 253 254/* 255 * cleanvnode: grab a vnode from freelist, clean and free it. 256 * 257 * => Releases vnode_free_list_lock. 258 */ 259static int 260cleanvnode(void) 261{ 262 vnode_t *vp; 263 vnodelst_t *listhd; 264 265 KASSERT(mutex_owned(&vnode_free_list_lock)); 266retry: 267 listhd = &vnode_free_list; 268try_nextlist: 269 TAILQ_FOREACH(vp, listhd, v_freelist) { 270 /* 271 * It's safe to test v_usecount and v_iflag 272 * without holding the interlock here, since 273 * these vnodes should never appear on the 274 * lists. 275 */ 276 KASSERT(vp->v_usecount == 0); 277 KASSERT((vp->v_iflag & VI_CLEAN) == 0); 278 KASSERT(vp->v_freelisthd == listhd); 279 280 if (!mutex_tryenter(vp->v_interlock)) 281 continue; 282 if ((vp->v_iflag & VI_XLOCK) == 0) 283 break; 284 mutex_exit(vp->v_interlock); 285 } 286 287 if (vp == NULL) { 288 if (listhd == &vnode_free_list) { 289 listhd = &vnode_hold_list; 290 goto try_nextlist; 291 } 292 mutex_exit(&vnode_free_list_lock); 293 return EBUSY; 294 } 295 296 /* Remove it from the freelist. */ 297 TAILQ_REMOVE(listhd, vp, v_freelist); 298 vp->v_freelisthd = NULL; 299 mutex_exit(&vnode_free_list_lock); 300 301 KASSERT(vp->v_usecount == 0); 302 303 /* 304 * The vnode is still associated with a file system, so we must 305 * clean it out before freeing it. We need to add a reference 306 * before doing this. If the vnode gains another reference while 307 * being cleaned out then we lose - retry. 308 */ 309 atomic_add_int(&vp->v_usecount, 1 + VC_XLOCK); 310 vclean(vp, DOCLOSE); 311 KASSERT(vp->v_usecount >= 1 + VC_XLOCK); 312 atomic_add_int(&vp->v_usecount, -VC_XLOCK); 313 if (vp->v_usecount > 1) { 314 /* 315 * Don't return to freelist - the holder of the last 316 * reference will destroy it. 317 */ 318 vrelel(vp, 0); /* releases vp->v_interlock */ 319 mutex_enter(&vnode_free_list_lock); 320 goto retry; 321 } 322 323 KASSERT((vp->v_iflag & VI_CLEAN) == VI_CLEAN); 324 mutex_exit(vp->v_interlock); 325 if (vp->v_type == VBLK || vp->v_type == VCHR) { 326 spec_node_destroy(vp); 327 } 328 vp->v_type = VNON; 329 330 KASSERT(vp->v_data == NULL); 331 KASSERT(vp->v_uobj.uo_npages == 0); 332 KASSERT(TAILQ_EMPTY(&vp->v_uobj.memq)); 333 KASSERT(vp->v_numoutput == 0); 334 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 335 336 vrele(vp); 337 338 return 0; 339} 340 341/* 342 * getnewvnode: return a fresh vnode. 343 * 344 * => Returns referenced vnode, moved into the mount queue. 345 * => Shares the interlock specified by 'slock', if it is not NULL. 346 */ 347int 348getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *), 349 kmutex_t *slock, vnode_t **vpp) 350{ 351 struct uvm_object *uobj; 352 vnode_t *vp; 353 int error = 0; 354 355 if (mp != NULL) { 356 /* 357 * Mark filesystem busy while we are creating a vnode. 358 * If unmount is in progress, this will fail. 359 */ 360 error = vfs_busy(mp, NULL); 361 if (error) 362 return error; 363 } 364 365 vp = NULL; 366 367 /* Allocate a new vnode. */ 368 mutex_enter(&vnode_free_list_lock); 369 numvnodes++; 370 if (numvnodes > desiredvnodes + desiredvnodes / 10) 371 cv_signal(&vdrain_cv); 372 mutex_exit(&vnode_free_list_lock); 373 vp = vnalloc(NULL); 374 375 KASSERT(vp->v_freelisthd == NULL); 376 KASSERT(LIST_EMPTY(&vp->v_nclist)); 377 KASSERT(LIST_EMPTY(&vp->v_dnclist)); 378 379 /* Initialize vnode. */ 380 vp->v_usecount = 1; 381 vp->v_type = VNON; 382 vp->v_tag = tag; 383 vp->v_op = vops; 384 vp->v_data = NULL; 385 386 uobj = &vp->v_uobj; 387 KASSERT(uobj->pgops == &uvm_vnodeops); 388 KASSERT(uobj->uo_npages == 0); 389 KASSERT(TAILQ_FIRST(&uobj->memq) == NULL); 390 vp->v_size = vp->v_writesize = VSIZENOTSET; 391 392 /* Share the vnode_t::v_interlock, if requested. */ 393 if (slock) { 394 /* Set the interlock and mark that it is shared. */ 395 KASSERT(vp->v_mount == NULL); 396 mutex_obj_hold(slock); 397 uvm_obj_setlock(&vp->v_uobj, slock); 398 KASSERT(vp->v_interlock == slock); 399 vp->v_iflag |= VI_LOCKSHARE; 400 } 401 402 /* Finally, move vnode into the mount queue. */ 403 vfs_insmntque(vp, mp); 404 405 if (mp != NULL) { 406 if ((mp->mnt_iflag & IMNT_MPSAFE) != 0) 407 vp->v_vflag |= VV_MPSAFE; 408 vfs_unbusy(mp, true, NULL); 409 } 410 411 *vpp = vp; 412 return 0; 413} 414 415/* 416 * This is really just the reverse of getnewvnode(). Needed for 417 * VFS_VGET functions who may need to push back a vnode in case 418 * of a locking race. 419 */ 420void 421ungetnewvnode(vnode_t *vp) 422{ 423 424 KASSERT(vp->v_usecount == 1); 425 KASSERT(vp->v_data == NULL); 426 KASSERT(vp->v_freelisthd == NULL); 427 428 mutex_enter(vp->v_interlock); 429 vp->v_iflag |= VI_CLEAN; 430 vrelel(vp, 0); 431} 432 433/* 434 * Helper thread to keep the number of vnodes below desiredvnodes. 435 */ 436static void 437vdrain_thread(void *cookie) 438{ 439 int error; 440 441 mutex_enter(&vnode_free_list_lock); 442 443 for (;;) { 444 cv_timedwait(&vdrain_cv, &vnode_free_list_lock, hz); 445 while (numvnodes > desiredvnodes) { 446 error = cleanvnode(); 447 if (error) 448 kpause("vndsbusy", false, hz, NULL); 449 mutex_enter(&vnode_free_list_lock); 450 if (error) 451 break; 452 } 453 } 454} 455 456/* 457 * Remove a vnode from its freelist. 458 */ 459void 460vremfree(vnode_t *vp) 461{ 462 463 KASSERT(mutex_owned(vp->v_interlock)); 464 KASSERT(vp->v_usecount == 0); 465 466 /* 467 * Note that the reference count must not change until 468 * the vnode is removed. 469 */ 470 mutex_enter(&vnode_free_list_lock); 471 if (vp->v_holdcnt > 0) { 472 KASSERT(vp->v_freelisthd == &vnode_hold_list); 473 } else { 474 KASSERT(vp->v_freelisthd == &vnode_free_list); 475 } 476 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 477 vp->v_freelisthd = NULL; 478 mutex_exit(&vnode_free_list_lock); 479} 480 481/* 482 * Try to gain a reference to a vnode, without acquiring its interlock. 483 * The caller must hold a lock that will prevent the vnode from being 484 * recycled or freed. 485 */ 486bool 487vtryget(vnode_t *vp) 488{ 489 u_int use, next; 490 491 /* 492 * If the vnode is being freed, don't make life any harder 493 * for vclean() by adding another reference without waiting. 494 * This is not strictly necessary, but we'll do it anyway. 495 */ 496 if (__predict_false((vp->v_iflag & VI_XLOCK) != 0)) { 497 return false; 498 } 499 for (use = vp->v_usecount;; use = next) { 500 if (use == 0 || __predict_false((use & VC_XLOCK) != 0)) { 501 /* Need interlock held if first reference. */ 502 return false; 503 } 504 next = atomic_cas_uint(&vp->v_usecount, use, use + 1); 505 if (__predict_true(next == use)) { 506 return true; 507 } 508 } 509} 510 511/* 512 * vget: get a particular vnode from the free list, increment its reference 513 * count and lock it. 514 * 515 * => Should be called with v_interlock held. 516 * 517 * If VI_XLOCK is set, the vnode is being eliminated in vgone()/vclean(). 518 * In that case, we cannot grab the vnode, so the process is awakened when 519 * the transition is completed, and an error returned to indicate that the 520 * vnode is no longer usable (e.g. changed to a new file system type). 521 */ 522int 523vget(vnode_t *vp, int flags) 524{ 525 int error = 0; 526 527 KASSERT((vp->v_iflag & VI_MARKER) == 0); 528 KASSERT(mutex_owned(vp->v_interlock)); 529 KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT)) == 0); 530 531 /* 532 * Before adding a reference, we must remove the vnode 533 * from its freelist. 534 */ 535 if (vp->v_usecount == 0) { 536 vremfree(vp); 537 vp->v_usecount = 1; 538 } else { 539 atomic_inc_uint(&vp->v_usecount); 540 } 541 542 /* 543 * If the vnode is in the process of being cleaned out for 544 * another use, we wait for the cleaning to finish and then 545 * return failure. Cleaning is determined by checking if 546 * the VI_XLOCK flag is set. 547 */ 548 if ((vp->v_iflag & VI_XLOCK) != 0) { 549 if ((flags & LK_NOWAIT) != 0) { 550 vrelel(vp, 0); 551 return EBUSY; 552 } 553 vwait(vp, VI_XLOCK); 554 vrelel(vp, 0); 555 return ENOENT; 556 } 557 558 if ((vp->v_iflag & VI_INACTNOW) != 0) { 559 /* 560 * if it's being desactived, wait for it to complete. 561 * Make sure to not return a clean vnode. 562 */ 563 if ((flags & LK_NOWAIT) != 0) { 564 vrelel(vp, 0); 565 return EBUSY; 566 } 567 vwait(vp, VI_INACTNOW); 568 if ((vp->v_iflag & VI_CLEAN) != 0) { 569 vrelel(vp, 0); 570 return ENOENT; 571 } 572 } 573 574 /* 575 * Ok, we got it in good shape. Just locking left. 576 */ 577 KASSERT((vp->v_iflag & VI_CLEAN) == 0); 578 mutex_exit(vp->v_interlock); 579 if (flags & (LK_EXCLUSIVE | LK_SHARED)) { 580 error = vn_lock(vp, flags); 581 if (error != 0) { 582 vrele(vp); 583 } 584 } 585 return error; 586} 587 588/* 589 * vput: unlock and release the reference. 590 */ 591void 592vput(vnode_t *vp) 593{ 594 595 KASSERT((vp->v_iflag & VI_MARKER) == 0); 596 597 VOP_UNLOCK(vp); 598 vrele(vp); 599} 600 601/* 602 * Try to drop reference on a vnode. Abort if we are releasing the 603 * last reference. Note: this _must_ succeed if not the last reference. 604 */ 605static inline bool 606vtryrele(vnode_t *vp) 607{ 608 u_int use, next; 609 610 for (use = vp->v_usecount;; use = next) { 611 if (use == 1) { 612 return false; 613 } 614 KASSERT((use & VC_MASK) > 1); 615 next = atomic_cas_uint(&vp->v_usecount, use, use - 1); 616 if (__predict_true(next == use)) { 617 return true; 618 } 619 } 620} 621 622/* 623 * Vnode release. If reference count drops to zero, call inactive 624 * routine and either return to freelist or free to the pool. 625 */ 626void 627vrelel(vnode_t *vp, int flags) 628{ 629 bool recycle, defer; 630 int error; 631 632 KASSERT(mutex_owned(vp->v_interlock)); 633 KASSERT((vp->v_iflag & VI_MARKER) == 0); 634 KASSERT(vp->v_freelisthd == NULL); 635 636 if (__predict_false(vp->v_op == dead_vnodeop_p && 637 (vp->v_iflag & (VI_CLEAN|VI_XLOCK)) == 0)) { 638 vnpanic(vp, "dead but not clean"); 639 } 640 641 /* 642 * If not the last reference, just drop the reference count 643 * and unlock. 644 */ 645 if (vtryrele(vp)) { 646 vp->v_iflag |= VI_INACTREDO; 647 mutex_exit(vp->v_interlock); 648 return; 649 } 650 if (vp->v_usecount <= 0 || vp->v_writecount != 0) { 651 vnpanic(vp, "%s: bad ref count", __func__); 652 } 653 654 KASSERT((vp->v_iflag & VI_XLOCK) == 0); 655 656#ifdef DIAGNOSTIC 657 if ((vp->v_type == VBLK || vp->v_type == VCHR) && 658 vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) { 659 vprint("vrelel: missing VOP_CLOSE()", vp); 660 } 661#endif 662 663 /* 664 * If not clean, deactivate the vnode, but preserve 665 * our reference across the call to VOP_INACTIVE(). 666 */ 667retry: 668 if ((vp->v_iflag & VI_CLEAN) == 0) { 669 recycle = false; 670 vp->v_iflag |= VI_INACTNOW; 671 672 /* 673 * XXX This ugly block can be largely eliminated if 674 * locking is pushed down into the file systems. 675 * 676 * Defer vnode release to vrele_thread if caller 677 * requests it explicitly. 678 */ 679 if ((curlwp == uvm.pagedaemon_lwp) || 680 (flags & VRELEL_ASYNC_RELE) != 0) { 681 /* The pagedaemon can't wait around; defer. */ 682 defer = true; 683 } else if (curlwp == vrele_lwp) { 684 /* 685 * We have to try harder. But we can't sleep 686 * with VI_INACTNOW as vget() may be waiting on it. 687 */ 688 vp->v_iflag &= ~(VI_INACTREDO|VI_INACTNOW); 689 cv_broadcast(&vp->v_cv); 690 mutex_exit(vp->v_interlock); 691 error = vn_lock(vp, LK_EXCLUSIVE); 692 if (error != 0) { 693 /* XXX */ 694 vnpanic(vp, "%s: unable to lock %p", 695 __func__, vp); 696 } 697 mutex_enter(vp->v_interlock); 698 /* 699 * if we did get another reference while 700 * sleeping, don't try to inactivate it yet. 701 */ 702 if (__predict_false(vtryrele(vp))) { 703 VOP_UNLOCK(vp); 704 mutex_exit(vp->v_interlock); 705 return; 706 } 707 vp->v_iflag |= VI_INACTNOW; 708 mutex_exit(vp->v_interlock); 709 defer = false; 710 } else if ((vp->v_iflag & VI_LAYER) != 0) { 711 /* 712 * Acquiring the stack's lock in vclean() even 713 * for an honest vput/vrele is dangerous because 714 * our caller may hold other vnode locks; defer. 715 */ 716 defer = true; 717 } else { 718 /* If we can't acquire the lock, then defer. */ 719 vp->v_iflag &= ~VI_INACTREDO; 720 mutex_exit(vp->v_interlock); 721 error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT); 722 if (error != 0) { 723 defer = true; 724 mutex_enter(vp->v_interlock); 725 } else { 726 defer = false; 727 } 728 } 729 730 if (defer) { 731 /* 732 * Defer reclaim to the kthread; it's not safe to 733 * clean it here. We donate it our last reference. 734 */ 735 KASSERT(mutex_owned(vp->v_interlock)); 736 KASSERT((vp->v_iflag & VI_INACTPEND) == 0); 737 vp->v_iflag &= ~VI_INACTNOW; 738 vp->v_iflag |= VI_INACTPEND; 739 mutex_enter(&vrele_lock); 740 TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist); 741 if (++vrele_pending > (desiredvnodes >> 8)) 742 cv_signal(&vrele_cv); 743 mutex_exit(&vrele_lock); 744 cv_broadcast(&vp->v_cv); 745 mutex_exit(vp->v_interlock); 746 return; 747 } 748 749 /* 750 * The vnode can gain another reference while being 751 * deactivated. If VOP_INACTIVE() indicates that 752 * the described file has been deleted, then recycle 753 * the vnode irrespective of additional references. 754 * Another thread may be waiting to re-use the on-disk 755 * inode. 756 * 757 * Note that VOP_INACTIVE() will drop the vnode lock. 758 */ 759 VOP_INACTIVE(vp, &recycle); 760 mutex_enter(vp->v_interlock); 761 vp->v_iflag &= ~VI_INACTNOW; 762 cv_broadcast(&vp->v_cv); 763 if (!recycle) { 764 if (vtryrele(vp)) { 765 mutex_exit(vp->v_interlock); 766 return; 767 } 768 769 /* 770 * If we grew another reference while 771 * VOP_INACTIVE() was underway, retry. 772 */ 773 if ((vp->v_iflag & VI_INACTREDO) != 0) { 774 goto retry; 775 } 776 } 777 778 /* Take care of space accounting. */ 779 if (vp->v_iflag & VI_EXECMAP) { 780 atomic_add_int(&uvmexp.execpages, 781 -vp->v_uobj.uo_npages); 782 atomic_add_int(&uvmexp.filepages, 783 vp->v_uobj.uo_npages); 784 } 785 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP); 786 vp->v_vflag &= ~VV_MAPPED; 787 788 /* 789 * Recycle the vnode if the file is now unused (unlinked), 790 * otherwise just free it. 791 */ 792 if (recycle) { 793 vclean(vp, DOCLOSE); 794 } 795 KASSERT(vp->v_usecount > 0); 796 } 797 798 if (atomic_dec_uint_nv(&vp->v_usecount) != 0) { 799 /* Gained another reference while being reclaimed. */ 800 mutex_exit(vp->v_interlock); 801 return; 802 } 803 804 if ((vp->v_iflag & VI_CLEAN) != 0) { 805 /* 806 * It's clean so destroy it. It isn't referenced 807 * anywhere since it has been reclaimed. 808 */ 809 KASSERT(vp->v_holdcnt == 0); 810 KASSERT(vp->v_writecount == 0); 811 mutex_exit(vp->v_interlock); 812 vfs_insmntque(vp, NULL); 813 if (vp->v_type == VBLK || vp->v_type == VCHR) { 814 spec_node_destroy(vp); 815 } 816 vnfree(vp); 817 } else { 818 /* 819 * Otherwise, put it back onto the freelist. It 820 * can't be destroyed while still associated with 821 * a file system. 822 */ 823 mutex_enter(&vnode_free_list_lock); 824 if (vp->v_holdcnt > 0) { 825 vp->v_freelisthd = &vnode_hold_list; 826 } else { 827 vp->v_freelisthd = &vnode_free_list; 828 } 829 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 830 mutex_exit(&vnode_free_list_lock); 831 mutex_exit(vp->v_interlock); 832 } 833} 834 835void 836vrele(vnode_t *vp) 837{ 838 839 KASSERT((vp->v_iflag & VI_MARKER) == 0); 840 841 if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) { 842 return; 843 } 844 mutex_enter(vp->v_interlock); 845 vrelel(vp, 0); 846} 847 848/* 849 * Asynchronous vnode release, vnode is released in different context. 850 */ 851void 852vrele_async(vnode_t *vp) 853{ 854 855 KASSERT((vp->v_iflag & VI_MARKER) == 0); 856 857 if ((vp->v_iflag & VI_INACTNOW) == 0 && vtryrele(vp)) { 858 return; 859 } 860 mutex_enter(vp->v_interlock); 861 vrelel(vp, VRELEL_ASYNC_RELE); 862} 863 864static void 865vrele_thread(void *cookie) 866{ 867 vnode_t *vp; 868 869 for (;;) { 870 mutex_enter(&vrele_lock); 871 while (TAILQ_EMPTY(&vrele_list)) { 872 vrele_gen++; 873 cv_broadcast(&vrele_cv); 874 cv_timedwait(&vrele_cv, &vrele_lock, hz); 875 } 876 vp = TAILQ_FIRST(&vrele_list); 877 TAILQ_REMOVE(&vrele_list, vp, v_freelist); 878 vrele_pending--; 879 mutex_exit(&vrele_lock); 880 881 /* 882 * If not the last reference, then ignore the vnode 883 * and look for more work. 884 */ 885 mutex_enter(vp->v_interlock); 886 KASSERT((vp->v_iflag & VI_INACTPEND) != 0); 887 vp->v_iflag &= ~VI_INACTPEND; 888 vrelel(vp, 0); 889 } 890} 891 892void 893vrele_flush(void) 894{ 895 int gen; 896 897 mutex_enter(&vrele_lock); 898 gen = vrele_gen; 899 while (vrele_pending && gen == vrele_gen) { 900 cv_broadcast(&vrele_cv); 901 cv_wait(&vrele_cv, &vrele_lock); 902 } 903 mutex_exit(&vrele_lock); 904} 905 906/* 907 * Vnode reference, where a reference is already held by some other 908 * object (for example, a file structure). 909 */ 910void 911vref(vnode_t *vp) 912{ 913 914 KASSERT((vp->v_iflag & VI_MARKER) == 0); 915 KASSERT(vp->v_usecount != 0); 916 917 atomic_inc_uint(&vp->v_usecount); 918} 919 920/* 921 * Page or buffer structure gets a reference. 922 * Called with v_interlock held. 923 */ 924void 925vholdl(vnode_t *vp) 926{ 927 928 KASSERT(mutex_owned(vp->v_interlock)); 929 KASSERT((vp->v_iflag & VI_MARKER) == 0); 930 931 if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) { 932 mutex_enter(&vnode_free_list_lock); 933 KASSERT(vp->v_freelisthd == &vnode_free_list); 934 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 935 vp->v_freelisthd = &vnode_hold_list; 936 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 937 mutex_exit(&vnode_free_list_lock); 938 } 939} 940 941/* 942 * Page or buffer structure frees a reference. 943 * Called with v_interlock held. 944 */ 945void 946holdrelel(vnode_t *vp) 947{ 948 949 KASSERT(mutex_owned(vp->v_interlock)); 950 KASSERT((vp->v_iflag & VI_MARKER) == 0); 951 952 if (vp->v_holdcnt <= 0) { 953 vnpanic(vp, "%s: holdcnt vp %p", __func__, vp); 954 } 955 956 vp->v_holdcnt--; 957 if (vp->v_holdcnt == 0 && vp->v_usecount == 0) { 958 mutex_enter(&vnode_free_list_lock); 959 KASSERT(vp->v_freelisthd == &vnode_hold_list); 960 TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist); 961 vp->v_freelisthd = &vnode_free_list; 962 TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist); 963 mutex_exit(&vnode_free_list_lock); 964 } 965} 966 967/* 968 * Disassociate the underlying file system from a vnode. 969 * 970 * Must be called with the interlock held, and will return with it held. 971 */ 972void 973vclean(vnode_t *vp, int flags) 974{ 975 lwp_t *l = curlwp; 976 bool recycle, active; 977 int error; 978 979 KASSERT(mutex_owned(vp->v_interlock)); 980 KASSERT((vp->v_iflag & VI_MARKER) == 0); 981 KASSERT(vp->v_usecount != 0); 982 983 /* If cleaning is already in progress wait until done and return. */ 984 if (vp->v_iflag & VI_XLOCK) { 985 vwait(vp, VI_XLOCK); 986 return; 987 } 988 989 /* If already clean, nothing to do. */ 990 if ((vp->v_iflag & VI_CLEAN) != 0) { 991 return; 992 } 993 994 /* 995 * Prevent the vnode from being recycled or brought into use 996 * while we clean it out. 997 */ 998 vp->v_iflag |= VI_XLOCK; 999 if (vp->v_iflag & VI_EXECMAP) { 1000 atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages); 1001 atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages); 1002 } 1003 vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP); 1004 active = (vp->v_usecount & VC_MASK) > 1; 1005 1006 /* XXXAD should not lock vnode under layer */ 1007 mutex_exit(vp->v_interlock); 1008 VOP_LOCK(vp, LK_EXCLUSIVE); 1009 1010 /* 1011 * Clean out any cached data associated with the vnode. 1012 * If purging an active vnode, it must be closed and 1013 * deactivated before being reclaimed. Note that the 1014 * VOP_INACTIVE will unlock the vnode. 1015 */ 1016 if (flags & DOCLOSE) { 1017 error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0); 1018 if (error != 0) { 1019 /* XXX, fix vn_start_write's grab of mp and use that. */ 1020 1021 if (wapbl_vphaswapbl(vp)) 1022 WAPBL_DISCARD(wapbl_vptomp(vp)); 1023 error = vinvalbuf(vp, 0, NOCRED, l, 0, 0); 1024 } 1025 KASSERT(error == 0); 1026 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1027 if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) { 1028 spec_node_revoke(vp); 1029 } 1030 } 1031 if (active) { 1032 VOP_INACTIVE(vp, &recycle); 1033 } else { 1034 /* 1035 * Any other processes trying to obtain this lock must first 1036 * wait for VI_XLOCK to clear, then call the new lock operation. 1037 */ 1038 VOP_UNLOCK(vp); 1039 } 1040 1041 /* Disassociate the underlying file system from the vnode. */ 1042 if (VOP_RECLAIM(vp)) { 1043 vnpanic(vp, "%s: cannot reclaim", __func__); 1044 } 1045 1046 KASSERT(vp->v_data == NULL); 1047 KASSERT(vp->v_uobj.uo_npages == 0); 1048 1049 if (vp->v_type == VREG && vp->v_ractx != NULL) { 1050 uvm_ra_freectx(vp->v_ractx); 1051 vp->v_ractx = NULL; 1052 } 1053 1054 /* Purge name cache. */ 1055 cache_purge(vp); 1056 1057 /* Done with purge, notify sleepers of the grim news. */ 1058 mutex_enter(vp->v_interlock); 1059 vp->v_op = dead_vnodeop_p; 1060 vp->v_tag = VT_NON; 1061 KNOTE(&vp->v_klist, NOTE_REVOKE); 1062 vp->v_iflag &= ~VI_XLOCK; 1063 vp->v_vflag &= ~VV_LOCKSWORK; 1064 if ((flags & DOCLOSE) != 0) { 1065 vp->v_iflag |= VI_CLEAN; 1066 } 1067 cv_broadcast(&vp->v_cv); 1068 1069 KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); 1070} 1071 1072/* 1073 * Recycle an unused vnode to the front of the free list. 1074 * Release the passed interlock if the vnode will be recycled. 1075 */ 1076int 1077vrecycle(vnode_t *vp, kmutex_t *inter_lkp, struct lwp *l) 1078{ 1079 1080 KASSERT((vp->v_iflag & VI_MARKER) == 0); 1081 1082 mutex_enter(vp->v_interlock); 1083 if (vp->v_usecount != 0) { 1084 mutex_exit(vp->v_interlock); 1085 return 0; 1086 } 1087 if (inter_lkp) { 1088 mutex_exit(inter_lkp); 1089 } 1090 vremfree(vp); 1091 vp->v_usecount = 1; 1092 vclean(vp, DOCLOSE); 1093 vrelel(vp, 0); 1094 return 1; 1095} 1096 1097/* 1098 * Eliminate all activity associated with the requested vnode 1099 * and with all vnodes aliased to the requested vnode. 1100 */ 1101void 1102vrevoke(vnode_t *vp) 1103{ 1104 vnode_t *vq, **vpp; 1105 enum vtype type; 1106 dev_t dev; 1107 1108 KASSERT(vp->v_usecount > 0); 1109 1110 mutex_enter(vp->v_interlock); 1111 if ((vp->v_iflag & VI_CLEAN) != 0) { 1112 mutex_exit(vp->v_interlock); 1113 return; 1114 } else if (vp->v_type != VBLK && vp->v_type != VCHR) { 1115 atomic_inc_uint(&vp->v_usecount); 1116 vclean(vp, DOCLOSE); 1117 vrelel(vp, 0); 1118 return; 1119 } else { 1120 dev = vp->v_rdev; 1121 type = vp->v_type; 1122 mutex_exit(vp->v_interlock); 1123 } 1124 1125 vpp = &specfs_hash[SPECHASH(dev)]; 1126 mutex_enter(&device_lock); 1127 for (vq = *vpp; vq != NULL;) { 1128 /* If clean or being cleaned, then ignore it. */ 1129 mutex_enter(vq->v_interlock); 1130 if ((vq->v_iflag & (VI_CLEAN | VI_XLOCK)) != 0 || 1131 vq->v_type != type || vq->v_rdev != dev) { 1132 mutex_exit(vq->v_interlock); 1133 vq = vq->v_specnext; 1134 continue; 1135 } 1136 mutex_exit(&device_lock); 1137 if (vq->v_usecount == 0) { 1138 vremfree(vq); 1139 vq->v_usecount = 1; 1140 } else { 1141 atomic_inc_uint(&vq->v_usecount); 1142 } 1143 vclean(vq, DOCLOSE); 1144 vrelel(vq, 0); 1145 mutex_enter(&device_lock); 1146 vq = *vpp; 1147 } 1148 mutex_exit(&device_lock); 1149} 1150 1151/* 1152 * Eliminate all activity associated with a vnode in preparation for 1153 * reuse. Drops a reference from the vnode. 1154 */ 1155void 1156vgone(vnode_t *vp) 1157{ 1158 1159 mutex_enter(vp->v_interlock); 1160 vclean(vp, DOCLOSE); 1161 vrelel(vp, 0); 1162} 1163 1164/* 1165 * Update outstanding I/O count and do wakeup if requested. 1166 */ 1167void 1168vwakeup(struct buf *bp) 1169{ 1170 vnode_t *vp; 1171 1172 if ((vp = bp->b_vp) == NULL) 1173 return; 1174 1175 KASSERT(bp->b_objlock == vp->v_interlock); 1176 KASSERT(mutex_owned(bp->b_objlock)); 1177 1178 if (--vp->v_numoutput < 0) 1179 vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp); 1180 if (vp->v_numoutput == 0) 1181 cv_broadcast(&vp->v_cv); 1182} 1183 1184/* 1185 * Wait for a vnode (typically with VI_XLOCK set) to be cleaned or 1186 * recycled. 1187 */ 1188void 1189vwait(vnode_t *vp, int flags) 1190{ 1191 1192 KASSERT(mutex_owned(vp->v_interlock)); 1193 KASSERT(vp->v_usecount != 0); 1194 1195 while ((vp->v_iflag & flags) != 0) 1196 cv_wait(&vp->v_cv, vp->v_interlock); 1197} 1198 1199int 1200vfs_drainvnodes(long target) 1201{ 1202 int error; 1203 1204 mutex_enter(&vnode_free_list_lock); 1205 1206 while (numvnodes > target) { 1207 error = cleanvnode(); 1208 if (error != 0) 1209 return error; 1210 mutex_enter(&vnode_free_list_lock); 1211 } 1212 1213 mutex_exit(&vnode_free_list_lock); 1214 1215 return 0; 1216} 1217 1218void 1219vnpanic(vnode_t *vp, const char *fmt, ...) 1220{ 1221 va_list ap; 1222 1223#ifdef DIAGNOSTIC 1224 vprint(NULL, vp); 1225#endif 1226 va_start(ap, fmt); 1227 vpanic(fmt, ap); 1228 va_end(ap); 1229} 1230