vfs_subr.c revision 34928
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 * $Id: vfs_subr.c,v 1.146 1998/03/28 12:04:32 bde Exp $
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"
#include "opt_devfs.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/socket.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/domain.h>
#include <sys/dirent.h>
#include <sys/vmmeter.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_zone.h>
#include <sys/sysctl.h>

#include <miscfs/specfs/specdev.h>

static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

static void	insmntque __P((struct vnode *vp, struct mount *mp));
#ifdef DDB
static void	printlockedvnodes __P((void));
#endif
static void	vclean __P((struct vnode *vp, int flags, struct proc *p));
static void	vfree __P((struct vnode *));
static void	vgonel __P((struct vnode *vp, struct proc *p));
static unsigned long	numvnodes;
SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");

enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

/*
 * Insq/Remq for the vnode usage lists.
 */
#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define	bufremvn(bp) {							\
	LIST_REMOVE(bp, b_vnbufs);					\
	(bp)->b_vnbufs.le_next = NOLIST;				\
}

static TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */
struct tobefreelist vnode_tobefree_list;	/* vnode to-be-freed list */

static u_long wantfreevnodes = 25;
SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
static u_long freevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");

int vfs_ioopt = 0;
#ifdef ENABLE_VFS_IOOPT
SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
#endif

struct mntlist mountlist;	/* mounted filesystem list */
struct simplelock mountlist_slock;
static struct simplelock mntid_slock;
struct simplelock mntvnode_slock;
static struct simplelock vnode_free_list_slock;
static struct simplelock spechash_slock;
struct nfs_public nfs_pub;	/* publicly exported FS */
static vm_zone_t vnode_zone;

/*
 * The workitem queue.
 */
#define SYNCER_MAXDELAY		32
int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
time_t syncdelay = 30;
int rushjob;				/* number of slots to run ASAP */

static int syncer_delayno = 0;
static long syncer_mask;
LIST_HEAD(synclist, vnode);
static struct synclist *syncer_workitem_pending;

int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "");

static void	vfs_free_addrlist __P((struct netexport *nep));
static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
static int	vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
				       struct export_args *argp));
/*
 * Initialize the vnode management data structures.
 */
void
vntblinit()
{

	desiredvnodes = maxproc + cnt.v_page_count / 4;
	simple_lock_init(&mntvnode_slock);
	simple_lock_init(&mntid_slock);
	simple_lock_init(&spechash_slock);
	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&vnode_tobefree_list);
	simple_lock_init(&vnode_free_list_slock);
	CIRCLEQ_INIT(&mountlist);
	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
}

/*
 * Mark a mount point as busy.  Used to synchronize access and to delay
 * unmounting.  Interlock is not released on failure.
 */
int
vfs_busy(mp, flags, interlkp, p)
	struct mount *mp;
	int flags;
	struct simplelock *interlkp;
	struct proc *p;
{
	int lkflags;

	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & LK_NOWAIT)
			return (ENOENT);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		if (interlkp) {
			simple_unlock(interlkp);
		}
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
		if (interlkp) {
			simple_lock(interlkp);
		}
		return (ENOENT);
	}
	lkflags = LK_SHARED | LK_NOPAUSE;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(mp, p)
	struct mount *mp;
	struct proc *p;
{

	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(fstypename, devname, mpp)
	char *fstypename;
	char *devname;
	struct mount **mpp;
{
	struct proc *p = curproc;	/* XXX */
	struct vfsconf *vfsp;
	struct mount *mp;

	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	bzero((char *)mp, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
	LIST_INIT(&mp->mnt_vnodelist);
	mp->mnt_vfc = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfc_refcount++;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = 0;
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}
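
/*
 * Illustrative sketch (not part of the original source): how a
 * filesystem's mountroot routine typically uses vfs_rootmountalloc().
 * The "myfs" names and rootvp are hypothetical; error unwinding is
 * modeled loosely on the in-tree callers.
 *
 *	struct mount *mp;
 *	int error;
 *
 *	if ((error = vfs_rootmountalloc("myfs", "root_device", &mp)))
 *		return (error);
 *	if ((error = myfs_mountfs(rootvp, mp, p))) {
 *		mp->mnt_vfc->vfc_refcount--;
 *		vfs_unbusy(mp, p);
 *		free(mp, M_MOUNT);
 *		return (error);
 *	}
 *
 * Note that the mount comes back already vfs_busy()'d and marked
 * MNT_RDONLY.
 */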
/*
 * Find an appropriate filesystem to use for the root.  If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 */
#ifdef notdef	/* XXX JH */
int
lite2_vfs_mountroot()
{
	struct vfsconf *vfsp;
	extern int (*lite2_mountroot) __P((void));
	int error;

	if (lite2_mountroot != NULL)
		return ((*lite2_mountroot)());
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL)
			continue;
		if ((error = (*vfsp->vfc_mountroot)()) == 0)
			return (0);
		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	}
	return (ENODEV);
}
#endif

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
	    mp = mp->mnt_list.cqe_next) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			simple_unlock(&mountlist_slock);
			return (mp);
		}
	}
	simple_unlock(&mountlist_slock);
	return ((struct mount *) 0);
}

/*
 * Get a new unique fsid.
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_short xxxfs_mntid;

	fsid_t tfsid;
	int mtype;

	simple_lock(&mntid_slock);
	mtype = mp->mnt_vfc->vfc_typenum;
	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
	mp->mnt_stat.f_fsid.val[1] = mtype;
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
	tfsid.val[1] = mtype;
	if (mountlist.cqh_first != (void *)&mountlist) {
		while (vfs_getvfs(&tfsid)) {
			tfsid.val[0]++;
			xxxfs_mntid++;
		}
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	simple_unlock(&mntid_slock);
}

/*
 * Set vnode attributes to VNOVAL.
 */
void
vattr_null(vap)
	register struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
	    vap->va_fsid = vap->va_fileid =
	    vap->va_blocksize = vap->va_rdev =
	    vap->va_atime.tv_sec = vap->va_atime.tv_nsec =
	    vap->va_mtime.tv_sec = vap->va_mtime.tv_nsec =
	    vap->va_ctime.tv_sec = vap->va_ctime.tv_nsec =
	    vap->va_flags = vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern vop_t **dead_vnodeop_p;
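
/*
 * Illustrative sketch (not part of the original source): callers that
 * want to change only a few attributes first reset everything to
 * VNOVAL so the filesystem knows which fields to leave alone, e.g.
 *
 *	struct vattr va;
 *	int error;
 *
 *	vattr_null(&va);
 *	va.va_size = 0;			(truncate to zero length)
 *	error = VOP_SETATTR(vp, &va, cred, p);
 */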
/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	int s;
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *tvp, *nvp;
	vm_object_t object;
	TAILQ_HEAD(freelst, vnode) vnode_tmp_list;

	/*
	 * We take the least recently used vnode from the freelist
	 * if we can get it and it has no cached pages, and no
	 * namecache entries are relative to it.
	 * Otherwise we allocate a new vnode.
	 */

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	TAILQ_INIT(&vnode_tmp_list);

	for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) {
		nvp = TAILQ_NEXT(vp, v_freelist);
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		if (vp->v_flag & VAGE) {
			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		} else {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		}
		vp->v_flag &= ~(VTBFREE|VAGE);
		vp->v_flag |= VFREE;
		if (vp->v_usecount)
			panic("tobe free vnode isn't");
		freevnodes++;
	}

	if (wantfreevnodes && freevnodes < wantfreevnodes) {
		vp = NULL;
	} else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
		/*
		 * XXX: this is only here to be backwards compatible
		 */
		vp = NULL;
	} else {
		for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) {

			nvp = TAILQ_NEXT(vp, v_freelist);

			if (!simple_lock_try(&vp->v_interlock))
				continue;
			if (vp->v_usecount)
				panic("free vnode isn't");

			object = vp->v_object;
			if (object && (object->resident_page_count || object->ref_count)) {
				printf("object inconsistent state: RPC: %d, RC: %d\n",
				    object->resident_page_count, object->ref_count);
				/* Don't recycle if it's caching some pages */
				TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
				TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist);
				continue;
			} else if (LIST_FIRST(&vp->v_cache_src)) {
				/* Don't recycle if active in the namecache */
				simple_unlock(&vp->v_interlock);
				continue;
			} else {
				break;
			}
		}
	}

	for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) {
		nvp = TAILQ_NEXT(tvp, v_freelist);
		TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist);
		simple_unlock(&tvp->v_interlock);
	}

	if (vp) {
		vp->v_flag |= VDOOMED;
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		freevnodes--;
		simple_unlock(&vnode_free_list_slock);
		cache_purge(vp);
		vp->v_lease = NULL;
		if (vp->v_type != VBAD) {
			vgonel(vp, p);
		} else {
			simple_unlock(&vp->v_interlock);
		}

#ifdef DIAGNOSTIC
		{
			int s;

			if (vp->v_data)
				panic("cleaned vnode isn't");
			s = splbio();
			if (vp->v_numoutput)
				panic("Clean vnode has pending I/O's");
			splx(s);
		}
#endif
		vp->v_flag = 0;
		vp->v_lastr = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
		vp->v_writecount = 0;	/* XXX */
		vp->v_maxio = 0;
	} else {
		simple_unlock(&vnode_free_list_slock);
		vp = (struct vnode *) zalloc(vnode_zone);
		bzero((char *) vp, sizeof *vp);
		simple_lock_init(&vp->v_interlock);
		vp->v_dd = vp;
		cache_purge(vp);
		LIST_INIT(&vp->v_cache_src);
		TAILQ_INIT(&vp->v_cache_dst);
		numvnodes++;
	}

	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	splx(s);

	vfs_object_create(vp, p, p->p_ucred, TRUE);
	return (0);
}
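
/*
 * Illustrative sketch (not part of the original source): a filesystem
 * typically obtains a fresh vnode like this ("myfs" names are
 * hypothetical):
 *
 *	struct vnode *vp;
 *	int error;
 *
 *	error = getnewvnode(VT_UFS, mp, myfs_vnodeop_p, &vp);
 *	if (error)
 *		return (error);
 *	vp->v_data = myfs_node;		(attach private per-file data)
 *	vp->v_type = VREG;
 *
 * The vnode comes back with v_usecount == 1 and already on mp's
 * vnode list via insmntque().
 */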
/*
 * Move a vnode from one mount queue to another.
 */
static void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL) {
		simple_unlock(&mntvnode_slock);
		return;
	}
	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp)) {
		vp->v_numoutput--;
		if (vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t) &vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;
	vm_object_t object;

	if ((flags & V_SAVE) && vp->v_dirtyblkhd.lh_first != NULL) {
		if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)))
			return (error);
		if (vp->v_dirtyblkhd.lh_first != NULL)
			panic("vinvalbuf: dirty bufs");
	}

	s = splbio();
	for (;;) {
		if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA))
			while (blist && blist->b_lblkno < 0)
				blist = blist->b_vnbufs.le_next;
		if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
		    (flags & V_SAVEMETA))
			while (blist && blist->b_lblkno < 0)
				blist = blist->b_vnbufs.le_next;
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = bp->b_vnbufs.le_next;
			if ((flags & V_SAVEMETA) && bp->b_lblkno < 0)
				continue;
			if (bp->b_flags & B_BUSY) {
				bp->b_flags |= B_WANTED;
				error = tsleep((caddr_t) bp,
				    slpflag | (PRIBIO + 4), "vinvalbuf",
				    slptimeo);
				if (error) {
					splx(s);
					return (error);
				}
				break;
			}
			/*
			 * XXX Since there are no node locks for NFS, I
			 * believe there is a slight chance that a delayed
			 * write will occur while sleeping just above, so
			 * check for it.  Note that vfs_bio_awrite expects
			 * buffers to reside on a queue, while VOP_BWRITE and
			 * brelse do not.
			 */
			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
			    (flags & V_SAVE)) {

				if (bp->b_vp == vp) {
					if (bp->b_flags & B_CLUSTEROK) {
						vfs_bio_awrite(bp);
					} else {
						bremfree(bp);
						bp->b_flags |= (B_BUSY | B_ASYNC);
						VOP_BWRITE(bp);
					}
				} else {
					bremfree(bp);
					bp->b_flags |= B_BUSY;
					(void) VOP_BWRITE(bp);
				}
				break;
			}
			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF | B_BUSY);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
	}

	splx(s);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	simple_lock(&vp->v_interlock);
	object = vp->v_object;
	if (object != NULL) {
		if (flags & V_SAVEMETA)
			vm_object_page_remove(object, 0, object->size,
			    (flags & V_SAVE) ? TRUE : FALSE);
		else
			vm_object_page_remove(object, 0, 0,
			    (flags & V_SAVE) ? TRUE : FALSE);
	}
	simple_unlock(&vp->v_interlock);

	if (!(flags & V_SAVEMETA) &&
	    (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first))
		panic("vinvalbuf: flush failed");
	return (0);
}

/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 */
int
vtruncbuf(vp, cred, p, length, blksize)
	register struct vnode *vp;
	struct ucred *cred;
	struct proc *p;
	off_t length;
	int blksize;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error, anyfreed;
	vm_object_t object;
	int trunclbn;

	/*
	 * Round up to the *next* lbn.
	 */
	trunclbn = (length + blksize - 1) / blksize;

	s = splbio();
restart:
	anyfreed = 1;
	for (;anyfreed;) {
		anyfreed = 0;
		for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {

			nbp = LIST_NEXT(bp, b_vnbufs);

			if (bp->b_lblkno >= trunclbn) {
				if (bp->b_flags & B_BUSY) {
					bp->b_flags |= B_WANTED;
					tsleep(bp, PRIBIO + 4, "vtrb1", 0);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_BUSY | B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp &&
				    ((LIST_NEXT(nbp, b_vnbufs) == NOLIST) ||
				     (nbp->b_vp != vp) ||
				     (nbp->b_flags & B_DELWRI))) {
					goto restart;
				}
			}
		}

		for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {

			nbp = LIST_NEXT(bp, b_vnbufs);

			if (bp->b_lblkno >= trunclbn) {
				if (bp->b_flags & B_BUSY) {
					bp->b_flags |= B_WANTED;
					tsleep(bp, PRIBIO + 4, "vtrb2", 0);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_BUSY | B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp &&
				    ((LIST_NEXT(nbp, b_vnbufs) == NOLIST) ||
				     (nbp->b_vp != vp) ||
				     (nbp->b_flags & B_DELWRI) == 0)) {
					goto restart;
				}
			}
		}
	}

	if (length > 0) {
restartsync:
		for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {

			nbp = LIST_NEXT(bp, b_vnbufs);

			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
				if (bp->b_flags & B_BUSY) {
					bp->b_flags |= B_WANTED;
					tsleep(bp, PRIBIO, "vtrb3", 0);
				} else {
					bremfree(bp);
					bp->b_flags |= B_BUSY;
					if (bp->b_vp == vp) {
						bp->b_flags |= B_ASYNC;
					} else {
						bp->b_flags &= ~B_ASYNC;
					}
					VOP_BWRITE(bp);
				}
				goto restartsync;
			}

		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
	}

	splx(s);

	vnode_pager_setsize(vp, length);

	return (0);
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
	int s;

#if defined(DIAGNOSTIC)
	if (bp->b_vp)
		panic("bgetvp: not free");
#endif
	vhold(vp);
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	s = splbio();
	bufinsvn(bp, &vp->v_cleanblkhd);
	splx(s);
}
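
/*
 * Illustrative note (not part of the original source): the trunclbn
 * computation in vtruncbuf() rounds up, so only buffers lying entirely
 * beyond the new length are invalidated.  For example, with
 * blksize = 8192 and length = 10000:
 *
 *	trunclbn = (10000 + 8192 - 1) / 8192 = 2
 *
 * so logical block 1, which still holds bytes 8192..9999, is kept,
 * while blocks 2 and up are freed.
 */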
/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;
	int s;

#if defined(DIAGNOSTIC)
	if (bp->b_vp == (struct vnode *) 0)
		panic("brelvp: NULL");
#endif

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;
	s = splbio();
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
	if ((vp->v_flag & VONWORKLST) && (LIST_FIRST(&vp->v_dirtyblkhd) == NULL)) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}
	splx(s);
	bp->b_vp = (struct vnode *) 0;
	vdrop(vp);
}

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, metadata on mounted
 * block devices is delayed only about half as long as file data.
 * Similarly, directory updates are more critical, so they are delayed
 * only about a third as long as file data.  Thus, there are
 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 * one each second (driven off the filesystem syncer process).  The
 * syncer_delayno variable indicates the next queue that is to be processed.
 * Items that need to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */

/*
 * Add an item to the syncer work queue.
 */
void
vn_syncer_add_to_worklist(vp, delay)
	struct vnode *vp;
	int delay;
{
	int s, slot;

	s = splbio();

	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
	}

	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	slot = (syncer_delayno + delay) & syncer_mask;

	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
	vp->v_flag |= VONWORKLST;
	splx(s);
}
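
/*
 * Illustrative note (not part of the original source): with the default
 * SYNCER_MAXDELAY of 32, hashinit() sizes the table to a power of two,
 * so syncer_mask is 31.  Requesting a 30-second delay while
 * syncer_delayno is 20 yields
 *
 *	slot = (20 + 30) & 31 = 50 & 31 = 18
 *
 * i.e. the vnode is queued 30 buckets ahead, wrapping around the
 * circular table.  Delays are clamped to syncer_maxdelay - 2 so a
 * request can never wrap past the bucket currently being drained.
 */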
static void sched_sync __P((void));
static struct	proc *updateproc;
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
SYSINIT_KT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)

/*
 * System filesystem synchronizer daemon.
 */
void
sched_sync(void)
{
	struct synclist *slp;
	struct vnode *vp;
	long starttime;
	int s;
	struct proc *p = updateproc;

	for (;;) {
		starttime = time.tv_sec;

		/*
		 * Push files whose dirty time has expired.
		 */
		s = splbio();
		slp = &syncer_workitem_pending[syncer_delayno];
		syncer_delayno += 1;
		if (syncer_delayno == syncer_maxdelay)
			syncer_delayno = 0;
		splx(s);

		while ((vp = LIST_FIRST(slp)) != NULL) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
			VOP_UNLOCK(vp, 0, p);
			if (LIST_FIRST(slp) == vp) {
				if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL &&
				    vp->v_type != VBLK)
					panic("sched_sync: fsync failed");
				/*
				 * Move ourselves to the back of the sync list.
				 */
				LIST_REMOVE(vp, v_synclist);
				vn_syncer_add_to_worklist(vp, syncdelay);
			}
		}

		/*
		 * Do soft update processing.
		 */
		if (bioops.io_sync)
			(*bioops.io_sync)(NULL);

		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process.  A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP.  Currently rushjob
		 * is used by the soft update code to speed up the filesystem
		 * syncer process when the incore state is getting so far
		 * ahead of the disk that the kernel memory pool is being
		 * threatened with exhaustion.
		 */
		if (rushjob > 0) {
			rushjob -= 1;
			continue;
		}
		/*
		 * If it has taken us less than a second to process the
		 * current work, then wait.  Otherwise start right over
		 * again.  We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (time.tv_sec == starttime)
			tsleep(&lbolt, PPAUSE, "syncer", 0);
	}
}
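
/*
 * Illustrative sketch (not part of the original source): a caller that
 * wants the next ten seconds of queued work flushed promptly could
 * simply bump the counter; the daemon then skips its one-second sleep
 * for that many rounds:
 *
 *	rushjob += 10;
 */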
/*
 * Associate a p-buffer with a vnode.
 */
void
pbgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
#if defined(DIAGNOSTIC)
	if (bp->b_vp)
		panic("pbgetvp: not free");
#endif
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
}

/*
 * Disassociate a p-buffer from a vnode.
 */
void
pbrelvp(bp)
	register struct buf *bp;
{

#if defined(DIAGNOSTIC)
	if (bp->b_vp == (struct vnode *) 0)
		panic("pbrelvp: NULL");
#endif

	bp->b_vp = (struct vnode *) 0;
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
void
reassignbuf(bp, newvp)
	register struct buf *bp;
	register struct vnode *newvp;
{
	struct buflists *listheadp;
	int delay;
	int s;

	if (newvp == NULL) {
		printf("reassignbuf: NULL");
		return;
	}

	s = splbio();
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_vnbufs.le_next != NOLIST) {
		bufremvn(bp);
		vdrop(bp->b_vp);
	}
	/*
	 * If dirty, put on list of dirty buffers; otherwise insert onto list
	 * of clean buffers.
	 */
	if (bp->b_flags & B_DELWRI) {
		struct buf *tbp;

		listheadp = &newvp->v_dirtyblkhd;
		if ((newvp->v_flag & VONWORKLST) == 0) {
			switch (newvp->v_type) {
			case VDIR:
				delay = syncdelay / 3;
				break;
			case VBLK:
				if (newvp->v_specmountpoint != NULL) {
					delay = syncdelay / 2;
					break;
				}
				/* fall through */
			default:
				delay = syncdelay;
			}
			vn_syncer_add_to_worklist(newvp, delay);
		}
		tbp = listheadp->lh_first;
		if (!tbp || (tbp->b_lblkno > bp->b_lblkno)) {
			bufinsvn(bp, listheadp);
		} else {
			while (tbp->b_vnbufs.le_next &&
			    (tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) {
				tbp = tbp->b_vnbufs.le_next;
			}
			LIST_INSERT_AFTER(tbp, bp, b_vnbufs);
		}
	} else {
		bufinsvn(bp, &newvp->v_cleanblkhd);
		if ((newvp->v_flag & VONWORKLST) &&
		    LIST_FIRST(&newvp->v_dirtyblkhd) == NULL) {
			newvp->v_flag &= ~VONWORKLST;
			LIST_REMOVE(newvp, v_synclist);
		}
	}
	bp->b_vp = newvp;
	vhold(bp->b_vp);
	splx(s);
}

#ifndef DEVFS_ROOT
/*
 * Create a vnode for a block device.
 * Used for mounting the root file system.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV)
		return (0);
	error = getnewvnode(VT_NON, (struct mount *) 0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = 0;
		return (error);
	}
	vp = nvp;
	vp->v_type = VBLK;
	if ((nvp = checkalias(vp, dev, (struct mount *) 0))) {
		vput(vp);
		vp = nvp;
	}
	*vpp = vp;
	return (0);
}
#endif /* !DEVFS_ROOT */

/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device).  If such an alias exists, deallocate
 * the existing contents and return the aliased vnode.  The
 * caller is responsible for filling it with its new contents.
 */
struct vnode *
checkalias(nvp, nvp_rdev, mp)
	register struct vnode *nvp;
	dev_t nvp_rdev;
	struct mount *mp;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp;
	struct vnode **vpp;

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		return (NULLVP);

	vpp = &speclisth[SPECHASH(nvp_rdev)];
loop:
	simple_lock(&spechash_slock);
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		simple_lock(&vp->v_interlock);
		if (vp->v_usecount == 0) {
			simple_unlock(&spechash_slock);
			vgonel(vp, p);
			goto loop;
		}
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
			simple_unlock(&spechash_slock);
			goto loop;
		}
		break;
	}
	if (vp == NULL || vp->v_tag != VT_NON) {
		MALLOC(nvp->v_specinfo, struct specinfo *,
		    sizeof(struct specinfo), M_VNODE, M_WAITOK);
		nvp->v_rdev = nvp_rdev;
		nvp->v_hashchain = vpp;
		nvp->v_specnext = *vpp;
		nvp->v_specmountpoint = NULL;
		simple_unlock(&spechash_slock);
		*vpp = nvp;
		if (vp != NULLVP) {
			nvp->v_flag |= VALIASED;
			vp->v_flag |= VALIASED;
			vput(vp);
		}
		return (NULLVP);
	}
	simple_unlock(&spechash_slock);
	VOP_UNLOCK(vp, 0, p);
	simple_lock(&vp->v_interlock);
	vclean(vp, 0, p);
	vp->v_op = nvp->v_op;
	vp->v_tag = nvp->v_tag;
	nvp->v_type = VNON;
	insmntque(vp, mp);
	return (vp);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it.  The vnode lock bit is set if the
 * vnode is being eliminated in vgone.  The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, flags, p)
	register struct vnode *vp;
	int flags;
	struct proc *p;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure.  Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */
	if ((flags & LK_INTERLOCK) == 0) {
		simple_lock(&vp->v_interlock);
	}
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vget", 0);
		return (ENOENT);
	}

	vp->v_usecount++;

	if (VSHOULDBUSY(vp))
		vbusy(vp);
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active.  We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			simple_lock(&vp->v_interlock);
			vp->v_usecount--;
			if (VSHOULDFREE(vp))
				vfree(vp);
			simple_unlock(&vp->v_interlock);
		}
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

void
vref(struct vnode *vp)
{
	simple_lock(&vp->v_interlock);
	vp->v_usecount++;
	simple_unlock(&vp->v_interlock);
}
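
/*
 * Illustrative sketch (not part of the original source): the usual
 * pairings of these reference-counting primitives.
 *
 *	(lookup-style access: take a reference and the vnode lock)
 *	if (vget(vp, LK_EXCLUSIVE, p) == 0) {
 *		... use vp ...
 *		vput(vp);	(unlock and release in one call)
 *	}
 *
 *	(already hold a reference, just want another)
 *	vref(vp);
 *	...
 *	vrele(vp);		(release an unlocked reference)
 */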
/*
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vrele: null vp");
#endif
	simple_lock(&vp->v_interlock);

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		simple_unlock(&vp->v_interlock);

		return;
	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;

		if (VSHOULDFREE(vp))
			vfree(vp);
		/*
		 * If we are doing a vput, the node is already locked, and we must
		 * call VOP_INACTIVE with the node locked.  So, in the case of
		 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
		 */
		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
			VOP_INACTIVE(vp, p);
		}

	} else {
#ifdef DIAGNOSTIC
		vprint("vrele: negative ref count", vp);
		simple_unlock(&vp->v_interlock);
#endif
		panic("vrele: negative ref cnt");
	}
}

void
vput(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vput: null vp");
#endif

	simple_lock(&vp->v_interlock);

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		VOP_UNLOCK(vp, LK_INTERLOCK, p);
		return;

	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
		/*
		 * If we are doing a vput, the node is already locked, and we must
		 * call VOP_INACTIVE with the node locked.  So, in the case of
		 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
		 */
		simple_unlock(&vp->v_interlock);
		VOP_INACTIVE(vp, p);

	} else {
#ifdef DIAGNOSTIC
		vprint("vput: negative ref count", vp);
#endif
		panic("vput: negative ref cnt");
	}
}

/*
 * Somebody doesn't want the vnode recycled.
 */
void
vhold(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	vp->v_holdcnt++;
	if (VSHOULDBUSY(vp))
		vbusy(vp);
	splx(s);
}

/*
 * One less who cares about this vnode.
 */
void
vdrop(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	if (vp->v_holdcnt <= 0)
		panic("vdrop: holdcnt");
	vp->v_holdcnt--;
	if (VSHOULDFREE(vp))
		vfree(vp);
	splx(s);
}
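
/*
 * Illustrative note (not part of the original source): v_holdcnt is a
 * weaker reference than v_usecount.  A held vnode may still be
 * deactivated, but it is kept off the free list so it cannot be
 * recycled while, e.g., a buffer still points at it; see bgetvp() and
 * reassignbuf(), whose vhold() calls are paired with the vdrop() in
 * brelvp().
 */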
/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error).  If MNT_FORCE is specified, detach any active vnodes
 * that are found.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;	/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (vp->v_mount != mp)
			goto loop;
		nvp = vp->v_mntvnodes.le_next;
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;

		simple_lock(&vp->v_interlock);
		/*
		 * Skip over vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file vnodes
		 * open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}

		/*
		 * With v_usecount == 0, all we need to do is clear out the
		 * vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}

		/*
		 * If FORCECLOSE is set, forcibly close the vnode.  For block
		 * or character devices, revert to an anonymous device.  For
		 * all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *) 0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DIAGNOSTIC
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
 */
static void
vclean(vp, flags, p)
	struct vnode *vp;
	int flags;
	struct proc *p;
{
	int active;
	vm_object_t obj;

	/*
	 * Check to see if the vnode is in use.  If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	if ((active = vp->v_usecount))
		vp->v_usecount++;

	/*
	 * Prevent the vnode from being recycled or brought into use while we
	 * clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock");
	vp->v_flag |= VXLOCK;
	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out.  The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);

	/*
	 * Clean out any buffers associated with the vnode.
	 */
	vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
	if ((obj = vp->v_object) != NULL) {
		if (obj->ref_count == 0) {
			/*
			 * This is a normal way of shutting down the object/vnode
			 * association.
			 */
			vm_object_terminate(obj);
		} else {
			/*
			 * Woe to the process that tries to page now :-).
			 */
			vm_pager_deallocate(obj);
		}
	}

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed.  Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, IO_NDELAY, NOCRED, p);
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0, p);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim");

	if (active)
		vrele(vp);

	cache_purge(vp);
	if (vp->v_vnlock) {
#if 0 /* This is the only place we have LK_DRAINED in the entire kernel ??? */
#ifdef DIAGNOSTIC
		if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
			vprint("vclean: lock not drained", vp);
#endif
#endif
		FREE(vp->v_vnlock, M_VNODE);
		vp->v_vnlock = NULL;
	}

	if (VSHOULDFREE(vp))
		vfree(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vn_pollgone(vp);
	vp->v_tag = VT_NON;
	vp->v_flag &= ~VXLOCK;
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		wakeup((caddr_t) vp);
	}
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
int
vop_revoke(ap)
	struct vop_revoke_args /* {
		struct vnode *a_vp;
		int a_flags;
	} */ *ap;
{
	struct vnode *vp, *vq;
	struct proc *p = curproc;	/* XXX */

#ifdef DIAGNOSTIC
	if ((ap->a_flags & REVOKEALL) == 0)
		panic("vop_revoke");
#endif

	vp = ap->a_vp;
	simple_lock(&vp->v_interlock);

	if (vp->v_flag & VALIASED) {
		/*
		 * If a vgone (or vclean) is already in progress,
		 * wait until it is done and return.
		 */
		if (vp->v_flag & VXLOCK) {
			vp->v_flag |= VXWANT;
			simple_unlock(&vp->v_interlock);
			tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
			return (0);
		}
		/*
		 * Ensure that vp will not be vgone'd while we
		 * are eliminating its aliases.
		 */
		vp->v_flag |= VXLOCK;
		simple_unlock(&vp->v_interlock);
		while (vp->v_flag & VALIASED) {
			simple_lock(&spechash_slock);
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type || vp == vq)
					continue;
				simple_unlock(&spechash_slock);
				vgone(vq);
				break;
			}
			if (vq == NULLVP) {
				simple_unlock(&spechash_slock);
			}
		}
		/*
		 * Remove the lock so that vgone below will
		 * really eliminate the vnode after which time
		 * vgone will awaken any sleepers.
		 */
		simple_lock(&vp->v_interlock);
		vp->v_flag &= ~VXLOCK;
		if (vp->v_flag & VXWANT) {
			vp->v_flag &= ~VXWANT;
			wakeup(vp);
		}
	}
	vgonel(vp, p);
	return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct simplelock *inter_lkp;
	struct proc *p;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		if (inter_lkp) {
			simple_unlock(inter_lkp);
		}
		vgonel(vp, p);
		return (1);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	simple_lock(&vp->v_interlock);
	vgonel(vp, p);
}
/*
 * vgone, with the vp interlock held.
 */
static void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	int s;
	struct vnode *vq;
	struct vnode *vx;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vgone", 0);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 */
	vclean(vp, DOCLOSE, p);
	simple_lock(&vp->v_interlock);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
		simple_lock(&spechash_slock);
		if (*vp->v_hashchain == vp) {
			*vp->v_hashchain = vp->v_specnext;
		} else {
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_specnext != vp)
					continue;
				vq->v_specnext = vp->v_specnext;
				break;
			}
			if (vq == NULL)
				panic("missing bdev");
		}
		if (vp->v_flag & VALIASED) {
			vx = NULL;
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type)
					continue;
				if (vx)
					break;
				vx = vq;
			}
			if (vx == NULL)
				panic("missing alias");
			if (vq == NULL)
				vx->v_flag &= ~VALIASED;
			vp->v_flag &= ~VALIASED;
		}
		simple_unlock(&spechash_slock);
		FREE(vp->v_specinfo, M_VNODE);
		vp->v_specinfo = NULL;
	}

	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list.  The test of the back
	 * pointer and the reference count of zero is because
	 * it will be removed from the free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone.  If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 */
	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
		s = splbio();
		simple_lock(&vnode_free_list_slock);
		if (vp->v_flag & VFREE) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		} else if (vp->v_flag & VTBFREE) {
			TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
			vp->v_flag &= ~VTBFREE;
			freevnodes++;
		} else
			freevnodes++;
		vp->v_flag |= VFREE;
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
		splx(s);
	}

	vp->v_type = VBAD;
	simple_unlock(&vp->v_interlock);
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	register struct vnode *vp;
	int rc = 0;

	simple_lock(&spechash_slock);
	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		*vpp = vp;
		rc = 1;
		break;
	}
	simple_unlock(&spechash_slock);
	return (rc);
}
/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vp)
	register struct vnode *vp;
{
	struct vnode *vq, *vnext;
	int count;

loop:
	if ((vp->v_flag & VALIASED) == 0)
		return (vp->v_usecount);
	simple_lock(&spechash_slock);
	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
		vnext = vq->v_specnext;
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vq->v_usecount == 0 && vq != vp) {
			simple_unlock(&spechash_slock);
			vgone(vq);
			goto loop;
		}
		count += vq->v_usecount;
	}
	simple_unlock(&spechash_slock);
	return (count);
}

/*
 * Print out a description of a vnode.
 */
static char *typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};

void
vprint(label, vp)
	char *label;
	register struct vnode *vp;
{
	char buf[64];

	if (label != NULL)
		printf("%s: %p: ", label, vp);
	else
		printf("%p: ", vp);
	printf("type %s, usecount %d, writecount %d, refcount %ld,",
	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
	    vp->v_holdcnt);
	buf[0] = '\0';
	if (vp->v_flag & VROOT)
		strcat(buf, "|VROOT");
	if (vp->v_flag & VTEXT)
		strcat(buf, "|VTEXT");
	if (vp->v_flag & VSYSTEM)
		strcat(buf, "|VSYSTEM");
	if (vp->v_flag & VXLOCK)
		strcat(buf, "|VXLOCK");
	if (vp->v_flag & VXWANT)
		strcat(buf, "|VXWANT");
	if (vp->v_flag & VBWAIT)
		strcat(buf, "|VBWAIT");
	if (vp->v_flag & VALIASED)
		strcat(buf, "|VALIASED");
	if (vp->v_flag & VDOOMED)
		strcat(buf, "|VDOOMED");
	if (vp->v_flag & VFREE)
		strcat(buf, "|VFREE");
	if (vp->v_flag & VOBJBUF)
		strcat(buf, "|VOBJBUF");
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DDB
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
static void
printlockedvnodes()
{
	struct proc *p = curproc;	/* XXX */
	struct mount *mp, *nmp;
	struct vnode *vp;

	printf("Locked vnodes\n");
	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = mp->mnt_list.cqe_next;
			continue;
		}
		for (vp = mp->mnt_vnodelist.lh_first;
		     vp != NULL;
		     vp = vp->v_mntvnodes.le_next) {
			if (VOP_ISLOCKED(vp))
				vprint((char *)0, vp);
		}
		simple_lock(&mountlist_slock);
		nmp = mp->mnt_list.cqe_next;
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);
}
#endif

/*
 * Top level filesystem related information gathering.
 */
static int	sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS);

static int
vfs_sysctl SYSCTL_HANDLER_ARGS
{
	int *name = (int *)arg1 - 1;	/* XXX */
	u_int namelen = arg2 + 1;	/* XXX */
	struct vfsconf *vfsp;

#ifndef NO_COMPAT_PRELITE2
	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
	if (namelen == 1)
		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
#endif

#ifdef notyet
	/* all sysctl names at this level are at least name and field */
	if (namelen < 2)
		return (ENOTDIR);		/* overloaded */
	if (name[0] != VFS_GENERIC) {
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[0])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
		    oldp, oldlenp, newp, newlen, p));
	}
#endif
	switch (name[1]) {
	case VFS_MAXTYPENUM:
		if (namelen != 2)
			return (ENOTDIR);
		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
	case VFS_CONF:
		if (namelen != 3)
			return (ENOTDIR);	/* overloaded */
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[2])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
	}
	return (EOPNOTSUPP);
}

SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
	"Generic filesystem");

#ifndef NO_COMPAT_PRELITE2

static int
sysctl_ovfs_conf SYSCTL_HANDLER_ARGS
{
	int error;
	struct vfsconf *vfsp;
	struct ovfsconf ovfs;

	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
		strcpy(ovfs.vfc_name, vfsp->vfc_name);
		ovfs.vfc_index = vfsp->vfc_typenum;
		ovfs.vfc_refcount = vfsp->vfc_refcount;
		ovfs.vfc_flags = vfsp->vfc_flags;
		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
		if (error)
			return error;
	}
	return 0;
}

#endif /* !NO_COMPAT_PRELITE2 */

static volatile int kinfo_vdebug = 1;

#if 0
#define KINFO_VNODESLOP	10
/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
/* ARGSUSED */
static int
sysctl_vnode SYSCTL_HANDLER_ARGS
{
	struct proc *p = curproc;	/* XXX */
	struct mount *mp, *nmp;
	struct vnode *nvp, *vp;
	int error;

#define VPTRSZ	sizeof (struct vnode *)
#define VNODESZ	sizeof (struct vnode)

	req->lock = 0;
	if (!req->oldptr) /* Make an estimate */
		return (SYSCTL_OUT(req, 0,
		    (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = mp->mnt_list.cqe_next;
			continue;
		}
again:
		simple_lock(&mntvnode_slock);
		for (vp = mp->mnt_vnodelist.lh_first;
		     vp != NULL;
		     vp = nvp) {
2056 */ 2057 if (vp->v_mount != mp) { 2058 simple_unlock(&mntvnode_slock); 2059 if (kinfo_vdebug) 2060 printf("kinfo: vp changed\n"); 2061 goto again; 2062 } 2063 nvp = vp->v_mntvnodes.le_next; 2064 simple_unlock(&mntvnode_slock); 2065 if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || 2066 (error = SYSCTL_OUT(req, vp, VNODESZ))) 2067 return (error); 2068 simple_lock(&mntvnode_slock); 2069 } 2070 simple_unlock(&mntvnode_slock); 2071 simple_lock(&mountlist_slock); 2072 nmp = mp->mnt_list.cqe_next; 2073 vfs_unbusy(mp, p); 2074 } 2075 simple_unlock(&mountlist_slock); 2076 2077 return (0); 2078} 2079#endif 2080 2081/* 2082 * XXX 2083 * Exporting the vnode list on large systems causes them to crash. 2084 * Exporting the vnode list on medium systems causes sysctl to coredump. 2085 */ 2086#if 0 2087SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 2088 0, 0, sysctl_vnode, "S,vnode", ""); 2089#endif 2090 2091/* 2092 * Check to see if a filesystem is mounted on a block device. 2093 */ 2094int 2095vfs_mountedon(vp) 2096 struct vnode *vp; 2097{ 2098 struct vnode *vq; 2099 int error = 0; 2100 2101 if (vp->v_specmountpoint != NULL) 2102 return (EBUSY); 2103 if (vp->v_flag & VALIASED) { 2104 simple_lock(&spechash_slock); 2105 for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { 2106 if (vq->v_rdev != vp->v_rdev || 2107 vq->v_type != vp->v_type) 2108 continue; 2109 if (vq->v_specmountpoint != NULL) { 2110 error = EBUSY; 2111 break; 2112 } 2113 } 2114 simple_unlock(&spechash_slock); 2115 } 2116 return (error); 2117} 2118 2119/* 2120 * Unmount all filesystems. The list is traversed in reverse order 2121 * of mounting to avoid dependencies. 2122 */ 2123void 2124vfs_unmountall() 2125{ 2126 struct mount *mp, *nmp; 2127 struct proc *p = initproc; /* XXX XXX should this be proc0? */ 2128 int error; 2129 2130 /* 2131 * Since this only runs when rebooting, it is not interlocked. 2132 */ 2133 for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) { 2134 nmp = mp->mnt_list.cqe_prev; 2135 error = dounmount(mp, MNT_FORCE, p); 2136 if (error) { 2137 printf("unmount of %s failed (", 2138 mp->mnt_stat.f_mntonname); 2139 if (error == EBUSY) 2140 printf("BUSY)\n"); 2141 else 2142 printf("%d)\n", error); 2143 } 2144 } 2145} 2146 2147/* 2148 * Build hash lists of net addresses and hang them off the mount point. 2149 * Called by ufs_mount() to set up the lists of export addresses. 
2150 */ 2151static int 2152vfs_hang_addrlist(mp, nep, argp) 2153 struct mount *mp; 2154 struct netexport *nep; 2155 struct export_args *argp; 2156{ 2157 register struct netcred *np; 2158 register struct radix_node_head *rnh; 2159 register int i; 2160 struct radix_node *rn; 2161 struct sockaddr *saddr, *smask = 0; 2162 struct domain *dom; 2163 int error; 2164 2165 if (argp->ex_addrlen == 0) { 2166 if (mp->mnt_flag & MNT_DEFEXPORTED) 2167 return (EPERM); 2168 np = &nep->ne_defexported; 2169 np->netc_exflags = argp->ex_flags; 2170 np->netc_anon = argp->ex_anon; 2171 np->netc_anon.cr_ref = 1; 2172 mp->mnt_flag |= MNT_DEFEXPORTED; 2173 return (0); 2174 } 2175 i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; 2176 np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK); 2177 bzero((caddr_t) np, i); 2178 saddr = (struct sockaddr *) (np + 1); 2179 if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) 2180 goto out; 2181 if (saddr->sa_len > argp->ex_addrlen) 2182 saddr->sa_len = argp->ex_addrlen; 2183 if (argp->ex_masklen) { 2184 smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); 2185 error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen); 2186 if (error) 2187 goto out; 2188 if (smask->sa_len > argp->ex_masklen) 2189 smask->sa_len = argp->ex_masklen; 2190 } 2191 i = saddr->sa_family; 2192 if ((rnh = nep->ne_rtable[i]) == 0) { 2193 /* 2194 * Seems silly to initialize every AF when most are not used, 2195 * do so on demand here 2196 */ 2197 for (dom = domains; dom; dom = dom->dom_next) 2198 if (dom->dom_family == i && dom->dom_rtattach) { 2199 dom->dom_rtattach((void **) &nep->ne_rtable[i], 2200 dom->dom_rtoffset); 2201 break; 2202 } 2203 if ((rnh = nep->ne_rtable[i]) == 0) { 2204 error = ENOBUFS; 2205 goto out; 2206 } 2207 } 2208 rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, 2209 np->netc_rnodes); 2210 if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ 2211 error = EPERM; 2212 goto out; 2213 } 2214 np->netc_exflags = argp->ex_flags; 2215 np->netc_anon = argp->ex_anon; 2216 np->netc_anon.cr_ref = 1; 2217 return (0); 2218out: 2219 free(np, M_NETADDR); 2220 return (error); 2221} 2222 2223/* ARGSUSED */ 2224static int 2225vfs_free_netcred(rn, w) 2226 struct radix_node *rn; 2227 void *w; 2228{ 2229 register struct radix_node_head *rnh = (struct radix_node_head *) w; 2230 2231 (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); 2232 free((caddr_t) rn, M_NETADDR); 2233 return (0); 2234} 2235 2236/* 2237 * Free the net address hash lists that are hanging off the mount points. 
/* ARGSUSED */
static int
vfs_free_netcred(rn, w)
	struct radix_node *rn;
	void *w;
{
	register struct radix_node_head *rnh = (struct radix_node_head *) w;

	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
	free((caddr_t) rn, M_NETADDR);
	return (0);
}

/*
 * Free the net address hash lists that are hanging off the mount points.
 */
static void
vfs_free_addrlist(nep)
	struct netexport *nep;
{
	register int i;
	register struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if ((rnh = nep->ne_rtable[i])) {
			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
			    (caddr_t) rnh);
			free((caddr_t) rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

int
vfs_export(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		if (mp->mnt_flag & MNT_EXPUBLIC) {
			vfs_setpublicfs(NULL, NULL, NULL);
			mp->mnt_flag &= ~MNT_EXPUBLIC;
		}
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if (argp->ex_flags & MNT_EXPUBLIC) {
			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
				return (error);
			mp->mnt_flag |= MNT_EXPUBLIC;
		}
		if ((error = vfs_hang_addrlist(mp, nep, argp)))
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}
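
/*
 * Illustrative sketch (not compiled in): how vfs_export() is usually
 * reached.  A filesystem's mount entry point, handling an MNT_UPDATE
 * request that names no special device, forwards the export_args
 * embedded in its filesystem-specific arguments.  This paraphrases
 * the ufs_mount() pattern; ump and args are hypothetical locals.
 */
#if 0
	if ((mp->mnt_flag & MNT_UPDATE) && args.fspec == 0)
		return (vfs_export(mp, &ump->um_export, &args.export));
#endif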
/*
 * Set the publicly exported filesystem (WebNFS).  Currently, only
 * one public filesystem is possible in the spec (RFCs 2054 and 2055).
 */
int
vfs_setpublicfs(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;
	struct vnode *rvp;
	char *cp;

	/*
	 * mp == NULL -> invalidate the current info; the FS is
	 * no longer exported.  May be called from either vfs_export
	 * or unmount, so check whether it has already been done.
	 */
	if (mp == NULL) {
		if (nfs_pub.np_valid) {
			nfs_pub.np_valid = 0;
			if (nfs_pub.np_index != NULL) {
				FREE(nfs_pub.np_index, M_TEMP);
				nfs_pub.np_index = NULL;
			}
		}
		return (0);
	}

	/*
	 * Only one allowed at a time.
	 */
	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
		return (EBUSY);

	/*
	 * Get real filehandle for root of exported FS.
	 */
	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;

	if ((error = VFS_ROOT(mp, &rvp)))
		return (error);

	/* Don't leak the locked, referenced root vnode on error. */
	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) {
		vput(rvp);
		return (error);
	}

	vput(rvp);

	/*
	 * If an indexfile was specified, pull it in.
	 */
	if (argp->ex_indexfile != NULL) {
		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
		    M_WAITOK);
		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
		    MAXNAMLEN, (size_t *)0);
		if (!error) {
			/*
			 * Check for illegal filenames.
			 */
			for (cp = nfs_pub.np_index; *cp; cp++) {
				if (*cp == '/') {
					error = EINVAL;
					break;
				}
			}
		}
		if (error) {
			FREE(nfs_pub.np_index, M_TEMP);
			return (error);
		}
	}

	nfs_pub.np_mount = mp;
	nfs_pub.np_valid = 1;
	return (0);
}

struct netcred *
vfs_export_lookup(mp, nep, nam)
	register struct mount *mp;
	struct netexport *nep;
	struct sockaddr *nam;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
		 */
		if (nam != NULL) {
			saddr = nam;
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
					(*rnh->rnh_matchaddr)((caddr_t)saddr,
					    rnh);
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}
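
/*
 * Illustrative sketch (not compiled in): the consumer side of
 * vfs_export_lookup().  An NFS-style server maps the client's socket
 * address to a netcred and applies its export flags and anonymous
 * credentials.  Loosely modeled on the in-tree NFS server; the names
 * below are hypothetical.
 */
#if 0
	struct netcred *np;

	np = vfs_export_lookup(mp, nep, client_sa);
	if (np == NULL)
		return (EACCES);	/* not exported to this host */
	if ((np->netc_exflags & MNT_EXRDONLY) && wants_write)
		return (EROFS);		/* read-only export */
	cred = &np->netc_anon;		/* mapped (anonymous) credentials */
#endif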
/*
 * Perform msync on all vnodes under a mount point.
 * The mount point must be locked.
 */
void
vfs_msync(struct mount *mp, int flags)
{
	struct vnode *vp, *nvp;
	int anyio, tries;

	tries = 5;
loop:
	anyio = 0;
	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {
		nvp = vp->v_mntvnodes.le_next;

		if (vp->v_mount != mp) {
			/* The list changed under us; start over. */
			goto loop;
		}

		if ((vp->v_flag & VXLOCK) ||
		    (VOP_ISLOCKED(vp) && (flags != MNT_WAIT))) {
			continue;
		}

		simple_lock(&vp->v_interlock);
		if (vp->v_object &&
		    (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
			if (!vget(vp,
			    LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ,
			    curproc)) {
				if (vp->v_object) {
					vm_object_page_clean(vp->v_object,
					    0, 0, TRUE);
					anyio = 1;
				}
				vput(vp);
			}
		} else {
			simple_unlock(&vp->v_interlock);
		}
	}
	if (anyio && (--tries > 0))
		goto loop;
}

/*
 * Create the VM object needed for VMIO and mmap support.  This
 * is done for all VREG files in the system.  Some filesystems can
 * also take advantage of the additional metadata buffering of the
 * VMIO code by putting the device vnode into VMIO mode as well.
 *
 * If !waslocked, must be called with the interlock held.
 */
int
vfs_object_create(vp, p, cred, waslocked)
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
	int waslocked;
{
	struct vattr vat;
	vm_object_t object;
	int error = 0;

	if ((vp->v_type != VREG) && (vp->v_type != VBLK)) {
		if (!waslocked)
			simple_unlock(&vp->v_interlock);
		return 0;
	}

	if (!waslocked)
		vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY, p);

retry:
	if ((object = vp->v_object) == NULL) {
		if (vp->v_type == VREG) {
			if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
				goto retn;
			object = vnode_pager_alloc(vp,
			    OFF_TO_IDX(round_page(vat.va_size)), 0, 0);
		} else if (major(vp->v_rdev) < nblkdev) {
			/*
			 * This simply allocates the biggest object possible
			 * for a VBLK vnode.  This should be fixed, but doesn't
			 * cause any problems (yet).
			 */
			object = vnode_pager_alloc(vp, INT_MAX, 0, 0);
		}
		/*
		 * Drop the extra references taken by vnode_pager_alloc().
		 * Guard against the VBLK case above having allocated no
		 * object (major >= nblkdev), which would otherwise
		 * dereference a NULL pointer here.
		 */
		if (object != NULL) {
			object->ref_count--;
			vp->v_usecount--;
		}
	} else {
		if (object->flags & OBJ_DEAD) {
			VOP_UNLOCK(vp, 0, p);
			tsleep(object, PVM, "vodead", 0);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			goto retry;
		}
	}

	if (vp->v_object) {
		vp->v_flag |= VOBJBUF;
	}

retn:
	if (!waslocked) {
		simple_lock(&vp->v_interlock);
		VOP_UNLOCK(vp, LK_INTERLOCK, p);
	}

	return error;
}

static void
vfree(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	if (vp->v_flag & VTBFREE) {
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		vp->v_flag &= ~VTBFREE;
	}
	if (vp->v_flag & VAGE) {
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	} else {
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	}
	freevnodes++;
	simple_unlock(&vnode_free_list_slock);
	vp->v_flag &= ~VAGE;
	vp->v_flag |= VFREE;
	splx(s);
}

void
vbusy(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	if (vp->v_flag & VTBFREE) {
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		vp->v_flag &= ~VTBFREE;
	} else {
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		freevnodes--;
	}
	simple_unlock(&vnode_free_list_slock);
	vp->v_flag &= ~(VFREE|VAGE);
	splx(s);
}

/*
 * Record a process's interest in events which might happen to
 * a vnode.  Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions.  (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
int
vn_pollrecord(vp, p, events)
	struct vnode *vp;
	struct proc *p;
	short events;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_revents & events) {
		/*
		 * This leaves events we are not interested
		 * in available for the other process which
		 * presumably had requested them
		 * (otherwise they would never have been
		 * recorded).
		 */
		events &= vp->v_pollinfo.vpi_revents;
		vp->v_pollinfo.vpi_revents &= ~events;

		simple_unlock(&vp->v_pollinfo.vpi_lock);
		return events;
	}
	vp->v_pollinfo.vpi_events |= events;
	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
	simple_unlock(&vp->v_pollinfo.vpi_lock);
	return 0;
}
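
/*
 * Illustrative sketch (not compiled in): how the poll hooks fit
 * together.  A filesystem's VOP_POLL checks for pending events and
 * records interest via vn_pollrecord(); whoever later changes the
 * vnode posts the event with vn_pollevent(), defined below.  All
 * names here are hypothetical.
 */
#if 0
static int
example_poll(ap)
	struct vop_poll_args /* {
		struct vnode *a_vp;
		int a_events;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{

	/* Report pending events, or record interest for later. */
	return (vn_pollrecord(ap->a_vp, ap->a_p, ap->a_events));
}

	/* ...and on the writer side, after modifying the vnode: */
	vn_pollevent(vp, POLLIN | POLLRDNORM);
#endif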
/*
 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
 * it is possible for us to miss an event due to race conditions, but
 * that condition is expected to be rare, so for the moment it is the
 * preferred interface.
 */
void
vn_pollevent(vp, events)
	struct vnode *vp;
	short events;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events & events) {
		/*
		 * We clear vpi_events so that we don't
		 * call selwakeup() twice if two events are
		 * posted before the polling process(es) is
		 * awakened.  This also ensures that we take at
		 * most one selwakeup() if the polling process
		 * is no longer interested.  However, it does
		 * mean that only one event can be noticed at
		 * a time.  (Perhaps we should only clear those
		 * event bits which we note?) XXX
		 */
		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
		vp->v_pollinfo.vpi_revents |= events;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	simple_unlock(&vp->v_pollinfo.vpi_lock);
}

/*
 * Wake up anyone polling on vp because it is being revoked.
 * This depends on dead_poll() returning POLLHUP for correct
 * behavior.
 */
void
vn_pollgone(vp)
	struct vnode *vp;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events) {
		vp->v_pollinfo.vpi_events = 0;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	simple_unlock(&vp->v_pollinfo.vpi_lock);
}

/*
 * Routine to create and manage a filesystem syncer vnode.
 */
#define sync_close ((int (*) __P((struct vop_close_args *)))nullop)
int	sync_fsync __P((struct vop_fsync_args *));
int	sync_inactive __P((struct vop_inactive_args *));
int	sync_reclaim __P((struct vop_reclaim_args *));
#define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock)
#define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock)
int	sync_print __P((struct vop_print_args *));
#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)

vop_t **sync_vnodeop_p;
struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
	{ NULL, NULL }
};
struct vnodeopv_desc sync_vnodeop_opv_desc =
	{ &sync_vnodeop_p, sync_vnodeop_entries };

VNODEOP_SET(sync_vnodeop_opv_desc);

/*
 * Create a new filesystem syncer vnode for the specified mount point.
 */
int
vfs_allocate_syncvnode(mp)
	struct mount *mp;
{
	struct vnode *vp;
	static long start, incr, next;
	int error;

	/* Allocate a new vnode */
	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
		mp->mnt_syncer = NULL;
		return (error);
	}
	vp->v_type = VNON;
	/*
	 * Place the vnode onto the syncer worklist.  We attempt to
	 * scatter them about on the list so that they will go off
	 * at evenly distributed times even if all the filesystems
	 * are mounted at once.
	 */
	next += incr;
	if (next == 0 || next > syncer_maxdelay) {
		start /= 2;
		incr /= 2;
		if (start == 0) {
			start = syncer_maxdelay / 2;
			incr = syncer_maxdelay;
		}
		next = start;
	}
	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
	mp->mnt_syncer = vp;
	return (0);
}
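
/*
 * Worked example of the scattering logic above, assuming the default
 * syncer_maxdelay of 32 (start, incr and next all begin at zero; the
 * vnode is queued at slot next % syncdelay):
 *
 *	call 1:  next = 0             -> start = 16, incr = 32, next = 16
 *	call 2:  next = 16 + 32 = 48  -> start =  8, incr = 16, next =  8
 *	call 3:  next =  8 + 16 = 24  ->                        next = 24
 *	call 4:  next = 24 + 16 = 40  -> start =  4, incr =  8, next =  4
 *	calls 5-7:                                    next = 12, 20, 28
 *
 * Each halving pass fills in the midpoints between the slots handed
 * out by the previous pass, so syncer vnodes spread over the wheel.
 */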
/*
 * Do a lazy sync of the filesystem.
 */
int
sync_fsync(ap)
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		struct ucred *a_cred;
		int a_waitfor;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *syncvp = ap->a_vp;
	struct mount *mp = syncvp->v_mount;
	struct proc *p = ap->a_p;
	int asyncflag;

	/*
	 * We only need to do something if this is a lazy evaluation.
	 */
	if (ap->a_waitfor != MNT_LAZY)
		return (0);

	/*
	 * Move ourselves to the back of the sync list.
	 */
	vn_syncer_add_to_worklist(syncvp, syncdelay);

	/*
	 * Walk the list of vnodes pushing all that are dirty and
	 * not already on the sync list.
	 */
	simple_lock(&mountlist_slock);
	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0)
		return (0);
	asyncflag = mp->mnt_flag & MNT_ASYNC;
	mp->mnt_flag &= ~MNT_ASYNC;
	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
	if (asyncflag)
		mp->mnt_flag |= MNT_ASYNC;
	vfs_unbusy(mp, p);
	return (0);
}

/*
 * The syncer vnode is no longer referenced.
 */
int
sync_inactive(ap)
	struct vop_inactive_args /* {
		struct vnode *a_vp;
		struct proc *a_p;
	} */ *ap;
{

	vgone(ap->a_vp);
	return (0);
}

/*
 * The syncer vnode is no longer needed and is being decommissioned.
 */
int
sync_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;

	vp->v_mount->mnt_syncer = NULL;
	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
		vp->v_flag &= ~VONWORKLST;
	}

	return (0);
}

/*
 * Print out a syncer vnode.
 */
int
sync_print(ap)
	struct vop_print_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;

	printf("syncer vnode");
	if (vp->v_vnlock != NULL)
		lockmgr_printinfo(vp->v_vnlock);
	printf("\n");
	return (0);
}
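
/*
 * Illustrative sketch (not compiled in): the loop that drives
 * sync_fsync().  Once a second the syncer daemon empties the current
 * bucket of the workitem queue and fsyncs each vnode with MNT_LAZY;
 * for a per-mount syncer vnode that call resolves to sync_fsync()
 * above.  This is a simplified paraphrase of the in-tree sched_sync()
 * loop, not a verbatim copy; slp, vp and p are hypothetical locals.
 */
#if 0
	slp = &syncer_workitem_pending[syncer_delayno];
	while ((vp = slp->lh_first) != NULL) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
		(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
		VOP_UNLOCK(vp, 0, p);
	}
#endif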