vfs_subr.c revision 49101
/*
 * Copyright (c) 1989, 1993
 *      The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)vfs_subr.c  8.31 (Berkeley) 5/26/95
 * $Id: vfs_subr.c,v 1.213 1999/07/20 09:47:44 phk Exp $
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/socket.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/domain.h>
#include <sys/dirent.h>
#include <sys/vmmeter.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_prot.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_zone.h>
#include <sys/sysctl.h>

#include <miscfs/specfs/specdev.h>

static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

static struct vnode *checkalias2 __P((struct vnode *nvp, dev_t dev, struct mount *mp));
static void     insmntque __P((struct vnode *vp, struct mount *mp));
static void     vclean __P((struct vnode *vp, int flags, struct proc *p));
static void     vfree __P((struct vnode *));
static void     vgonel __P((struct vnode *vp, struct proc *p));
static unsigned long    numvnodes;
SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");

enum vtype iftovt_tab[16] = {
        VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
        VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
        0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
        S_IFSOCK, S_IFIFO, S_IFMT,
};

static TAILQ_HEAD(freelst, vnode) vnode_free_list;      /* vnode free list */
struct tobefreelist vnode_tobefree_list;        /* vnodes to be freed */

static u_long wantfreevnodes = 25;
SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
static u_long freevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");

static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
static int reassignbufloops;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
static int reassignbufsortgood;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
static int reassignbufsortbad;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");

#ifdef ENABLE_VFS_IOOPT
int vfs_ioopt = 0;
SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
#endif

struct mntlist mountlist;       /* mounted filesystem list */
struct simplelock mountlist_slock;
struct simplelock mntvnode_slock;
int     nfs_mount_type = -1;
#ifndef NULL_SIMPLELOCKS
static struct simplelock mntid_slock;
static struct simplelock vnode_free_list_slock;
static struct simplelock spechash_slock;
#endif
struct nfs_public nfs_pub;      /* publicly exported FS */
static vm_zone_t vnode_zone;

/*
 * The workitem queue.
 */
#define SYNCER_MAXDELAY         32
static int syncer_maxdelay = SYNCER_MAXDELAY;   /* maximum delay time */
time_t syncdelay = 30;          /* max time to delay syncing data */
time_t filedelay = 30;          /* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
time_t dirdelay = 29;           /* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
time_t metadelay = 28;          /* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
static int rushjob;             /* number of slots to run ASAP */
static int stat_rush_requests;  /* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");

static int syncer_delayno = 0;
static long syncer_mask;
LIST_HEAD(synclist, vnode);
static struct synclist *syncer_workitem_pending;

int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
    &desiredvnodes, 0, "Maximum number of vnodes");

static void     vfs_free_addrlist __P((struct netexport *nep));
static int      vfs_free_netcred __P((struct radix_node *rn, void *w));
static int      vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
                                       struct export_args *argp));

/*
 * Initialize the vnode management data structures.
 */
void
vntblinit()
{

        desiredvnodes = maxproc + cnt.v_page_count / 4;
        simple_lock_init(&mntvnode_slock);
        simple_lock_init(&mntid_slock);
        simple_lock_init(&spechash_slock);
        TAILQ_INIT(&vnode_free_list);
        TAILQ_INIT(&vnode_tobefree_list);
        simple_lock_init(&vnode_free_list_slock);
        CIRCLEQ_INIT(&mountlist);
        vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
        /*
         * Initialize the filesystem syncer.
         */
        syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
            &syncer_mask);
        syncer_maxdelay = syncer_mask + 1;
}

/*
 * Mark a mount point as busy.  Used to synchronize access and to delay
 * unmounting.  Interlock is not released on failure.
 */
int
vfs_busy(mp, flags, interlkp, p)
        struct mount *mp;
        int flags;
        struct simplelock *interlkp;
        struct proc *p;
{
        int lkflags;

        if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
                if (flags & LK_NOWAIT)
                        return (ENOENT);
                mp->mnt_kern_flag |= MNTK_MWAIT;
                if (interlkp) {
                        simple_unlock(interlkp);
                }
                /*
                 * Since all busy locks are shared except the exclusive
                 * lock granted when unmounting, the only place that a
                 * wakeup needs to be done is at the release of the
                 * exclusive lock at the end of dounmount.
                 */
                tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
                if (interlkp) {
                        simple_lock(interlkp);
                }
                return (ENOENT);
        }
        lkflags = LK_SHARED | LK_NOPAUSE;
        if (interlkp)
                lkflags |= LK_INTERLOCK;
        if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
                panic("vfs_busy: unexpected lock failure");
        return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(mp, p)
        struct mount *mp;
        struct proc *p;
{

        lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
}
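
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the usual pattern for walking the mount list safely is to vfs_busy()
 * each mount point with the mountlist interlock held, and vfs_unbusy()
 * it when done.  The DB_SHOW_COMMAND(lockedvnodes) routine later in this
 * file uses exactly this idiom.
 */
#if 0
        struct mount *mp, *nmp;
        struct proc *p = curproc;

        simple_lock(&mountlist_slock);
        for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
                if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
                        /* Mount point is being unmounted; skip it. */
                        nmp = mp->mnt_list.cqe_next;
                        continue;
                }
                /* ... examine mp->mnt_vnodelist here ... */
                simple_lock(&mountlist_slock);
                nmp = mp->mnt_list.cqe_next;
                vfs_unbusy(mp, p);
        }
        simple_unlock(&mountlist_slock);
#endif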

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(fstypename, devname, mpp)
        char *fstypename;
        char *devname;
        struct mount **mpp;
{
        struct proc *p = curproc;       /* XXX */
        struct vfsconf *vfsp;
        struct mount *mp;

        if (fstypename == NULL)
                return (ENODEV);
        for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
                if (!strcmp(vfsp->vfc_name, fstypename))
                        break;
        if (vfsp == NULL)
                return (ENODEV);
        mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
        bzero((char *)mp, (u_long)sizeof(struct mount));
        lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
        (void)vfs_busy(mp, LK_NOWAIT, 0, p);
        LIST_INIT(&mp->mnt_vnodelist);
        mp->mnt_vfc = vfsp;
        mp->mnt_op = vfsp->vfc_vfsops;
        mp->mnt_flag = MNT_RDONLY;
        mp->mnt_vnodecovered = NULLVP;
        vfsp->vfc_refcount++;
        mp->mnt_stat.f_type = vfsp->vfc_typenum;
        mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
        strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
        mp->mnt_stat.f_mntonname[0] = '/';
        mp->mnt_stat.f_mntonname[1] = 0;
        (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
        *mpp = mp;
        return (0);
}

/*
 * Find an appropriate filesystem to use for the root.  If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 */
#ifdef notdef   /* XXX JH */
int
lite2_vfs_mountroot()
{
        struct vfsconf *vfsp;
        extern int (*lite2_mountroot) __P((void));
        int error;

        if (lite2_mountroot != NULL)
                return ((*lite2_mountroot)());
        for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
                if (vfsp->vfc_mountroot == NULL)
                        continue;
                if ((error = (*vfsp->vfc_mountroot)()) == 0)
                        return (0);
                printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
        }
        return (ENODEV);
}
#endif

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid)
        fsid_t *fsid;
{
        register struct mount *mp;

        simple_lock(&mountlist_slock);
        for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
            mp = mp->mnt_list.cqe_next) {
                if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
                    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
                        simple_unlock(&mountlist_slock);
                        return (mp);
                }
        }
        simple_unlock(&mountlist_slock);
        return ((struct mount *) 0);
}

/*
 * Get a new unique fsid
 */
void
vfs_getnewfsid(mp)
        struct mount *mp;
{
        static u_short xxxfs_mntid;

        fsid_t tfsid;
        int mtype;

        simple_lock(&mntid_slock);
        mtype = mp->mnt_vfc->vfc_typenum;
        mp->mnt_stat.f_fsid.val[0] = makeudev(255, mtype);
        mp->mnt_stat.f_fsid.val[1] = mtype;
        if (xxxfs_mntid == 0)
                ++xxxfs_mntid;
        tfsid.val[0] = makeudev(255, mtype + (xxxfs_mntid << 16));
        tfsid.val[1] = mtype;
        if (mountlist.cqh_first != (void *)&mountlist) {
                while (vfs_getvfs(&tfsid)) {
                        xxxfs_mntid++;
                        tfsid.val[0] = makeudev(255,
                            mtype + (xxxfs_mntid << 16));
                }
        }
        mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
        simple_unlock(&mntid_slock);
}
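
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the fsid generated above packs the filesystem type into both words.
 * val[0] is a udev built from major 255 and a minor that combines the
 * type number with a disambiguating counter in the upper 16 bits; val[1]
 * is the bare type number.  vfs_getvfs() compares both words when
 * searching, which is why the loop above keeps bumping the counter
 * until the candidate fsid is unique.
 */
#if 0
        fsid_t f;
        int mtype = mp->mnt_vfc->vfc_typenum;

        f.val[0] = makeudev(255, mtype + (xxxfs_mntid << 16));
        f.val[1] = mtype;       /* both words must match in vfs_getvfs() */
#endif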

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
        register struct vattr *vap;
{

        vap->va_type = VNON;
        vap->va_size = VNOVAL;
        vap->va_bytes = VNOVAL;
        vap->va_mode = VNOVAL;
        vap->va_nlink = VNOVAL;
        vap->va_uid = VNOVAL;
        vap->va_gid = VNOVAL;
        vap->va_fsid = VNOVAL;
        vap->va_fileid = VNOVAL;
        vap->va_blocksize = VNOVAL;
        vap->va_rdev = VNOVAL;
        vap->va_atime.tv_sec = VNOVAL;
        vap->va_atime.tv_nsec = VNOVAL;
        vap->va_mtime.tv_sec = VNOVAL;
        vap->va_mtime.tv_nsec = VNOVAL;
        vap->va_ctime.tv_sec = VNOVAL;
        vap->va_ctime.tv_nsec = VNOVAL;
        vap->va_flags = VNOVAL;
        vap->va_gen = VNOVAL;
        vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern vop_t **dead_vnodeop_p;

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
        enum vtagtype tag;
        struct mount *mp;
        vop_t **vops;
        struct vnode **vpp;
{
        int s;
        struct proc *p = curproc;       /* XXX */
        struct vnode *vp, *tvp, *nvp;
        vm_object_t object;
        TAILQ_HEAD(freelst, vnode) vnode_tmp_list;

        /*
         * We take the least recently used vnode from the freelist
         * if we can get it and it has no cached pages, and no
         * namecache entries are relative to it.
         * Otherwise we allocate a new vnode.
         */

        s = splbio();
        simple_lock(&vnode_free_list_slock);
        TAILQ_INIT(&vnode_tmp_list);

        for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) {
                nvp = TAILQ_NEXT(vp, v_freelist);
                TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
                if (vp->v_flag & VAGE) {
                        TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
                } else {
                        TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
                }
                vp->v_flag &= ~(VTBFREE|VAGE);
                vp->v_flag |= VFREE;
                if (vp->v_usecount)
                        panic("tobe free vnode isn't");
                freevnodes++;
        }

        if (wantfreevnodes && freevnodes < wantfreevnodes) {
                vp = NULL;
        } else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
                /*
                 * XXX: this is only here to be backwards compatible
                 */
                vp = NULL;
        } else {
                for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) {
                        nvp = TAILQ_NEXT(vp, v_freelist);
                        if (!simple_lock_try(&vp->v_interlock))
                                continue;
                        if (vp->v_usecount)
                                panic("free vnode isn't");

                        object = vp->v_object;
                        if (object && (object->resident_page_count || object->ref_count)) {
                                printf("object inconsistent state: RPC: %d, RC: %d\n",
                                    object->resident_page_count, object->ref_count);
                                /* Don't recycle if it's caching some pages */
                                TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
                                TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist);
                                continue;
                        } else if (LIST_FIRST(&vp->v_cache_src)) {
                                /* Don't recycle if active in the namecache */
                                simple_unlock(&vp->v_interlock);
                                continue;
                        } else {
                                break;
                        }
                }
        }

        for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) {
                nvp = TAILQ_NEXT(tvp, v_freelist);
                TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist);
                TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist);
                simple_unlock(&tvp->v_interlock);
        }

        if (vp) {
                vp->v_flag |= VDOOMED;
                TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
                freevnodes--;
                simple_unlock(&vnode_free_list_slock);
                cache_purge(vp);
                vp->v_lease = NULL;
                if (vp->v_type != VBAD) {
                        vgonel(vp, p);
                } else {
                        simple_unlock(&vp->v_interlock);
                }

#ifdef INVARIANTS
                {
                        int s;

                        if (vp->v_data)
                                panic("cleaned vnode isn't");
                        s = splbio();
                        if (vp->v_numoutput)
                                panic("Clean vnode has pending I/O's");
                        splx(s);
                }
#endif
                vp->v_flag = 0;
                vp->v_lastr = 0;
                vp->v_lastw = 0;
                vp->v_lasta = 0;
                vp->v_cstart = 0;
                vp->v_clen = 0;
                vp->v_socket = 0;
                vp->v_writecount = 0;   /* XXX */
                vp->v_maxio = 0;
        } else {
                simple_unlock(&vnode_free_list_slock);
                vp = (struct vnode *) zalloc(vnode_zone);
                bzero((char *) vp, sizeof *vp);
                simple_lock_init(&vp->v_interlock);
                vp->v_dd = vp;
                cache_purge(vp);
                LIST_INIT(&vp->v_cache_src);
                TAILQ_INIT(&vp->v_cache_dst);
                numvnodes++;
        }

        TAILQ_INIT(&vp->v_cleanblkhd);
        TAILQ_INIT(&vp->v_dirtyblkhd);
        vp->v_type = VNON;
        vp->v_tag = tag;
        vp->v_op = vops;
        insmntque(vp, mp);
        *vpp = vp;
        vp->v_usecount = 1;
        vp->v_data = 0;
        splx(s);

        vfs_object_create(vp, p, p->p_ucred);
        return (0);
}

/*
 * Move a vnode from one mount queue to another.
 */
static void
insmntque(vp, mp)
        register struct vnode *vp;
        register struct mount *mp;
{

        simple_lock(&mntvnode_slock);
        /*
         * Delete from old mount point vnode list, if on one.
         */
        if (vp->v_mount != NULL)
                LIST_REMOVE(vp, v_mntvnodes);
        /*
         * Insert into list of vnodes for the new mount point, if available.
         */
        if ((vp->v_mount = mp) == NULL) {
                simple_unlock(&mntvnode_slock);
                return;
        }
        LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
        simple_unlock(&mntvnode_slock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(bp)
        register struct buf *bp;
{
        register struct vnode *vp;

        bp->b_flags &= ~B_WRITEINPROG;
        if ((vp = bp->b_vp)) {
                vp->v_numoutput--;
                if (vp->v_numoutput < 0)
                        panic("vwakeup: neg numoutput");
                if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
                        vp->v_flag &= ~VBWAIT;
                        wakeup((caddr_t) &vp->v_numoutput);
                }
        }
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
        register struct vnode *vp;
        int flags;
        struct ucred *cred;
        struct proc *p;
        int slpflag, slptimeo;
{
        register struct buf *bp;
        struct buf *nbp, *blist;
        int s, error;
        vm_object_t object;

        if (flags & V_SAVE) {
                s = splbio();
                while (vp->v_numoutput) {
                        vp->v_flag |= VBWAIT;
                        error = tsleep((caddr_t)&vp->v_numoutput,
                            slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
                        if (error) {
                                splx(s);
                                return (error);
                        }
                }
                if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
                        splx(s);
                        if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
                                return (error);
                        s = splbio();
                        if (vp->v_numoutput > 0 ||
                            !TAILQ_EMPTY(&vp->v_dirtyblkhd))
                                panic("vinvalbuf: dirty bufs");
                }
                splx(s);
        }
        s = splbio();
        for (;;) {
                blist = TAILQ_FIRST(&vp->v_cleanblkhd);
                if (!blist)
                        blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
                if (!blist)
                        break;

                for (bp = blist; bp; bp = nbp) {
                        nbp = TAILQ_NEXT(bp, b_vnbufs);
                        if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
                                error = BUF_TIMELOCK(bp,
                                    LK_EXCLUSIVE | LK_SLEEPFAIL,
                                    "vinvalbuf", slpflag, slptimeo);
                                if (error == ENOLCK)
                                        break;
                                splx(s);
                                return (error);
                        }
                        /*
                         * XXX Since there are no node locks for NFS, I
                         * believe there is a slight chance that a delayed
                         * write will occur while sleeping just above, so
                         * check for it.  Note that vfs_bio_awrite expects
                         * buffers to reside on a queue, while VOP_BWRITE and
                         * brelse do not.
                         */
                        if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
                            (flags & V_SAVE)) {

                                if (bp->b_vp == vp) {
                                        if (bp->b_flags & B_CLUSTEROK) {
                                                BUF_UNLOCK(bp);
                                                vfs_bio_awrite(bp);
                                        } else {
                                                bremfree(bp);
                                                bp->b_flags |= B_ASYNC;
                                                VOP_BWRITE(bp->b_vp, bp);
                                        }
                                } else {
                                        bremfree(bp);
                                        (void) VOP_BWRITE(bp->b_vp, bp);
                                }
                                break;
                        }
                        bremfree(bp);
                        bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
                        bp->b_flags &= ~B_ASYNC;
                        brelse(bp);
                }
        }

        while (vp->v_numoutput > 0) {
                vp->v_flag |= VBWAIT;
                tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
        }

        splx(s);

        /*
         * Destroy the copy in the VM cache, too.
         */
        simple_lock(&vp->v_interlock);
        object = vp->v_object;
        if (object != NULL) {
                vm_object_page_remove(object, 0, 0,
                    (flags & V_SAVE) ? TRUE : FALSE);
        }
        simple_unlock(&vp->v_interlock);

        if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
                panic("vinvalbuf: flush failed");
        return (0);
}
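
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a typical caller, such as vclean() later in this file, passes V_SAVE
 * so that dirty buffers are synced before everything is invalidated.
 * slpflag and slptimeo of 0 mean the flush sleeps uninterruptibly with
 * no timeout.
 */
#if 0
        error = vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
        if (error)
                return (error); /* interrupted or timed out sleep */
#endif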

/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 */
int
vtruncbuf(vp, cred, p, length, blksize)
        register struct vnode *vp;
        struct ucred *cred;
        struct proc *p;
        off_t length;
        int blksize;
{
        register struct buf *bp;
        struct buf *nbp;
        int s, anyfreed;
        int trunclbn;

        /*
         * Round up to the *next* lbn.
         */
        trunclbn = (length + blksize - 1) / blksize;

        s = splbio();
restart:
        anyfreed = 1;
        for (;anyfreed;) {
                anyfreed = 0;
                for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
                        nbp = TAILQ_NEXT(bp, b_vnbufs);
                        if (bp->b_lblkno >= trunclbn) {
                                if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
                                        BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
                                        goto restart;
                                } else {
                                        bremfree(bp);
                                        bp->b_flags |= (B_INVAL | B_RELBUF);
                                        bp->b_flags &= ~B_ASYNC;
                                        brelse(bp);
                                        anyfreed = 1;
                                }
                                if (nbp && (((nbp->b_xflags & B_VNCLEAN) == 0)||
                                    (nbp->b_vp != vp) ||
                                    (nbp->b_flags & B_DELWRI))) {
                                        goto restart;
                                }
                        }
                }

                for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
                        nbp = TAILQ_NEXT(bp, b_vnbufs);
                        if (bp->b_lblkno >= trunclbn) {
                                if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
                                        BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
                                        goto restart;
                                } else {
                                        bremfree(bp);
                                        bp->b_flags |= (B_INVAL | B_RELBUF);
                                        bp->b_flags &= ~B_ASYNC;
                                        brelse(bp);
                                        anyfreed = 1;
                                }
                                if (nbp && (((nbp->b_xflags & B_VNDIRTY) == 0)||
                                    (nbp->b_vp != vp) ||
                                    (nbp->b_flags & B_DELWRI) == 0)) {
                                        goto restart;
                                }
                        }
                }
        }

        if (length > 0) {
restartsync:
                for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
                        nbp = TAILQ_NEXT(bp, b_vnbufs);
                        if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
                                if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
                                        BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
                                        goto restart;
                                } else {
                                        bremfree(bp);
                                        if (bp->b_vp == vp) {
                                                bp->b_flags |= B_ASYNC;
                                        } else {
                                                bp->b_flags &= ~B_ASYNC;
                                        }
                                        VOP_BWRITE(bp->b_vp, bp);
                                }
                                goto restartsync;
                        }

                }
        }

        while (vp->v_numoutput > 0) {
                vp->v_flag |= VBWAIT;
                tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
        }

        splx(s);

        vnode_pager_setsize(vp, length);

        return (0);
}
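
/*
 * Worked example (editor's addition, not part of the original file):
 * with blksize 8192, truncating to length 8192 gives
 * trunclbn = (8192 + 8191) / 8192 = 1, so lbn 0 survives and lbns >= 1
 * are invalidated; truncating to length 8193 gives trunclbn = 2, so
 * lbns 0 and 1 survive.  The round-up ensures a partially valid final
 * block is never thrown away.
 */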

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(vp, bp)
        register struct vnode *vp;
        register struct buf *bp;
{
        int s;

        KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));

        vhold(vp);
        bp->b_vp = vp;
        if (vp->v_type == VBLK || vp->v_type == VCHR)
                bp->b_dev = vp->v_rdev;
        else
                bp->b_dev = NODEV;
        /*
         * Insert onto list for new vnode.
         */
        s = splbio();
        bp->b_xflags |= B_VNCLEAN;
        bp->b_xflags &= ~B_VNDIRTY;
        TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
        splx(s);
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(bp)
        register struct buf *bp;
{
        struct vnode *vp;
        struct buflists *listheadp;
        int s;

        KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

        /*
         * Delete from old vnode list, if on one.
         */
        vp = bp->b_vp;
        s = splbio();
        if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) {
                if (bp->b_xflags & B_VNDIRTY)
                        listheadp = &vp->v_dirtyblkhd;
                else
                        listheadp = &vp->v_cleanblkhd;
                TAILQ_REMOVE(listheadp, bp, b_vnbufs);
                bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN);
        }
        if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
                vp->v_flag &= ~VONWORKLST;
                LIST_REMOVE(vp, v_synclist);
        }
        splx(s);
        bp->b_vp = (struct vnode *) 0;
        vdrop(vp);
}

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To realize this,
 * we append vnodes to a "workitem" queue.  When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds.  Thus, filesystems mounted on
 * block devices are delayed only about half the time that file data is
 * delayed.  Similarly, directory updates are more critical, so they are
 * delayed only about a third of the time that file data is delayed.
 * Thus, there are SYNCER_MAXDELAY queues that are processed round-robin
 * at a rate of one each second (driven off the filesystem syncer
 * process).  The syncer_delayno variable indicates the next queue that
 * is to be processed.  Items that need to be processed soon are placed
 * in this queue:
 *
 *      syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *      syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */
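
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * with SYNCER_MAXDELAY = 32 the mask is 31, so a vnode queued with a
 * delay of 15 while syncer_delayno is 20 lands in slot
 * (20 + 15) & 31 = 3, i.e. it will be visited 15 one-second passes from
 * now as the ring wraps around.
 */
#if 0
        slot = (syncer_delayno + delay) & syncer_mask;
        LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
#endif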

/*
 * Add an item to the syncer work queue.
 */
static void
vn_syncer_add_to_worklist(struct vnode *vp, int delay)
{
        int s, slot;

        s = splbio();

        if (vp->v_flag & VONWORKLST) {
                LIST_REMOVE(vp, v_synclist);
        }

        if (delay > syncer_maxdelay - 2)
                delay = syncer_maxdelay - 2;
        slot = (syncer_delayno + delay) & syncer_mask;

        LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
        vp->v_flag |= VONWORKLST;
        splx(s);
}

struct proc *updateproc;
static void sched_sync __P((void));
static struct kproc_desc up_kp = {
        "syncer",
        sched_sync,
        &updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)

/*
 * System filesystem synchronizer daemon.
 */
void
sched_sync(void)
{
        struct synclist *slp;
        struct vnode *vp;
        long starttime;
        int s;
        struct proc *p = updateproc;

        p->p_flag |= P_BUFEXHAUST;

        for (;;) {
                starttime = time_second;

                /*
                 * Push files whose dirty time has expired.  Be careful
                 * of interrupt race on slp queue.
                 */
                s = splbio();
                slp = &syncer_workitem_pending[syncer_delayno];
                syncer_delayno += 1;
                if (syncer_delayno == syncer_maxdelay)
                        syncer_delayno = 0;
                splx(s);

                while ((vp = LIST_FIRST(slp)) != NULL) {
                        if (VOP_ISLOCKED(vp) == 0) {
                                vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
                                (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
                                VOP_UNLOCK(vp, 0, p);
                        }
                        s = splbio();
                        if (LIST_FIRST(slp) == vp) {
                                /*
                                 * Note: v_tag VT_VFS vps can remain on the
                                 * worklist too with no dirty blocks, but
                                 * since sync_fsync() moves it to a different
                                 * slot we are safe.
                                 */
                                if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
                                    vp->v_type != VBLK)
                                        panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
                                /*
                                 * Put us back on the worklist.  The worklist
                                 * routine will remove us from our current
                                 * position and then add us back in at a later
                                 * position.
                                 */
                                vn_syncer_add_to_worklist(vp, syncdelay);
                        }
                        splx(s);
                }

                /*
                 * Do soft update processing.
                 */
                if (bioops.io_sync)
                        (*bioops.io_sync)(NULL);

                /*
                 * The variable rushjob allows the kernel to speed up the
                 * processing of the filesystem syncer process.  A rushjob
                 * value of N tells the filesystem syncer to process the next
                 * N seconds worth of work on its queue ASAP.  Currently rushjob
                 * is used by the soft update code to speed up the filesystem
                 * syncer process when the incore state is getting so far
                 * ahead of the disk that the kernel memory pool is being
                 * threatened with exhaustion.
                 */
                if (rushjob > 0) {
                        rushjob -= 1;
                        continue;
                }
                /*
                 * If it has taken us less than a second to process the
                 * current work, then wait.  Otherwise start right over
                 * again.  We can still lose time if any single round
                 * takes more than two seconds, but it does not really
                 * matter as we are just trying to generally pace the
                 * filesystem activity.
                 */
                if (time_second == starttime)
                        tsleep(&lbolt, PPAUSE, "syncer", 0);
        }
}

/*
 * Request the syncer daemon to speed up its work.
 * We never push it to speed up more than half of its
 * normal turn time, otherwise it could take over the cpu.
 */
int
speedup_syncer()
{
        int s;

        s = splhigh();
        if (updateproc->p_wchan == &lbolt)
                setrunnable(updateproc);
        splx(s);
        if (rushjob < syncdelay / 2) {
                rushjob += 1;
                stat_rush_requests += 1;
                return (1);
        }
        return(0);
}
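
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a subsystem that wants dirty state pushed sooner, as the soft updates
 * code does per the comment above, simply calls speedup_syncer().  Each
 * successful call lets sched_sync() consume one extra queue slot without
 * sleeping, capped at syncdelay / 2 slots ahead of schedule.
 */
#if 0
        if (speedup_syncer() == 0) {
                /* already running at the maximum permitted rush rate */
        }
#endif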

/*
 * Associate a p-buffer with a vnode.
 *
 * Also sets B_PAGING flag to indicate that vnode is not fully associated
 * with the buffer.  i.e. the bp has not been linked into the vnode or
 * ref-counted.
 */
void
pbgetvp(vp, bp)
        register struct vnode *vp;
        register struct buf *bp;
{

        KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));

        bp->b_vp = vp;
        bp->b_flags |= B_PAGING;
        if (vp->v_type == VBLK || vp->v_type == VCHR)
                bp->b_dev = vp->v_rdev;
        else
                bp->b_dev = NODEV;
}

/*
 * Disassociate a p-buffer from a vnode.
 */
void
pbrelvp(bp)
        register struct buf *bp;
{

        KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));

#if !defined(MAX_PERF)
        /* XXX REMOVE ME */
        if (bp->b_vnbufs.tqe_next != NULL) {
                panic(
                    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
                    bp,
                    (int)bp->b_flags
                );
        }
#endif
        bp->b_vp = (struct vnode *) 0;
        bp->b_flags &= ~B_PAGING;
}

void
pbreassignbuf(bp, newvp)
        struct buf *bp;
        struct vnode *newvp;
{
#if !defined(MAX_PERF)
        if ((bp->b_flags & B_PAGING) == 0) {
                panic(
                    "pbreassignbuf() on non phys bp %p",
                    bp
                );
        }
#endif
        bp->b_vp = newvp;
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
void
reassignbuf(bp, newvp)
        register struct buf *bp;
        register struct vnode *newvp;
{
        struct buflists *listheadp;
        int delay;
        int s;

        if (newvp == NULL) {
                printf("reassignbuf: NULL");
                return;
        }
        ++reassignbufcalls;

#if !defined(MAX_PERF)
        /*
         * B_PAGING flagged buffers cannot be reassigned because their vp
         * is not fully linked in.
         */
        if (bp->b_flags & B_PAGING)
                panic("cannot reassign paging buffer");
#endif

        s = splbio();
        /*
         * Delete from old vnode list, if on one.
         */
        if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) {
                if (bp->b_xflags & B_VNDIRTY)
                        listheadp = &bp->b_vp->v_dirtyblkhd;
                else
                        listheadp = &bp->b_vp->v_cleanblkhd;
                TAILQ_REMOVE(listheadp, bp, b_vnbufs);
                bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN);
                if (bp->b_vp != newvp) {
                        vdrop(bp->b_vp);
                        bp->b_vp = NULL;        /* for clarification */
                }
        }
        /*
         * If dirty, put on list of dirty buffers; otherwise insert onto list
         * of clean buffers.
         */
        if (bp->b_flags & B_DELWRI) {
                struct buf *tbp;

                listheadp = &newvp->v_dirtyblkhd;
                if ((newvp->v_flag & VONWORKLST) == 0) {
                        switch (newvp->v_type) {
                        case VDIR:
                                delay = dirdelay;
                                break;
                        case VBLK:
                                if (newvp->v_specmountpoint != NULL) {
                                        delay = metadelay;
                                        break;
                                }
                                /* fall through */
                        default:
                                delay = filedelay;
                        }
                        vn_syncer_add_to_worklist(newvp, delay);
                }
                bp->b_xflags |= B_VNDIRTY;
                tbp = TAILQ_FIRST(listheadp);
                if (tbp == NULL ||
                    bp->b_lblkno == 0 ||
                    (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
                        TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
                        ++reassignbufsortgood;
                } else if (bp->b_lblkno < 0) {
                        TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
                        ++reassignbufsortgood;
                } else if (reassignbufmethod == 1) {
                        /*
                         * New sorting algorithm, only handle sequential case,
                         * otherwise guess.
                         */
                        if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
                            (tbp->b_xflags & B_VNDIRTY)) {
                                TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
                                ++reassignbufsortgood;
                        } else {
                                TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
                                ++reassignbufsortbad;
                        }
                } else {
                        /*
                         * Old sorting algorithm, scan queue and insert
                         */
                        struct buf *ttbp;
                        while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
                            (ttbp->b_lblkno < bp->b_lblkno)) {
                                ++reassignbufloops;
                                tbp = ttbp;
                        }
                        TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
                }
        } else {
                bp->b_xflags |= B_VNCLEAN;
                TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
                if ((newvp->v_flag & VONWORKLST) &&
                    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
                        newvp->v_flag &= ~VONWORKLST;
                        LIST_REMOVE(newvp, v_synclist);
                }
        }
        if (bp->b_vp != newvp) {
                bp->b_vp = newvp;
                vhold(bp->b_vp);
        }
        splx(s);
}

/*
 * Create a vnode for a block device.
 * Used for mounting the root file system.
 */
int
bdevvp(dev, vpp)
        dev_t dev;
        struct vnode **vpp;
{
        register struct vnode *vp;
        struct vnode *nvp;
        int error;

        if (dev == NODEV) {
                *vpp = NULLVP;
                return (ENXIO);
        }
        error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
        if (error) {
                *vpp = NULLVP;
                return (error);
        }
        vp = nvp;
        /* dev2udev() results in a CDEV, so we need to cheat here. */
        vp->v_type = VBLK;
        if ((nvp = checkalias2(vp, dev, (struct mount *)0)) != NULL) {
                vput(vp);
                vp = nvp;
        }
        *vpp = vp;
        return (0);
}

/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device).  If such an alias exists, deallocate
 * the existing contents and return the aliased vnode.  The
 * caller is responsible for filling it with its new contents.
 */
struct vnode *
checkalias(nvp, nvp_rdev, mp)
        register struct vnode *nvp;
        udev_t nvp_rdev;
        struct mount *mp;
{
        dev_t dev;

        if (nvp->v_type != VBLK && nvp->v_type != VCHR)
                return (NULLVP);

        dev = udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0);
        return (checkalias2(nvp, dev, mp));
}

static struct vnode *
checkalias2(nvp, dev, mp)
        register struct vnode *nvp;
        dev_t dev;
        struct mount *mp;
{
        struct proc *p = curproc;       /* XXX */
        struct vnode *vp;
        struct vnode **vpp;

        if (nvp->v_type != VBLK && nvp->v_type != VCHR)
                return (NULLVP);

        vpp = &dev->si_hlist;
loop:
        simple_lock(&spechash_slock);
        for (vp = *vpp; vp; vp = vp->v_specnext) {
                if (nvp->v_type != vp->v_type)
                        continue;
                /*
                 * Alias, but not in use, so flush it out.
                 * Only alias active device nodes.
                 * Not sure why we don't re-use this like we do below.
                 */
                simple_lock(&vp->v_interlock);
                if (vp->v_usecount == 0) {
                        simple_unlock(&spechash_slock);
                        vgonel(vp, p);
                        goto loop;
                }
                if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
                        /*
                         * It disappeared, and we may have slept.
                         * Restart from the beginning
                         */
                        simple_unlock(&spechash_slock);
                        goto loop;
                }
                break;
        }
        /*
         * It would be a lot clearer what is going on here if
         * this had been expressed as:
         * if ( vp && (vp->v_tag == VT_NULL))
         * and the clauses had been swapped.
         */
        if (vp == NULL || vp->v_tag != VT_NON) {
                struct specinfo *sinfo;

                /*
                 * Put the new vnode into the hash chain.
                 * and if there was an alias, connect them.
                 */
                nvp->v_specnext = *vpp;
                *vpp = nvp;
                nvp->v_specinfo = sinfo = dev;

                simple_unlock(&spechash_slock);
                if (vp != NULLVP) {
                        nvp->v_flag |= VALIASED;
                        vp->v_flag |= VALIASED;
                        vput(vp);
                }
                return (NULLVP);
        }
        /*
         * if ( vp && (vp->v_tag == VT_NULL))
         * We have a vnode alias, but it is trashed.
         * Make it look like it's newly allocated. (by getnewvnode())
         * The caller should use this instead.
 */
        simple_unlock(&spechash_slock);
        VOP_UNLOCK(vp, 0, p);
        simple_lock(&vp->v_interlock);
        vclean(vp, 0, p);
        vp->v_op = nvp->v_op;
        vp->v_tag = nvp->v_tag;
        nvp->v_type = VNON;
        insmntque(vp, mp);
        return (vp);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it.  The vnode lock bit is set if the
 * vnode is being eliminated in vgone.  The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, flags, p)
        register struct vnode *vp;
        int flags;
        struct proc *p;
{
        int error;

        /*
         * If the vnode is in the process of being cleaned out for
         * another use, we wait for the cleaning to finish and then
         * return failure.  Cleaning is determined by checking that
         * the VXLOCK flag is set.
         */
        if ((flags & LK_INTERLOCK) == 0) {
                simple_lock(&vp->v_interlock);
        }
        if (vp->v_flag & VXLOCK) {
                vp->v_flag |= VXWANT;
                simple_unlock(&vp->v_interlock);
                tsleep((caddr_t)vp, PINOD, "vget", 0);
                return (ENOENT);
        }

        vp->v_usecount++;

        if (VSHOULDBUSY(vp))
                vbusy(vp);
        if (flags & LK_TYPE_MASK) {
                if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
                        /*
                         * must expand vrele here because we do not want
                         * to call VOP_INACTIVE if the reference count
                         * drops back to zero since it was never really
                         * active.  We must remove it from the free list
                         * before sleeping so that multiple processes do
                         * not try to recycle it.
                         */
                        simple_lock(&vp->v_interlock);
                        vp->v_usecount--;
                        if (VSHOULDFREE(vp))
                                vfree(vp);
                        simple_unlock(&vp->v_interlock);
                }
                return (error);
        }
        simple_unlock(&vp->v_interlock);
        return (0);
}

void
vref(struct vnode *vp)
{
        simple_lock(&vp->v_interlock);
        vp->v_usecount++;
        simple_unlock(&vp->v_interlock);
}
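
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the canonical get/put pairing.  A caller that asked vget() for a lock
 * type releases with vput(), which unlocks and drops the reference in
 * one call; a caller that took no lock releases with vrele() instead.
 */
#if 0
        if (vget(vp, LK_EXCLUSIVE, p) == 0) {
                /* ... use the locked, referenced vnode ... */
                vput(vp);       /* VOP_UNLOCK + reference drop */
        }
#endif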

/*
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
        struct vnode *vp;
{
        struct proc *p = curproc;       /* XXX */

        KASSERT(vp != NULL, ("vrele: null vp"));

        simple_lock(&vp->v_interlock);

        if (vp->v_usecount > 1) {

                vp->v_usecount--;
                simple_unlock(&vp->v_interlock);

                return;
        }

        if (vp->v_usecount == 1) {

                vp->v_usecount--;
                if (VSHOULDFREE(vp))
                        vfree(vp);
                /*
                 * If we are doing a vput, the node is already locked, and we
                 * must call VOP_INACTIVE with the node locked.  So, in the
                 * case of vrele, we explicitly lock the vnode before calling
                 * VOP_INACTIVE.
                 */
                if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
                        VOP_INACTIVE(vp, p);
                }

        } else {
#ifdef DIAGNOSTIC
                vprint("vrele: negative ref count", vp);
                simple_unlock(&vp->v_interlock);
#endif
                panic("vrele: negative ref cnt");
        }
}

void
vput(vp)
        struct vnode *vp;
{
        struct proc *p = curproc;       /* XXX */

        KASSERT(vp != NULL, ("vput: null vp"));

        simple_lock(&vp->v_interlock);

        if (vp->v_usecount > 1) {

                vp->v_usecount--;
                VOP_UNLOCK(vp, LK_INTERLOCK, p);
                return;

        }

        if (vp->v_usecount == 1) {

                vp->v_usecount--;
                if (VSHOULDFREE(vp))
                        vfree(vp);
                /*
                 * If we are doing a vput, the node is already locked, and we
                 * must call VOP_INACTIVE with the node locked.  So, in the
                 * case of vrele, we explicitly lock the vnode before calling
                 * VOP_INACTIVE.
                 */
                simple_unlock(&vp->v_interlock);
                VOP_INACTIVE(vp, p);

        } else {
#ifdef DIAGNOSTIC
                vprint("vput: negative ref count", vp);
#endif
                panic("vput: negative ref cnt");
        }
}

/*
 * Somebody doesn't want the vnode recycled.
 */
void
vhold(vp)
        register struct vnode *vp;
{
        int s;

        s = splbio();
        vp->v_holdcnt++;
        if (VSHOULDBUSY(vp))
                vbusy(vp);
        splx(s);
}

/*
 * One less who cares about this vnode.
 */
void
vdrop(vp)
        register struct vnode *vp;
{
        int s;

        s = splbio();
        if (vp->v_holdcnt <= 0)
                panic("vdrop: holdcnt");
        vp->v_holdcnt--;
        if (VSHOULDFREE(vp))
                vfree(vp);
        splx(s);
}
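
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * vhold()/vdrop() bracket any period during which a vnode must not be
 * recycled even though no use reference is held; bgetvp() and brelvp()
 * above pair them around a buffer's association with its vnode.
 */
#if 0
        vhold(vp);      /* pin: VSHOULDBUSY may pull it off the free list */
        /* ... vnode is now safe from getnewvnode() recycling ... */
        vdrop(vp);      /* unpin: VSHOULDFREE may return it to the free list */
#endif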

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error).  If MNT_FORCE is specified, detach any active vnodes
 * that are found.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;         /* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

int
vflush(mp, skipvp, flags)
        struct mount *mp;
        struct vnode *skipvp;
        int flags;
{
        struct proc *p = curproc;       /* XXX */
        struct vnode *vp, *nvp;
        int busy = 0;

        simple_lock(&mntvnode_slock);
loop:
        for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
                /*
                 * Make sure this vnode wasn't reclaimed in getnewvnode().
                 * Start over if it has (it won't be on the list anymore).
                 */
                if (vp->v_mount != mp)
                        goto loop;
                nvp = vp->v_mntvnodes.le_next;
                /*
                 * Skip over a selected vnode.
                 */
                if (vp == skipvp)
                        continue;

                simple_lock(&vp->v_interlock);
                /*
                 * Skip over vnodes marked VSYSTEM.
                 */
                if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
                        simple_unlock(&vp->v_interlock);
                        continue;
                }
                /*
                 * If WRITECLOSE is set, only flush out regular file vnodes
                 * open for writing.
                 */
                if ((flags & WRITECLOSE) &&
                    (vp->v_writecount == 0 || vp->v_type != VREG)) {
                        simple_unlock(&vp->v_interlock);
                        continue;
                }

                /*
                 * With v_usecount == 0, all we need to do is clear out the
                 * vnode data structures and we are done.
                 */
                if (vp->v_usecount == 0) {
                        simple_unlock(&mntvnode_slock);
                        vgonel(vp, p);
                        simple_lock(&mntvnode_slock);
                        continue;
                }

                /*
                 * If FORCECLOSE is set, forcibly close the vnode.  For block
                 * or character devices, revert to an anonymous device.  For
                 * all other files, just kill them.
                 */
                if (flags & FORCECLOSE) {
                        simple_unlock(&mntvnode_slock);
                        if (vp->v_type != VBLK && vp->v_type != VCHR) {
                                vgonel(vp, p);
                        } else {
                                vclean(vp, 0, p);
                                vp->v_op = spec_vnodeop_p;
                                insmntque(vp, (struct mount *) 0);
                        }
                        simple_lock(&mntvnode_slock);
                        continue;
                }
#ifdef DIAGNOSTIC
                if (busyprt)
                        vprint("vflush: busy vnode", vp);
#endif
                simple_unlock(&vp->v_interlock);
                busy++;
        }
        simple_unlock(&mntvnode_slock);
        if (busy)
                return (EBUSY);
        return (0);
}
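
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * an unmount path typically flushes every vnode except those marked
 * VSYSTEM, adding FORCECLOSE only for forced unmounts.  The "force"
 * variable here is hypothetical; the exact flag combinations are
 * filesystem-specific.
 */
#if 0
        int force = 0;  /* hypothetical: set for MNT_FORCE unmounts */

        error = vflush(mp, NULLVP, SKIPSYSTEM | (force ? FORCECLOSE : 0));
        if (error)
                return (error); /* EBUSY: active vnodes remain */
#endif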

/*
 * Disassociate the underlying file system from a vnode.
 */
static void
vclean(vp, flags, p)
        struct vnode *vp;
        int flags;
        struct proc *p;
{
        int active;
        vm_object_t obj;

        /*
         * Check to see if the vnode is in use.  If so we have to reference it
         * before we clean it out so that its count cannot fall to zero and
         * generate a race against ourselves to recycle it.
         */
        if ((active = vp->v_usecount))
                vp->v_usecount++;

        /*
         * Prevent the vnode from being recycled or brought into use while we
         * clean it out.
         */
        if (vp->v_flag & VXLOCK)
                panic("vclean: deadlock");
        vp->v_flag |= VXLOCK;
        /*
         * Even if the count is zero, the VOP_INACTIVE routine may still
         * have the object locked while it cleans it out.  The VOP_LOCK
         * ensures that the VOP_INACTIVE routine is done with its work.
         * For active vnodes, it ensures that no other activity can
         * occur while the underlying object is being cleaned out.
         */
        VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);

        /*
         * Clean out any buffers associated with the vnode.
         */
        vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
        if ((obj = vp->v_object) != NULL) {
                if (obj->ref_count == 0) {
                        /*
                         * This is a normal way of shutting down the
                         * object/vnode association.
                         */
                        vm_object_terminate(obj);
                } else {
                        /*
                         * Woe to the process that tries to page now :-).
                         */
                        vm_pager_deallocate(obj);
                }
        }

        /*
         * If purging an active vnode, it must be closed and
         * deactivated before being reclaimed.  Note that the
         * VOP_INACTIVE will unlock the vnode.
         */
        if (active) {
                if (flags & DOCLOSE)
                        VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
                VOP_INACTIVE(vp, p);
        } else {
                /*
                 * Any other processes trying to obtain this lock must first
                 * wait for VXLOCK to clear, then call the new lock operation.
                 */
                VOP_UNLOCK(vp, 0, p);
        }
        /*
         * Reclaim the vnode.
         */
        if (VOP_RECLAIM(vp, p))
                panic("vclean: cannot reclaim");

        if (active)
                vrele(vp);

        cache_purge(vp);
        if (vp->v_vnlock) {
#if 0 /* This is the only place we have LK_DRAINED in the entire kernel ??? */
#ifdef DIAGNOSTIC
                if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
                        vprint("vclean: lock not drained", vp);
#endif
#endif
                FREE(vp->v_vnlock, M_VNODE);
                vp->v_vnlock = NULL;
        }

        if (VSHOULDFREE(vp))
                vfree(vp);

        /*
         * Done with purge, notify sleepers of the grim news.
         */
        vp->v_op = dead_vnodeop_p;
        vn_pollgone(vp);
        vp->v_tag = VT_NON;
        vp->v_flag &= ~VXLOCK;
        if (vp->v_flag & VXWANT) {
                vp->v_flag &= ~VXWANT;
                wakeup((caddr_t) vp);
        }
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
int
vop_revoke(ap)
        struct vop_revoke_args /* {
                struct vnode *a_vp;
                int a_flags;
        } */ *ap;
{
        struct vnode *vp, *vq;
        struct proc *p = curproc;       /* XXX */

        KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));

        vp = ap->a_vp;
        simple_lock(&vp->v_interlock);

        if (vp->v_flag & VALIASED) {
                /*
                 * If a vgone (or vclean) is already in progress,
                 * wait until it is done and return.
                 */
                if (vp->v_flag & VXLOCK) {
                        vp->v_flag |= VXWANT;
                        simple_unlock(&vp->v_interlock);
                        tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
                        return (0);
                }
                /*
                 * Ensure that vp will not be vgone'd while we
                 * are eliminating its aliases.
                 */
                vp->v_flag |= VXLOCK;
                simple_unlock(&vp->v_interlock);
                while (vp->v_flag & VALIASED) {
                        simple_lock(&spechash_slock);
                        for (vq = vp->v_hashchain; vq; vq = vq->v_specnext) {
                                if (vq->v_type != vp->v_type || vp == vq)
                                        continue;
                                simple_unlock(&spechash_slock);
                                vgone(vq);
                                break;
                        }
                        if (vq == NULLVP) {
                                simple_unlock(&spechash_slock);
                        }
                }
                /*
                 * Remove the lock so that vgone below will
                 * really eliminate the vnode after which time
                 * vgone will awaken any sleepers.
                 */
                simple_lock(&vp->v_interlock);
                vp->v_flag &= ~VXLOCK;
                if (vp->v_flag & VXWANT) {
                        vp->v_flag &= ~VXWANT;
                        wakeup(vp);
                }
        }
        vgonel(vp, p);
        return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vp, inter_lkp, p)
        struct vnode *vp;
        struct simplelock *inter_lkp;
        struct proc *p;
{

        simple_lock(&vp->v_interlock);
        if (vp->v_usecount == 0) {
                if (inter_lkp) {
                        simple_unlock(inter_lkp);
                }
                vgonel(vp, p);
                return (1);
        }
        simple_unlock(&vp->v_interlock);
        return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
        register struct vnode *vp;
{
        struct proc *p = curproc;       /* XXX */

        simple_lock(&vp->v_interlock);
        vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 */
static void
vgonel(vp, p)
        struct vnode *vp;
        struct proc *p;
{
        int s;
        struct vnode *vq;
        struct vnode *vx;

        /*
         * If a vgone (or vclean) is already in progress,
         * wait until it is done and return.
         */
        if (vp->v_flag & VXLOCK) {
                vp->v_flag |= VXWANT;
                simple_unlock(&vp->v_interlock);
                tsleep((caddr_t)vp, PINOD, "vgone", 0);
                return;
        }

        /*
         * Clean out the filesystem specific data.
 */
        vclean(vp, DOCLOSE, p);
        simple_lock(&vp->v_interlock);

        /*
         * Delete from old mount point vnode list, if on one.
         */
        if (vp->v_mount != NULL)
                insmntque(vp, (struct mount *)0);
        /*
         * If special device, remove it from special device alias list
         * if it is on one.
         */
        if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
                simple_lock(&spechash_slock);
                if (vp->v_hashchain == vp) {
                        vp->v_hashchain = vp->v_specnext;
                } else {
                        for (vq = vp->v_hashchain; vq; vq = vq->v_specnext) {
                                if (vq->v_specnext != vp)
                                        continue;
                                vq->v_specnext = vp->v_specnext;
                                break;
                        }
                        if (vq == NULL)
                                panic("missing bdev");
                }
                if (vp->v_flag & VALIASED) {
                        vx = NULL;
                        for (vq = vp->v_hashchain; vq; vq = vq->v_specnext) {
                                if (vq->v_type != vp->v_type)
                                        continue;
                                if (vx)
                                        break;
                                vx = vq;
                        }
                        if (vx == NULL)
                                panic("missing alias");
                        if (vq == NULL)
                                vx->v_flag &= ~VALIASED;
                        vp->v_flag &= ~VALIASED;
                }
                simple_unlock(&spechash_slock);
                vp->v_specinfo = NULL;
        }

        /*
         * If it is on the freelist and not already at the head,
         * move it to the head of the list.  The test of the back
         * pointer and the reference count of zero is because
         * it will be removed from the free list by getnewvnode,
         * but will not have its reference count incremented until
         * after calling vgone.  If the reference count were
         * incremented first, vgone would (incorrectly) try to
         * close the previous instance of the underlying object.
         */
        if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
                s = splbio();
                simple_lock(&vnode_free_list_slock);
                if (vp->v_flag & VFREE) {
                        TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
                } else if (vp->v_flag & VTBFREE) {
                        TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
                        vp->v_flag &= ~VTBFREE;
                        freevnodes++;
                } else
                        freevnodes++;
                vp->v_flag |= VFREE;
                TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
                simple_unlock(&vnode_free_list_slock);
                splx(s);
        }

        vp->v_type = VBAD;
        simple_unlock(&vp->v_interlock);
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev, type, vpp)
        dev_t dev;
        enum vtype type;
        struct vnode **vpp;
{
        register struct vnode *vp;
        int rc = 0;

        simple_lock(&spechash_slock);
        for (vp = dev->si_hlist; vp; vp = vp->v_specnext) {
                if (type != vp->v_type)
                        continue;
                *vpp = vp;
                rc = 1;
                break;
        }
        simple_unlock(&spechash_slock);
        return (rc);
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vp)
        register struct vnode *vp;
{
        struct vnode *vq, *vnext;
        int count;

loop:
        if ((vp->v_flag & VALIASED) == 0)
                return (vp->v_usecount);
        simple_lock(&spechash_slock);
        for (count = 0, vq = vp->v_hashchain; vq; vq = vnext) {
                vnext = vq->v_specnext;
                if (vq->v_type != vp->v_type)
                        continue;
                /*
                 * Alias, but not in use, so flush it out.
                 */
                if (vq->v_usecount == 0 && vq != vp) {
                        simple_unlock(&spechash_slock);
                        vgone(vq);
                        goto loop;
                }
                count += vq->v_usecount;
        }
        simple_unlock(&spechash_slock);
        return (count);
}

/*
 * Print out a description of a vnode.
 */
static char *typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};

void
vprint(label, vp)
        char *label;
        register struct vnode *vp;
{
        char buf[96];

        if (label != NULL)
                printf("%s: %p: ", label, (void *)vp);
        else
                printf("%p: ", (void *)vp);
        printf("type %s, usecount %d, writecount %d, refcount %d,",
            typename[vp->v_type], vp->v_usecount, vp->v_writecount,
            vp->v_holdcnt);
        buf[0] = '\0';
        if (vp->v_flag & VROOT)
                strcat(buf, "|VROOT");
        if (vp->v_flag & VTEXT)
                strcat(buf, "|VTEXT");
        if (vp->v_flag & VSYSTEM)
                strcat(buf, "|VSYSTEM");
        if (vp->v_flag & VXLOCK)
                strcat(buf, "|VXLOCK");
        if (vp->v_flag & VXWANT)
                strcat(buf, "|VXWANT");
        if (vp->v_flag & VBWAIT)
                strcat(buf, "|VBWAIT");
        if (vp->v_flag & VALIASED)
                strcat(buf, "|VALIASED");
        if (vp->v_flag & VDOOMED)
                strcat(buf, "|VDOOMED");
        if (vp->v_flag & VFREE)
                strcat(buf, "|VFREE");
        if (vp->v_flag & VOBJBUF)
                strcat(buf, "|VOBJBUF");
        if (buf[0] != '\0')
                printf(" flags (%s)", &buf[1]);
        if (vp->v_data == NULL) {
                printf("\n");
        } else {
                printf("\n\t");
                VOP_PRINT(vp);
        }
}

#ifdef DDB
#include <ddb/ddb.h>
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
{
        struct proc *p = curproc;       /* XXX */
        struct mount *mp, *nmp;
        struct vnode *vp;

        printf("Locked vnodes\n");
        simple_lock(&mountlist_slock);
        for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
                if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
                        nmp = mp->mnt_list.cqe_next;
                        continue;
                }
                for (vp = mp->mnt_vnodelist.lh_first;
                     vp != NULL;
                     vp = vp->v_mntvnodes.le_next) {
                        if (VOP_ISLOCKED(vp))
                                vprint((char *)0, vp);
                }
                simple_lock(&mountlist_slock);
                nmp = mp->mnt_list.cqe_next;
                vfs_unbusy(mp, p);
        }
        simple_unlock(&mountlist_slock);
}
#endif

/*
 * Top level filesystem related information gathering.
 */
static int      sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS);

static int
vfs_sysctl SYSCTL_HANDLER_ARGS
{
        int *name = (int *)arg1 - 1;    /* XXX */
        u_int namelen = arg2 + 1;       /* XXX */
        struct vfsconf *vfsp;

#if 1 || defined(COMPAT_PRELITE2)
        /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
        if (namelen == 1)
                return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
#endif

#ifdef notyet
        /* all sysctl names at this level are at least name and field */
        if (namelen < 2)
                return (ENOTDIR);               /* overloaded */
        if (name[0] != VFS_GENERIC) {
                for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
                        if (vfsp->vfc_typenum == name[0])
                                break;
                if (vfsp == NULL)
                        return (EOPNOTSUPP);
                return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
                    oldp, oldlenp, newp, newlen, p));
        }
#endif
        switch (name[1]) {
        case VFS_MAXTYPENUM:
                if (namelen != 2)
                        return (ENOTDIR);
                return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
        case VFS_CONF:
                if (namelen != 3)
                        return (ENOTDIR);       /* overloaded */
                for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
                        if (vfsp->vfc_typenum == name[2])
                                break;
                if (vfsp == NULL)
                        return (EOPNOTSUPP);
                return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
        }
        return (EOPNOTSUPP);
}

SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
    "Generic filesystem");
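
/*
 * Illustrative userland sketch (editor's addition, not part of the
 * original file): the vfs.generic node above answers raw sysctl(3) name
 * vectors such as { CTL_VFS, VFS_GENERIC, VFS_MAXTYPENUM }.
 */
#if 0
        int name[3] = { CTL_VFS, VFS_GENERIC, VFS_MAXTYPENUM };
        int maxtype;
        size_t len = sizeof(maxtype);

        if (sysctl(name, 3, &maxtype, &len, NULL, 0) == 0)
                printf("%d filesystem type numbers allocated\n", maxtype);
#endif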

#if 1 || defined(COMPAT_PRELITE2)

static int
sysctl_ovfs_conf SYSCTL_HANDLER_ARGS
{
        int error;
        struct vfsconf *vfsp;
        struct ovfsconf ovfs;

        for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
                ovfs.vfc_vfsops = vfsp->vfc_vfsops;     /* XXX used as flag */
                strcpy(ovfs.vfc_name, vfsp->vfc_name);
                ovfs.vfc_index = vfsp->vfc_typenum;
                ovfs.vfc_refcount = vfsp->vfc_refcount;
                ovfs.vfc_flags = vfsp->vfc_flags;
                error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
                if (error)
                        return error;
        }
        return 0;
}

#endif /* 1 || COMPAT_PRELITE2 */

#if 0
#define KINFO_VNODESLOP 10
/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
/* ARGSUSED */
static int
sysctl_vnode SYSCTL_HANDLER_ARGS
{
        struct proc *p = curproc;       /* XXX */
        struct mount *mp, *nmp;
        struct vnode *nvp, *vp;
        int error;

#define VPTRSZ  sizeof (struct vnode *)
#define VNODESZ sizeof (struct vnode)

        req->lock = 0;
        if (!req->oldptr) /* Make an estimate */
                return (SYSCTL_OUT(req, 0,
                    (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));

        simple_lock(&mountlist_slock);
        for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
                if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
                        nmp = mp->mnt_list.cqe_next;
                        continue;
                }
again:
                simple_lock(&mntvnode_slock);
                for (vp = mp->mnt_vnodelist.lh_first;
                     vp != NULL;
                     vp = nvp) {
                        /*
                         * Check that the vp is still associated with
                         * this filesystem.  RACE: could have been
                         * recycled onto the same filesystem.
                         */
                        if (vp->v_mount != mp) {
                                simple_unlock(&mntvnode_slock);
                                goto again;
                        }
                        nvp = vp->v_mntvnodes.le_next;
                        simple_unlock(&mntvnode_slock);
                        if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
                            (error = SYSCTL_OUT(req, vp, VNODESZ)))
                                return (error);
                        simple_lock(&mntvnode_slock);
                }
                simple_unlock(&mntvnode_slock);
                simple_lock(&mountlist_slock);
                nmp = mp->mnt_list.cqe_next;
                vfs_unbusy(mp, p);
        }
        simple_unlock(&mountlist_slock);

        return (0);
}
#endif

/*
 * XXX
 * Exporting the vnode list on large systems causes them to crash.
/*
 * XXX
 * Exporting the vnode list on large systems causes them to crash.
 * Exporting the vnode list on medium systems causes sysctl to coredump.
 */
#if 0
SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
	0, 0, sysctl_vnode, "S,vnode", "");
#endif

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(vp)
	struct vnode *vp;
{
	struct vnode *vq;
	int error = 0;

	if (vp->v_specmountpoint != NULL)
		return (EBUSY);
	if (vp->v_flag & VALIASED) {
		simple_lock(&spechash_slock);
		for (vq = vp->v_hashchain; vq; vq = vq->v_specnext) {
			if (vq->v_type != vp->v_type)
				continue;
			if (vq->v_specmountpoint != NULL) {
				error = EBUSY;
				break;
			}
		}
		simple_unlock(&spechash_slock);
	}
	return (error);
}

/*
 * Unmount all filesystems.  The list is traversed in reverse order
 * of mounting to avoid dependencies.
 */
void
vfs_unmountall()
{
	struct mount *mp, *nmp;
	struct proc *p;
	int error;

	if (curproc != NULL)
		p = curproc;
	else
		p = initproc;	/* XXX XXX should this be proc0? */
	/*
	 * Since this only runs when rebooting, it is not interlocked.
	 */
	for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
		nmp = mp->mnt_list.cqe_prev;
		error = dounmount(mp, MNT_FORCE, p);
		if (error) {
			printf("unmount of %s failed (",
			    mp->mnt_stat.f_mntonname);
			if (error == EBUSY)
				printf("BUSY)\n");
			else
				printf("%d)\n", error);
		}
	}
}
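/*
 * Example (illustrative sketch): before mounting a disk filesystem on
 * a device vnode, fs-specific mount code refuses devices that already
 * carry a mounted filesystem.  "devvp" is a hypothetical name for the
 * device vnode; disk-based filesystems such as FFS follow this pattern.
 *
 *	error = vfs_mountedon(devvp);
 *	if (error) {
 *		vrele(devvp);
 *		return (error);
 *	}
 *
 * vfs_mountedon() returns EBUSY both for the vnode itself and for any
 * alias of the same device found on the spec hash chain.
 */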
/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by ufs_mount() to set up the lists of export addresses.
 */
static int
vfs_hang_addrlist(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	register int i;
	struct radix_node *rn;
	struct sockaddr *saddr, *smask = 0;
	struct domain *dom;
	int error;

	if (argp->ex_addrlen == 0) {
		if (mp->mnt_flag & MNT_DEFEXPORTED)
			return (EPERM);
		np = &nep->ne_defexported;
		np->netc_exflags = argp->ex_flags;
		np->netc_anon = argp->ex_anon;
		np->netc_anon.cr_ref = 1;
		mp->mnt_flag |= MNT_DEFEXPORTED;
		return (0);
	}
	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
	bzero((caddr_t) np, i);
	saddr = (struct sockaddr *) (np + 1);
	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
		goto out;
	if (saddr->sa_len > argp->ex_addrlen)
		saddr->sa_len = argp->ex_addrlen;
	if (argp->ex_masklen) {
		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
		error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
		if (error)
			goto out;
		if (smask->sa_len > argp->ex_masklen)
			smask->sa_len = argp->ex_masklen;
	}
	i = saddr->sa_family;
	if ((rnh = nep->ne_rtable[i]) == 0) {
		/*
		 * It seems silly to initialize every AF when most are not
		 * used; do so on demand here.
		 */
		for (dom = domains; dom; dom = dom->dom_next)
			if (dom->dom_family == i && dom->dom_rtattach) {
				dom->dom_rtattach((void **) &nep->ne_rtable[i],
				    dom->dom_rtoffset);
				break;
			}
		if ((rnh = nep->ne_rtable[i]) == 0) {
			error = ENOBUFS;
			goto out;
		}
	}
	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
	    np->netc_rnodes);
	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
		error = EPERM;
		goto out;
	}
	np->netc_exflags = argp->ex_flags;
	np->netc_anon = argp->ex_anon;
	np->netc_anon.cr_ref = 1;
	return (0);
out:
	free(np, M_NETADDR);
	return (error);
}

/* ARGSUSED */
static int
vfs_free_netcred(rn, w)
	struct radix_node *rn;
	void *w;
{
	register struct radix_node_head *rnh = (struct radix_node_head *) w;

	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
	free((caddr_t) rn, M_NETADDR);
	return (0);
}

/*
 * Free the net address hash lists that are hanging off the mount points.
 */
static void
vfs_free_addrlist(nep)
	struct netexport *nep;
{
	register int i;
	register struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if ((rnh = nep->ne_rtable[i])) {
			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
			    (caddr_t) rnh);
			free((caddr_t) rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

int
vfs_export(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		if (mp->mnt_flag & MNT_EXPUBLIC) {
			vfs_setpublicfs(NULL, NULL, NULL);
			mp->mnt_flag &= ~MNT_EXPUBLIC;
		}
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if (argp->ex_flags & MNT_EXPUBLIC) {
			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
				return (error);
			mp->mnt_flag |= MNT_EXPUBLIC;
		}
		if ((error = vfs_hang_addrlist(mp, nep, argp)))
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}
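/*
 * Example (illustrative sketch): an exportable filesystem's mount
 * entry point forwards the export_args from its mount arguments to
 * vfs_export() on MNT_UPDATE.  "ump" and "args" are hypothetical names
 * for the fs-private mount structure and mount argument block; FFS
 * follows this general shape.
 *
 *	if (mp->mnt_flag & MNT_UPDATE) {
 *		if (args.fspec == 0)
 *			return (vfs_export(mp, &ump->um_export,
 *			    &args.export));
 *	}
 *
 * ex_flags = MNT_DELEXPORT tears all exports down; MNT_EXPORTED with
 * ex_addrlen == 0 installs the default (wildcard) entry handled at the
 * top of vfs_hang_addrlist().
 */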
/*
 * Set the publicly exported filesystem (WebNFS).  Currently, only
 * one public filesystem is possible in the spec (RFC 2054 and 2055).
 */
int
vfs_setpublicfs(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;
	struct vnode *rvp;
	char *cp;

	/*
	 * mp == NULL -> invalidate the current info; the FS is
	 * no longer exported.  May be called from either vfs_export
	 * or unmount, so check if it hasn't already been done.
	 */
	if (mp == NULL) {
		if (nfs_pub.np_valid) {
			nfs_pub.np_valid = 0;
			if (nfs_pub.np_index != NULL) {
				FREE(nfs_pub.np_index, M_TEMP);
				nfs_pub.np_index = NULL;
			}
		}
		return (0);
	}

	/*
	 * Only one allowed at a time.
	 */
	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
		return (EBUSY);

	/*
	 * Get real filehandle for root of exported FS.
	 */
	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;

	if ((error = VFS_ROOT(mp, &rvp)))
		return (error);

	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
		return (error);

	vput(rvp);

	/*
	 * If an indexfile was specified, pull it in.
	 */
	if (argp->ex_indexfile != NULL) {
		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
		    M_WAITOK);
		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
		    MAXNAMLEN, (size_t *)0);
		if (!error) {
			/*
			 * Check for illegal filenames.
			 */
			for (cp = nfs_pub.np_index; *cp; cp++) {
				if (*cp == '/') {
					error = EINVAL;
					break;
				}
			}
		}
		if (error) {
			FREE(nfs_pub.np_index, M_TEMP);
			return (error);
		}
	}

	nfs_pub.np_mount = mp;
	nfs_pub.np_valid = 1;
	return (0);
}

struct netcred *
vfs_export_lookup(mp, nep, nam)
	register struct mount *mp;
	struct netexport *nep;
	struct sockaddr *nam;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
		 */
		if (nam != NULL) {
			saddr = nam;
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
					(*rnh->rnh_matchaddr)((caddr_t)saddr,
					    rnh);
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}

/*
 * Perform msync on all vnodes under a mount point.
 * The mount point must be locked.
 */
void
vfs_msync(struct mount *mp, int flags) {
	struct vnode *vp, *nvp;
	struct vm_object *obj;
	int anyio, tries;

	tries = 5;
loop:
	anyio = 0;
	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {

		nvp = vp->v_mntvnodes.le_next;

		if (vp->v_mount != mp) {
			goto loop;
		}

		if (vp->v_flag & VXLOCK)	/* XXX: what if MNT_WAIT? */
			continue;

		if (flags != MNT_WAIT) {
			obj = vp->v_object;
			if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
				continue;
			if (VOP_ISLOCKED(vp))
				continue;
		}

		simple_lock(&vp->v_interlock);
		if (vp->v_object &&
		    (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
			if (!vget(vp,
			    LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ,
			    curproc)) {
				if (vp->v_object) {
					vm_object_page_clean(vp->v_object, 0, 0,
					    flags == MNT_WAIT ? OBJPC_SYNC : 0);
					anyio = 1;
				}
				vput(vp);
			}
		} else {
			simple_unlock(&vp->v_interlock);
		}
	}
	if (anyio && (--tries > 0))
		goto loop;
}
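/*
 * Example (illustrative sketch): an NFS-style server maps the source
 * address of an incoming request to per-client export credentials via
 * vfs_export_lookup().  "ump" and "nam" stand in for the fs-private
 * mount data and the client sockaddr; a NULL result is normally
 * refused.
 *
 *	np = vfs_export_lookup(mp, &ump->um_export, nam);
 *	if (np == NULL)
 *		return (EACCES);
 *
 * On success, np->netc_exflags and np->netc_anon supply the export
 * flags and anonymous credential recorded by vfs_hang_addrlist().
 */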
/*
 * Create the VM object needed for VMIO and mmap support.  This
 * is done for all VREG files in the system.  Some filesystems might
 * afford the additional metadata buffering capability of the
 * VMIO code by making the device node be VMIO mode also.
 *
 * vp must be locked when vfs_object_create is called.
 */
int
vfs_object_create(vp, p, cred)
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
{
	struct vattr vat;
	vm_object_t object;
	int error = 0;

	if (vp->v_type != VBLK && vn_canvmio(vp) == FALSE)
		return 0;

retry:
	if ((object = vp->v_object) == NULL) {
		if (vp->v_type == VREG || vp->v_type == VDIR) {
			if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
				goto retn;
			object = vnode_pager_alloc(vp, vat.va_size, 0, 0);
		} else if (bdevsw(vp->v_rdev) != NULL) {
			/*
			 * This simply allocates the biggest object possible
			 * for a VBLK vnode.  This should be fixed, but doesn't
			 * cause any problems (yet).
			 */
			object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0);
		} else {
			goto retn;
		}
		/*
		 * Dereference the reference we just created.  This assumes
		 * that the object is associated with the vp.
		 */
		object->ref_count--;
		vp->v_usecount--;
	} else {
		if (object->flags & OBJ_DEAD) {
			VOP_UNLOCK(vp, 0, p);
			tsleep(object, PVM, "vodead", 0);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			goto retry;
		}
	}

	KASSERT(vp->v_object != NULL, ("vfs_object_create: NULL object"));
	vp->v_flag |= VOBJBUF;

retn:
	return error;
}

static void
vfree(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	if (vp->v_flag & VTBFREE) {
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		vp->v_flag &= ~VTBFREE;
	}
	if (vp->v_flag & VAGE) {
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	} else {
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	}
	freevnodes++;
	simple_unlock(&vnode_free_list_slock);
	vp->v_flag &= ~VAGE;
	vp->v_flag |= VFREE;
	splx(s);
}

void
vbusy(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	if (vp->v_flag & VTBFREE) {
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		vp->v_flag &= ~VTBFREE;
	} else {
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		freevnodes--;
	}
	simple_unlock(&vnode_free_list_slock);
	vp->v_flag &= ~(VFREE|VAGE);
	splx(s);
}

/*
 * Record a process's interest in events which might happen to
 * a vnode.  Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions.  (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
int
vn_pollrecord(vp, p, events)
	struct vnode *vp;
	struct proc *p;
	short events;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_revents & events) {
		/*
		 * This leaves events we are not interested
		 * in available for the other process which
		 * presumably had requested them
		 * (otherwise they would never have been
		 * recorded).
		 */
		events &= vp->v_pollinfo.vpi_revents;
		vp->v_pollinfo.vpi_revents &= ~events;

		simple_unlock(&vp->v_pollinfo.vpi_lock);
		return events;
	}
	vp->v_pollinfo.vpi_events |= events;
	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
	simple_unlock(&vp->v_pollinfo.vpi_lock);
	return 0;
}
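/*
 * Example (illustrative sketch): a filesystem that supports polling
 * typically implements its VOP_POLL entry as a thin wrapper around
 * vn_pollrecord().  The function name "xfs_poll" is hypothetical.
 *
 *	static int
 *	xfs_poll(ap)
 *		struct vop_poll_args *ap;
 *	{
 *		return (vn_pollrecord(ap->a_vp, ap->a_p, ap->a_events));
 *	}
 *
 * A nonzero return reports events already pending; zero means the
 * process has been recorded for a later selwakeup().
 */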
/*
 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
 * it is possible for us to miss an event due to race conditions, but
 * that condition is expected to be rare, so for the moment it is the
 * preferred interface.
 */
void
vn_pollevent(vp, events)
	struct vnode *vp;
	short events;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events & events) {
		/*
		 * We clear vpi_events so that we don't
		 * call selwakeup() twice if two events are
		 * posted before the polling process(es) is
		 * awakened.  This also ensures that we take at
		 * most one selwakeup() if the polling process
		 * is no longer interested.  However, it does
		 * mean that only one event can be noticed at
		 * a time.  (Perhaps we should only clear those
		 * event bits which we note?) XXX
		 */
		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
		vp->v_pollinfo.vpi_revents |= events;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	simple_unlock(&vp->v_pollinfo.vpi_lock);
}

/*
 * Wake up anyone polling on vp because it is being revoked.
 * This depends on dead_poll() returning POLLHUP for correct
 * behavior.
 */
void
vn_pollgone(vp)
	struct vnode *vp;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events) {
		vp->v_pollinfo.vpi_events = 0;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	simple_unlock(&vp->v_pollinfo.vpi_lock);
}



/*
 * Routine to create and manage a filesystem syncer vnode.
 */
#define sync_close ((int (*) __P((struct vop_close_args *)))nullop)
static int	sync_fsync __P((struct vop_fsync_args *));
static int	sync_inactive __P((struct vop_inactive_args *));
static int	sync_reclaim __P((struct vop_reclaim_args *));
#define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock)
#define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock)
static int	sync_print __P((struct vop_print_args *));
#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)

static vop_t **sync_vnodeop_p;
static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
	{ NULL, NULL }
};
static struct vnodeopv_desc sync_vnodeop_opv_desc =
	{ &sync_vnodeop_p, sync_vnodeop_entries };

VNODEOP_SET(sync_vnodeop_opv_desc);

/*
 * Create a new filesystem syncer vnode for the specified mount point.
 */
int
vfs_allocate_syncvnode(mp)
	struct mount *mp;
{
	struct vnode *vp;
	static long start, incr, next;
	int error;

	/* Allocate a new vnode */
	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
		mp->mnt_syncer = NULL;
		return (error);
	}
	vp->v_type = VNON;
	/*
	 * Place the vnode onto the syncer worklist.  We attempt to
	 * scatter them about on the list so that they will go off
	 * at evenly distributed times even if all the filesystems
	 * are mounted at once.
	 */
	next += incr;
	if (next == 0 || next > syncer_maxdelay) {
		start /= 2;
		incr /= 2;
		if (start == 0) {
			start = syncer_maxdelay / 2;
			incr = syncer_maxdelay;
		}
		next = start;
	}
	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
	mp->mnt_syncer = vp;
	return (0);
}
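/*
 * Worked example (assuming syncer_maxdelay == 32): the static
 * start/incr/next state above hands successive mount points the
 * offsets
 *
 *	16, 8, 24, 4, 12, 20, 28, 2, 6, 10, ...
 *
 * i.e. a binary subdivision of the delay range, so syncer activity
 * for many filesystems mounted back-to-back is spread across the
 * worklist slots instead of clustering in a single second.  The
 * actual slot used is next % syncdelay.
 */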
/*
 * Do a lazy sync of the filesystem.
 */
static int
sync_fsync(ap)
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		struct ucred *a_cred;
		int a_waitfor;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *syncvp = ap->a_vp;
	struct mount *mp = syncvp->v_mount;
	struct proc *p = ap->a_p;
	int asyncflag;

	/*
	 * We only need to do something if this is a lazy evaluation.
	 */
	if (ap->a_waitfor != MNT_LAZY)
		return (0);

	/*
	 * Move ourselves to the back of the sync list.
	 */
	vn_syncer_add_to_worklist(syncvp, syncdelay);

	/*
	 * Walk the list of vnodes pushing all that are dirty and
	 * not already on the sync list.
	 */
	simple_lock(&mountlist_slock);
	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
		simple_unlock(&mountlist_slock);
		return (0);
	}
	asyncflag = mp->mnt_flag & MNT_ASYNC;
	mp->mnt_flag &= ~MNT_ASYNC;
	vfs_msync(mp, MNT_NOWAIT);
	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
	if (asyncflag)
		mp->mnt_flag |= MNT_ASYNC;
	vfs_unbusy(mp, p);
	return (0);
}

/*
 * The syncer vnode is no longer referenced.
 */
static int
sync_inactive(ap)
	struct vop_inactive_args /* {
		struct vnode *a_vp;
		struct proc *a_p;
	} */ *ap;
{

	vgone(ap->a_vp);
	return (0);
}

/*
 * The syncer vnode is no longer needed and is being decommissioned.
 *
 * Modifications to the worklist must be protected at splbio().
 */
static int
sync_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	int s;

	s = splbio();
	vp->v_mount->mnt_syncer = NULL;
	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
		vp->v_flag &= ~VONWORKLST;
	}
	splx(s);

	return (0);
}

/*
 * Print out a syncer vnode.
 */
static int
sync_print(ap)
	struct vop_print_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;

	printf("syncer vnode");
	if (vp->v_vnlock != NULL)
		lockmgr_printinfo(vp->v_vnlock);
	printf("\n");
	return (0);
}

/*
 * Extract the dev_t from a VBLK or VCHR vnode.
 */
dev_t
vn_todev(vp)
	struct vnode *vp;
{
	if (vp->v_type != VBLK && vp->v_type != VCHR)
		return (NODEV);
	return (vp->v_rdev);
}
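/*
 * Example (illustrative sketch): callers use vn_todev() when they need
 * the underlying device identity of a vnode without caring whether it
 * is a block or character special file, checking for NODEV on
 * non-device vnodes.
 *
 *	dev_t dev;
 *
 *	dev = vn_todev(vp);
 *	if (dev == NODEV)
 *		return (EINVAL);
 */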