vfs_subr.c revision 49679
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 * $Id: vfs_subr.c,v 1.216 1999/08/13 10:10:01 phk Exp $
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/socket.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/buf.h>
#include <sys/domain.h>
#include <sys/dirent.h>
#include <sys/vmmeter.h>
#include <sys/conf.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_prot.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_zone.h>
#include <sys/sysctl.h>

static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

static	struct vnode *checkalias2 __P((struct vnode *nvp, dev_t dev, struct mount *mp));
static	void	insmntque __P((struct vnode *vp, struct mount *mp));
static	void	vclean __P((struct vnode *vp, int flags, struct proc *p));
static	void	vfree __P((struct vnode *));
static	void	vgonel __P((struct vnode *vp, struct proc *p));
static	unsigned long numvnodes;
SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");

enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

static TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */
struct tobefreelist vnode_tobefree_list;	/* vnode free list */

static u_long wantfreevnodes = 25;
SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
static u_long freevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");

static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
static int reassignbufloops;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
static int reassignbufsortgood;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
static int reassignbufsortbad;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");

#ifdef ENABLE_VFS_IOOPT
int vfs_ioopt = 0;
SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
#endif

struct mntlist mountlist;	/* mounted filesystem list */
struct simplelock mountlist_slock;
struct simplelock mntvnode_slock;
int	nfs_mount_type = -1;
#ifndef NULL_SIMPLELOCKS
static struct simplelock mntid_slock;
static struct simplelock vnode_free_list_slock;
static struct simplelock spechash_slock;
#endif
struct nfs_public nfs_pub;	/* publicly exported FS */
static vm_zone_t vnode_zone;

/*
 * The workitem queue.
 */
#define SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
time_t syncdelay = 30;		/* max time to delay syncing data */
time_t filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
time_t dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
time_t metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
static int rushjob;			/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");

static int syncer_delayno = 0;
static long syncer_mask;
LIST_HEAD(synclist, vnode);
static struct synclist *syncer_workitem_pending;

int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
    &desiredvnodes, 0, "Maximum number of vnodes");

static void	vfs_free_addrlist __P((struct netexport *nep));
static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
static int	vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
				       struct export_args *argp));

/*
 * Initialize the vnode management data structures.
 */
void
vntblinit()
{

	desiredvnodes = maxproc + cnt.v_page_count / 4;
	simple_lock_init(&mntvnode_slock);
	simple_lock_init(&mntid_slock);
	simple_lock_init(&spechash_slock);
	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&vnode_tobefree_list);
	simple_lock_init(&vnode_free_list_slock);
	CIRCLEQ_INIT(&mountlist);
	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
}
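
/*
 * Editor's illustration (not in the original source): hashinit() sizes
 * its table to a power of two and hands back table-size-minus-one in the
 * mask argument, which is why syncer_maxdelay can simply be recomputed as
 * syncer_mask + 1 above and slot indexing later uses a cheap bitwise AND.
 * A minimal sketch of the same sizing idiom:
 */
#if 0
	long mask;
	struct synclist *table;

	/* 32 elements -> a 32-entry table and mask == 31. */
	table = hashinit(32, M_VNODE, &mask);
	KASSERT((mask & (mask + 1)) == 0, ("mask is not 2^n - 1"));
#endif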

/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting. Interlock is not released on failure.
 */
int
vfs_busy(mp, flags, interlkp, p)
	struct mount *mp;
	int flags;
	struct simplelock *interlkp;
	struct proc *p;
{
	int lkflags;

	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & LK_NOWAIT)
			return (ENOENT);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		if (interlkp) {
			simple_unlock(interlkp);
		}
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
		if (interlkp) {
			simple_lock(interlkp);
		}
		return (ENOENT);
	}
	lkflags = LK_SHARED | LK_NOPAUSE;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(mp, p)
	struct mount *mp;
	struct proc *p;
{

	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
}
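
/*
 * Editor's illustration (not in the original source): the usual caller
 * pattern is to busy each mount point while inspecting it, skipping any
 * that are being unmounted.  A minimal sketch, modeled on the mountlist
 * walks that appear later in this file:
 */
#if 0
	struct mount *mp, *nmp;

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = mp->mnt_list.cqe_next;	/* being unmounted */
			continue;
		}
		/* ... examine mp safely here ... */
		simple_lock(&mountlist_slock);
		nmp = mp->mnt_list.cqe_next;
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);
#endif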

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(fstypename, devname, mpp)
	char	*fstypename;
	char	*devname;
	struct mount **mpp;
{
	struct proc *p = curproc;	/* XXX */
	struct vfsconf *vfsp;
	struct mount *mp;

	if (fstypename == NULL)
		return (ENODEV);
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	bzero((char *)mp, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
	LIST_INIT(&mp->mnt_vnodelist);
	mp->mnt_vfc = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfc_refcount++;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = 0;
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Find an appropriate filesystem to use for the root. If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 */
#ifdef notdef	/* XXX JH */
int
lite2_vfs_mountroot()
{
	struct vfsconf *vfsp;
	extern int (*lite2_mountroot) __P((void));
	int error;

	if (lite2_mountroot != NULL)
		return ((*lite2_mountroot)());
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL)
			continue;
		if ((error = (*vfsp->vfc_mountroot)()) == 0)
			return (0);
		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	}
	return (ENODEV);
}
#endif

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
	    mp = mp->mnt_list.cqe_next) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			simple_unlock(&mountlist_slock);
			return (mp);
		}
	}
	simple_unlock(&mountlist_slock);
	return ((struct mount *) 0);
}

/*
 * Get a new unique fsid
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_short xxxfs_mntid;

	fsid_t tfsid;
	int mtype;

	simple_lock(&mntid_slock);
	mtype = mp->mnt_vfc->vfc_typenum;
	mp->mnt_stat.f_fsid.val[0] = makeudev(255, mtype);
	mp->mnt_stat.f_fsid.val[1] = mtype;
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.val[0] = makeudev(255, mtype + (xxxfs_mntid << 16));
	tfsid.val[1] = mtype;
	if (mountlist.cqh_first != (void *)&mountlist) {
		while (vfs_getvfs(&tfsid)) {
			xxxfs_mntid++;
			tfsid.val[0] = makeudev(255,
			    mtype + (xxxfs_mntid << 16));
		}
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	simple_unlock(&mntid_slock);
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
	register struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}
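
/*
 * Editor's illustration (not in the original source): callers that want
 * to change a single attribute null the whole vattr first, so that
 * VOP_SETATTR can tell "unset" (VNOVAL) apart from "set to zero".  A
 * minimal hypothetical sketch of a truncate-to-zero request:
 */
#if 0
	struct vattr va;

	VATTR_NULL(&va);	/* macro wrapper around vattr_null() */
	va.va_size = 0;		/* the only field actually being set */
	error = VOP_SETATTR(vp, &va, p->p_ucred, p);
#endif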

/*
 * Routines having to do with the management of the vnode table.
 */
extern vop_t **dead_vnodeop_p;

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	int s;
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *tvp, *nvp;
	vm_object_t object;
	TAILQ_HEAD(freelst, vnode) vnode_tmp_list;

	/*
	 * We take the least recently used vnode from the freelist
	 * if we can get it and it has no cached pages, and no
	 * namecache entries are relative to it.
	 * Otherwise we allocate a new vnode
	 */

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	TAILQ_INIT(&vnode_tmp_list);

	for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) {
		nvp = TAILQ_NEXT(vp, v_freelist);
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		if (vp->v_flag & VAGE) {
			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		} else {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		}
		vp->v_flag &= ~(VTBFREE|VAGE);
		vp->v_flag |= VFREE;
		if (vp->v_usecount)
			panic("tobe free vnode isn't");
		freevnodes++;
	}

	if (wantfreevnodes && freevnodes < wantfreevnodes) {
		vp = NULL;
	} else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
		/*
		 * XXX: this is only here to be backwards compatible
		 */
		vp = NULL;
	} else {
		for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) {
			nvp = TAILQ_NEXT(vp, v_freelist);
			if (!simple_lock_try(&vp->v_interlock))
				continue;
			if (vp->v_usecount)
				panic("free vnode isn't");

			object = vp->v_object;
			if (object && (object->resident_page_count || object->ref_count)) {
				printf("object inconsistent state: RPC: %d, RC: %d\n",
				    object->resident_page_count, object->ref_count);
				/* Don't recycle if it's caching some pages */
				TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
				TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist);
				continue;
			} else if (LIST_FIRST(&vp->v_cache_src)) {
				/* Don't recycle if active in the namecache */
				simple_unlock(&vp->v_interlock);
				continue;
			} else {
				break;
			}
		}
	}

	for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) {
		nvp = TAILQ_NEXT(tvp, v_freelist);
		TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist);
		simple_unlock(&tvp->v_interlock);
	}

	if (vp) {
		vp->v_flag |= VDOOMED;
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		freevnodes--;
		simple_unlock(&vnode_free_list_slock);
		cache_purge(vp);
		vp->v_lease = NULL;
		if (vp->v_type != VBAD) {
			vgonel(vp, p);
		} else {
			simple_unlock(&vp->v_interlock);
		}

#ifdef INVARIANTS
		{
			int s;

			if (vp->v_data)
				panic("cleaned vnode isn't");
			s = splbio();
			if (vp->v_numoutput)
				panic("Clean vnode has pending I/O's");
			splx(s);
		}
#endif
		vp->v_flag = 0;
		vp->v_lastr = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
		vp->v_writecount = 0;	/* XXX */
		vp->v_maxio = 0;
	} else {
		simple_unlock(&vnode_free_list_slock);
		vp = (struct vnode *) zalloc(vnode_zone);
		bzero((char *) vp, sizeof *vp);
		simple_lock_init(&vp->v_interlock);
		vp->v_dd = vp;
		cache_purge(vp);
		LIST_INIT(&vp->v_cache_src);
		TAILQ_INIT(&vp->v_cache_dst);
		numvnodes++;
	}

	TAILQ_INIT(&vp->v_cleanblkhd);
	TAILQ_INIT(&vp->v_dirtyblkhd);
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	splx(s);

	vfs_object_create(vp, p, p->p_ucred);
	return (0);
}
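
/*
 * Editor's illustration (not in the original source): a filesystem's
 * inode-allocation path is the typical caller of getnewvnode().  A
 * minimal hypothetical sketch; myfs_vnodeop_p and struct myfsnode are
 * invented names for the example:
 */
#if 0
	struct vnode *vp;
	struct myfsnode *ip;

	error = getnewvnode(VT_UFS, mp, myfs_vnodeop_p, &vp);
	if (error)
		return (error);
	MALLOC(ip, struct myfsnode *, sizeof *ip, M_TEMP, M_WAITOK);
	vp->v_data = ip;	/* hang private data off the vnode */
	vp->v_type = VREG;	/* set once the on-disk type is known */
#endif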

/*
 * Move a vnode from one mount queue to another.
 */
static void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL) {
		simple_unlock(&mntvnode_slock);
		return;
	}
	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp)) {
		vp->v_numoutput--;
		if (vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t) &vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;
	vm_object_t object;

	if (flags & V_SAVE) {
		s = splbio();
		while (vp->v_numoutput) {
			vp->v_flag |= VBWAIT;
			error = tsleep((caddr_t)&vp->v_numoutput,
			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
			if (error) {
				splx(s);
				return (error);
			}
		}
		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
			splx(s);
			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
				return (error);
			s = splbio();
			if (vp->v_numoutput > 0 ||
			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
				panic("vinvalbuf: dirty bufs");
		}
		splx(s);
	}
	s = splbio();
	for (;;) {
		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
		if (!blist)
			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
				error = BUF_TIMELOCK(bp,
				    LK_EXCLUSIVE | LK_SLEEPFAIL,
				    "vinvalbuf", slpflag, slptimeo);
				if (error == ENOLCK)
					break;
				splx(s);
				return (error);
			}
			/*
			 * XXX Since there are no node locks for NFS, I
			 * believe there is a slight chance that a delayed
			 * write will occur while sleeping just above, so
			 * check for it.  Note that vfs_bio_awrite expects
			 * buffers to reside on a queue, while VOP_BWRITE and
			 * brelse do not.
			 */
			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
			    (flags & V_SAVE)) {

				if (bp->b_vp == vp) {
					if (bp->b_flags & B_CLUSTEROK) {
						BUF_UNLOCK(bp);
						vfs_bio_awrite(bp);
					} else {
						bremfree(bp);
						bp->b_flags |= B_ASYNC;
						VOP_BWRITE(bp->b_vp, bp);
					}
				} else {
					bremfree(bp);
					(void) VOP_BWRITE(bp->b_vp, bp);
				}
				break;
			}
			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
	}

	splx(s);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	simple_lock(&vp->v_interlock);
	object = vp->v_object;
	if (object != NULL) {
		vm_object_page_remove(object, 0, 0,
		    (flags & V_SAVE) ? TRUE : FALSE);
	}
	simple_unlock(&vp->v_interlock);

	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
		panic("vinvalbuf: flush failed");
	return (0);
}
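
/*
 * Editor's illustration (not in the original source): vclean() below
 * shows the canonical call; a filesystem invalidating its cache before
 * revoking a node would do much the same.  A minimal sketch:
 */
#if 0
	/* Write dirty buffers first (V_SAVE), sleeping uninterruptibly. */
	error = vinvalbuf(vp, V_SAVE, cred, p, 0, 0);
	/* Or discard everything, e.g. on a forced unmount: */
	error = vinvalbuf(vp, 0, cred, p, 0, 0);
#endif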

/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 */
int
vtruncbuf(vp, cred, p, length, blksize)
	register struct vnode *vp;
	struct ucred *cred;
	struct proc *p;
	off_t length;
	int blksize;
{
	register struct buf *bp;
	struct buf *nbp;
	int s, anyfreed;
	int trunclbn;

	/*
	 * Round up to the *next* lbn.
	 */
	trunclbn = (length + blksize - 1) / blksize;

	s = splbio();
restart:
	anyfreed = 1;
	for (;anyfreed;) {
		anyfreed = 0;
		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp && (((nbp->b_xflags & B_VNCLEAN) == 0)||
				    (nbp->b_vp != vp) ||
				    (nbp->b_flags & B_DELWRI))) {
					goto restart;
				}
			}
		}

		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp && (((nbp->b_xflags & B_VNDIRTY) == 0)||
				    (nbp->b_vp != vp) ||
				    (nbp->b_flags & B_DELWRI) == 0)) {
					goto restart;
				}
			}
		}
	}

	if (length > 0) {
restartsync:
		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					if (bp->b_vp == vp) {
						bp->b_flags |= B_ASYNC;
					} else {
						bp->b_flags &= ~B_ASYNC;
					}
					VOP_BWRITE(bp->b_vp, bp);
				}
				goto restartsync;
			}

		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
	}

	splx(s);

	vnode_pager_setsize(vp, length);

	return (0);
}
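
/*
 * Editor's illustration (not in the original source): the round-up above
 * keeps any block that still holds bytes below "length".  For example,
 * with blksize 8192 and length 10000, trunclbn = (10000 + 8191) / 8192
 * = 2, so lbn 1 (bytes 8192-16383) survives and lbns >= 2 are freed.
 * A hypothetical truncate path would call:
 */
#if 0
	error = vtruncbuf(vp, cred, p, newsize, mp->mnt_stat.f_iosize);
#endif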

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
	int s;

	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));

	vhold(vp);
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	s = splbio();
	bp->b_xflags |= B_VNCLEAN;
	bp->b_xflags &= ~B_VNDIRTY;
	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
	splx(s);
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;
	struct buflists *listheadp;
	int s;

	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;
	s = splbio();
	if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) {
		if (bp->b_xflags & B_VNDIRTY)
			listheadp = &vp->v_dirtyblkhd;
		else
			listheadp = &vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN);
	}
	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}
	splx(s);
	bp->b_vp = (struct vnode *) 0;
	vdrop(vp);
}

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed. To realize this,
 * we append vnodes to a "workitem" queue. When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds. Thus, block devices with
 * filesystems mounted on them are delayed only about half the time
 * that file data is delayed. Similarly, directory updates are more
 * critical, so are only delayed about a third the time that file data
 * is delayed. Thus, there are SYNCER_MAXDELAY queues that are processed
 * round-robin at a rate of one each second (driven off the filesystem
 * syncer process). The syncer_delayno variable indicates the next queue
 * that is to be processed. Items that need to be processed soon are
 * placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */

/*
 * Add an item to the syncer work queue.
 */
static void
vn_syncer_add_to_worklist(struct vnode *vp, int delay)
{
	int s, slot;

	s = splbio();

	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
	}

	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	slot = (syncer_delayno + delay) & syncer_mask;

	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
	vp->v_flag |= VONWORKLST;
	splx(s);
}
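
/*
 * Editor's illustration (not in the original source): a worked example
 * of the slot arithmetic above, assuming the wheel came up with all 32
 * entries (syncer_mask == 31).  If syncer_delayno is currently 30 and a
 * vnode asks for filedelay (30 seconds), then
 *
 *	slot = (30 + 30) & 31 = 60 & 31 = 28
 *
 * i.e. the entry wraps around the ring and is reached 30 ticks from
 * now.  The same check in code form:
 */
#if 0
	int slot = (30 + 30) & 31;

	KASSERT(slot == 28, ("syncer wheel arithmetic"));
#endif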

struct proc *updateproc;
static void sched_sync __P((void));
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)

/*
 * System filesystem synchronizer daemon.
 */
void
sched_sync(void)
{
	struct synclist *slp;
	struct vnode *vp;
	long starttime;
	int s;
	struct proc *p = updateproc;

	p->p_flag |= P_BUFEXHAUST;

	for (;;) {
		starttime = time_second;

		/*
		 * Push files whose dirty time has expired.  Be careful
		 * of interrupt race on slp queue.
		 */
		s = splbio();
		slp = &syncer_workitem_pending[syncer_delayno];
		syncer_delayno += 1;
		if (syncer_delayno == syncer_maxdelay)
			syncer_delayno = 0;
		splx(s);

		while ((vp = LIST_FIRST(slp)) != NULL) {
			if (VOP_ISLOCKED(vp) == 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
				(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
				VOP_UNLOCK(vp, 0, p);
			}
			s = splbio();
			if (LIST_FIRST(slp) == vp) {
				/*
				 * Note: v_tag VT_VFS vps can remain on the
				 * worklist too with no dirty blocks, but
				 * since sync_fsync() moves it to a different
				 * slot we are safe.
				 */
				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
				    vp->v_type != VBLK)
					panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
				/*
				 * Put us back on the worklist.  The worklist
				 * routine will remove us from our current
				 * position and then add us back in at a later
				 * position.
				 */
				vn_syncer_add_to_worklist(vp, syncdelay);
			}
			splx(s);
		}

		/*
		 * Do soft update processing.
		 */
		if (bioops.io_sync)
			(*bioops.io_sync)(NULL);

		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process. A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP. Currently rushjob
		 * is used by the soft update code to speed up the filesystem
		 * syncer process when the incore state is getting so far
		 * ahead of the disk that the kernel memory pool is being
		 * threatened with exhaustion.
		 */
		if (rushjob > 0) {
			rushjob -= 1;
			continue;
		}
		/*
		 * If it has taken us less than a second to process the
		 * current work, then wait. Otherwise start right over
		 * again. We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (time_second == starttime)
			tsleep(&lbolt, PPAUSE, "syncer", 0);
	}
}

/*
 * Request the syncer daemon to speed up its work.
 * We never push it to speed up more than half of its
 * normal turn time, otherwise it could take over the cpu.
 */
int
speedup_syncer()
{
	int s;

	s = splhigh();
	if (updateproc->p_wchan == &lbolt)
		setrunnable(updateproc);
	splx(s);
	if (rushjob < syncdelay / 2) {
		rushjob += 1;
		stat_rush_requests += 1;
		return (1);
	}
	return(0);
}
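
/*
 * Editor's illustration (not in the original source): the soft updates
 * code is the expected caller, nudging the syncer whenever its
 * dependency tracking is running ahead of the disk.  A hypothetical
 * resource-shortage path might look like (names invented):
 */
#if 0
	if (num_pending_deps > max_pending_deps)
		(void) speedup_syncer();	/* drain ~1s of the wheel early */
#endif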

/*
 * Associate a p-buffer with a vnode.
 *
 * Also sets B_PAGING flag to indicate that vnode is not fully associated
 * with the buffer.  i.e. the bp has not been linked into the vnode or
 * ref-counted.
 */
void
pbgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{

	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));

	bp->b_vp = vp;
	bp->b_flags |= B_PAGING;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
}

/*
 * Disassociate a p-buffer from a vnode.
 */
void
pbrelvp(bp)
	register struct buf *bp;
{

	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));

#if !defined(MAX_PERF)
	/* XXX REMOVE ME */
	if (bp->b_vnbufs.tqe_next != NULL) {
		panic(
		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
		    bp,
		    (int)bp->b_flags
		);
	}
#endif
	bp->b_vp = (struct vnode *) 0;
	bp->b_flags &= ~B_PAGING;
}

void
pbreassignbuf(bp, newvp)
	struct buf *bp;
	struct vnode *newvp;
{
#if !defined(MAX_PERF)
	if ((bp->b_flags & B_PAGING) == 0) {
		panic(
		    "pbreassignbuf() on non phys bp %p",
		    bp
		);
	}
#endif
	bp->b_vp = newvp;
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
void
reassignbuf(bp, newvp)
	register struct buf *bp;
	register struct vnode *newvp;
{
	struct buflists *listheadp;
	int delay;
	int s;

	if (newvp == NULL) {
		printf("reassignbuf: NULL");
		return;
	}
	++reassignbufcalls;

#if !defined(MAX_PERF)
	/*
	 * B_PAGING flagged buffers cannot be reassigned because their vp
	 * is not fully linked in.
	 */
	if (bp->b_flags & B_PAGING)
		panic("cannot reassign paging buffer");
#endif

	s = splbio();
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) {
		if (bp->b_xflags & B_VNDIRTY)
			listheadp = &bp->b_vp->v_dirtyblkhd;
		else
			listheadp = &bp->b_vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN);
		if (bp->b_vp != newvp) {
			vdrop(bp->b_vp);
			bp->b_vp = NULL;	/* for clarification */
		}
	}
	/*
	 * If dirty, put on list of dirty buffers; otherwise insert onto list
	 * of clean buffers.
	 */
	if (bp->b_flags & B_DELWRI) {
		struct buf *tbp;

		listheadp = &newvp->v_dirtyblkhd;
		if ((newvp->v_flag & VONWORKLST) == 0) {
			switch (newvp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VBLK:
				if (newvp->v_specmountpoint != NULL) {
					delay = metadelay;
					break;
				}
				/* fall through */
			default:
				delay = filedelay;
			}
			vn_syncer_add_to_worklist(newvp, delay);
		}
		bp->b_xflags |= B_VNDIRTY;
		tbp = TAILQ_FIRST(listheadp);
		if (tbp == NULL ||
		    bp->b_lblkno == 0 ||
		    (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
			TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (bp->b_lblkno < 0) {
			TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (reassignbufmethod == 1) {
			/*
			 * New sorting algorithm, only handle sequential case,
			 * otherwise guess.
			 */
			if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
			    (tbp->b_xflags & B_VNDIRTY)) {
				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
				++reassignbufsortgood;
			} else {
				TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
				++reassignbufsortbad;
			}
		} else {
			/*
			 * Old sorting algorithm, scan queue and insert
			 */
			struct buf *ttbp;
			while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
			    (ttbp->b_lblkno < bp->b_lblkno)) {
				++reassignbufloops;
				tbp = ttbp;
			}
			TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
		}
	} else {
		bp->b_xflags |= B_VNCLEAN;
		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
		if ((newvp->v_flag & VONWORKLST) &&
		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
			newvp->v_flag &= ~VONWORKLST;
			LIST_REMOVE(newvp, v_synclist);
		}
	}
	if (bp->b_vp != newvp) {
		bp->b_vp = newvp;
		vhold(bp->b_vp);
	}
	splx(s);
}

/*
 * Create a vnode for a block device.
 * Used for mounting the root file system.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (ENXIO);
	}
	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	/* dev2udev() results in a CDEV, so we need to cheat here. */
	vp->v_type = VBLK;
	if ((nvp = checkalias2(vp, dev, (struct mount *)0)) != NULL) {
		vput(vp);
		vp = nvp;
	}
	*vpp = vp;
	return (0);
}
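
/*
 * Editor's illustration (not in the original source): the root mount
 * path is the classic caller, building a vnode for the boot device
 * before any filesystem is mounted.  A minimal sketch of that idiom:
 */
#if 0
	if (bdevvp(rootdev, &rootvp))
		panic("cannot set up root block device vnode");
#endif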

/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device). If such an alias exists, deallocate
 * the existing contents and return the aliased vnode. The
 * caller is responsible for filling it with its new contents.
 */
struct vnode *
checkalias(nvp, nvp_rdev, mp)
	register struct vnode *nvp;
	udev_t nvp_rdev;
	struct mount *mp;
{
	dev_t	dev;

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		return (NULLVP);

	dev = udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0);
	return (checkalias2(nvp, dev, mp));
}

static struct vnode *
checkalias2(nvp, dev, mp)
	register struct vnode *nvp;
	dev_t dev;
	struct mount *mp;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp;
	struct vnode **vpp;

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		return (NULLVP);

	vpp = &dev->si_hlist;
loop:
	simple_lock(&spechash_slock);
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (nvp->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 * Only alias active device nodes.
		 * Not sure why we don't re-use this like we do below.
		 */
		simple_lock(&vp->v_interlock);
		if (vp->v_usecount == 0) {
			simple_unlock(&spechash_slock);
			vgonel(vp, p);
			goto loop;
		}
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
			/*
			 * It disappeared, and we may have slept.
			 * Restart from the beginning.
			 */
			simple_unlock(&spechash_slock);
			goto loop;
		}
		break;
	}
	/*
	 * It would be a lot clearer what is going on here if
	 * this had been expressed as:
	 *	if ( vp && (vp->v_tag == VT_NULL))
	 * and the clauses had been swapped.
	 */
	if (vp == NULL || vp->v_tag != VT_NON) {
		struct specinfo *sinfo;

		/*
		 * Put the new vnode into the hash chain,
		 * and if there was an alias, connect them.
		 */
		nvp->v_specnext = *vpp;
		*vpp = nvp;
		nvp->v_rdev = sinfo = dev;

		simple_unlock(&spechash_slock);
		if (vp != NULLVP) {
			nvp->v_flag |= VALIASED;
			vp->v_flag |= VALIASED;
			vput(vp);
		}
		return (NULLVP);
	}
	/*
	 * if ( vp && (vp->v_tag == VT_NULL))
	 * We have a vnode alias, but it is trashed.
	 * Make it look like it's newly allocated (by getnewvnode()).
	 * The caller should use this instead.
	 */
	simple_unlock(&spechash_slock);
	VOP_UNLOCK(vp, 0, p);
	simple_lock(&vp->v_interlock);
	vclean(vp, 0, p);
	vp->v_op = nvp->v_op;
	vp->v_tag = nvp->v_tag;
	nvp->v_type = VNON;
	insmntque(vp, mp);
	return (vp);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. The vnode lock bit is set if the
 * vnode is being eliminated in vgone. The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, flags, p)
	register struct vnode *vp;
	int flags;
	struct proc *p;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure. Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */
	if ((flags & LK_INTERLOCK) == 0) {
		simple_lock(&vp->v_interlock);
	}
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vget", 0);
		return (ENOENT);
	}

	vp->v_usecount++;

	if (VSHOULDBUSY(vp))
		vbusy(vp);
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active. We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			simple_lock(&vp->v_interlock);
			vp->v_usecount--;
			if (VSHOULDFREE(vp))
				vfree(vp);
			simple_unlock(&vp->v_interlock);
		}
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

void
vref(struct vnode *vp)
{
	simple_lock(&vp->v_interlock);
	vp->v_usecount++;
	simple_unlock(&vp->v_interlock);
}
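
/*
 * Editor's illustration (not in the original source): the common pairing
 * is vget() to take a locked reference and vput() to undo both in one
 * call; vrele() is for references taken without the vnode lock.  A
 * minimal sketch:
 */
#if 0
	if (vget(vp, LK_EXCLUSIVE, p) == 0) {
		/* ... vp is referenced and exclusively locked ... */
		vput(vp);	/* unlock + release in one call */
	}
#endif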

/*
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vrele: null vp"));

	simple_lock(&vp->v_interlock);

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		simple_unlock(&vp->v_interlock);

		return;
	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
		/*
		 * If we are doing a vput, the node is already locked, and we must
		 * call VOP_INACTIVE with the node locked.  So, in the case of
		 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
		 */
		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
			VOP_INACTIVE(vp, p);
		}

	} else {
#ifdef DIAGNOSTIC
		vprint("vrele: negative ref count", vp);
		simple_unlock(&vp->v_interlock);
#endif
		panic("vrele: negative ref cnt");
	}
}

void
vput(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vput: null vp"));

	simple_lock(&vp->v_interlock);

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		VOP_UNLOCK(vp, LK_INTERLOCK, p);
		return;

	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
		/*
		 * If we are doing a vput, the node is already locked, and we must
		 * call VOP_INACTIVE with the node locked.  So, in the case of
		 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
		 */
		simple_unlock(&vp->v_interlock);
		VOP_INACTIVE(vp, p);

	} else {
#ifdef DIAGNOSTIC
		vprint("vput: negative ref count", vp);
#endif
		panic("vput: negative ref cnt");
	}
}

/*
 * Somebody doesn't want the vnode recycled.
 */
void
vhold(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	vp->v_holdcnt++;
	if (VSHOULDBUSY(vp))
		vbusy(vp);
	splx(s);
}

/*
 * One less who cares about this vnode.
 */
void
vdrop(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	if (vp->v_holdcnt <= 0)
		panic("vdrop: holdcnt");
	vp->v_holdcnt--;
	if (VSHOULDFREE(vp))
		vfree(vp);
	splx(s);
}

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If MNT_FORCE is specified, detach any active vnodes
 * that are found.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (vp->v_mount != mp)
			goto loop;
		nvp = vp->v_mntvnodes.le_next;
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;

		simple_lock(&vp->v_interlock);
		/*
		 * Skip over vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file vnodes
		 * open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}

		/*
		 * With v_usecount == 0, all we need to do is clear out the
		 * vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}

		/*
		 * If FORCECLOSE is set, forcibly close the vnode. For block
		 * or character devices, revert to an anonymous device. For
		 * all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *) 0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DIAGNOSTIC
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}
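
/*
 * Editor's illustration (not in the original source): a filesystem's
 * unmount routine is the usual caller of vflush().  A minimal
 * hypothetical sketch:
 */
#if 0
	int flags = 0;

	if (mntflags & MNT_FORCE)
		flags |= FORCECLOSE;
	if ((error = vflush(mp, NULLVP, flags)) != 0)
		return (error);	/* EBUSY unless forced */
#endif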

/*
 * Disassociate the underlying file system from a vnode.
 */
static void
vclean(vp, flags, p)
	struct vnode *vp;
	int flags;
	struct proc *p;
{
	int active;
	vm_object_t obj;

	/*
	 * Check to see if the vnode is in use. If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	if ((active = vp->v_usecount))
		vp->v_usecount++;

	/*
	 * Prevent the vnode from being recycled or brought into use while we
	 * clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock");
	vp->v_flag |= VXLOCK;
	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);

	/*
	 * Clean out any buffers associated with the vnode.
	 */
	vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
	if ((obj = vp->v_object) != NULL) {
		if (obj->ref_count == 0) {
			/*
			 * This is a normal way of shutting down the object/vnode
			 * association.
			 */
			vm_object_terminate(obj);
		} else {
			/*
			 * Woe to the process that tries to page now :-).
			 */
			vm_pager_deallocate(obj);
		}
	}

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0, p);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim");

	if (active)
		vrele(vp);

	cache_purge(vp);
	if (vp->v_vnlock) {
#if 0	/* This is the only place we have LK_DRAINED in the entire kernel ??? */
#ifdef DIAGNOSTIC
		if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
			vprint("vclean: lock not drained", vp);
#endif
#endif
		FREE(vp->v_vnlock, M_VNODE);
		vp->v_vnlock = NULL;
	}

	if (VSHOULDFREE(vp))
		vfree(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vn_pollgone(vp);
	vp->v_tag = VT_NON;
	vp->v_flag &= ~VXLOCK;
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		wakeup((caddr_t) vp);
	}
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
int
vop_revoke(ap)
	struct vop_revoke_args /* {
		struct vnode *a_vp;
		int a_flags;
	} */ *ap;
{
	struct vnode *vp, *vq;
	struct proc *p = curproc;	/* XXX */

	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));

	vp = ap->a_vp;
	simple_lock(&vp->v_interlock);

	if (vp->v_flag & VALIASED) {
		/*
		 * If a vgone (or vclean) is already in progress,
		 * wait until it is done and return.
		 */
		if (vp->v_flag & VXLOCK) {
			vp->v_flag |= VXWANT;
			simple_unlock(&vp->v_interlock);
			tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
			return (0);
		}
		/*
		 * Ensure that vp will not be vgone'd while we
		 * are eliminating its aliases.
		 */
		vp->v_flag |= VXLOCK;
		simple_unlock(&vp->v_interlock);
		while (vp->v_flag & VALIASED) {
			simple_lock(&spechash_slock);
			for (vq = vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_type != vp->v_type || vp == vq)
					continue;
				simple_unlock(&spechash_slock);
				vgone(vq);
				break;
			}
			if (vq == NULLVP) {
				simple_unlock(&spechash_slock);
			}
		}
		/*
		 * Remove the lock so that vgone below will
		 * really eliminate the vnode after which time
		 * vgone will awaken any sleepers.
		 */
		simple_lock(&vp->v_interlock);
		vp->v_flag &= ~VXLOCK;
		if (vp->v_flag & VXWANT) {
			vp->v_flag &= ~VXWANT;
			wakeup(vp);
		}
	}
	vgonel(vp, p);
	return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct simplelock *inter_lkp;
	struct proc *p;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		if (inter_lkp) {
			simple_unlock(inter_lkp);
		}
		vgonel(vp, p);
		return (1);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	simple_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 */
static void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	int s;
	struct vnode *vq;
	struct vnode *vx;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vgone", 0);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 */
	vclean(vp, DOCLOSE, p);
	simple_lock(&vp->v_interlock);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != 0) {
		simple_lock(&spechash_slock);
		if (vp->v_hashchain == vp) {
			vp->v_hashchain = vp->v_specnext;
		} else {
			for (vq = vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_specnext != vp)
					continue;
				vq->v_specnext = vp->v_specnext;
				break;
			}
			if (vq == NULL)
				panic("missing bdev");
		}
		if (vp->v_flag & VALIASED) {
			vx = NULL;
			for (vq = vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_type != vp->v_type)
					continue;
				if (vx)
					break;
				vx = vq;
			}
			if (vx == NULL)
				panic("missing alias");
			if (vq == NULL)
				vx->v_flag &= ~VALIASED;
			vp->v_flag &= ~VALIASED;
		}
		simple_unlock(&spechash_slock);
		vp->v_rdev = NULL;
	}

	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list. The test of the back
	 * pointer and the reference count of zero is because
	 * it will be removed from the free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 */
	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
		s = splbio();
		simple_lock(&vnode_free_list_slock);
		if (vp->v_flag & VFREE) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		} else if (vp->v_flag & VTBFREE) {
			TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
			vp->v_flag &= ~VTBFREE;
			freevnodes++;
		} else
			freevnodes++;
		vp->v_flag |= VFREE;
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
		splx(s);
	}

	vp->v_type = VBAD;
	simple_unlock(&vp->v_interlock);
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	register struct vnode *vp;
	int rc = 0;

	simple_lock(&spechash_slock);
	for (vp = dev->si_hlist; vp; vp = vp->v_specnext) {
		if (type != vp->v_type)
			continue;
		*vpp = vp;
		rc = 1;
		break;
	}
	simple_unlock(&spechash_slock);
	return (rc);
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vp)
	register struct vnode *vp;
{
	struct vnode *vq, *vnext;
	int count;

loop:
	if ((vp->v_flag & VALIASED) == 0)
		return (vp->v_usecount);
	simple_lock(&spechash_slock);
	for (count = 0, vq = vp->v_hashchain; vq; vq = vnext) {
		vnext = vq->v_specnext;
		if (vq->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vq->v_usecount == 0 && vq != vp) {
			simple_unlock(&spechash_slock);
			vgone(vq);
			goto loop;
		}
		count += vq->v_usecount;
	}
	simple_unlock(&spechash_slock);
	return (count);
}
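
/*
 * Editor's illustration (not in the original source): device close
 * routines use vcount() to decide whether this is the last close of the
 * device across all of its aliases.  A minimal sketch:
 */
#if 0
	if (vcount(vp) > 1)
		return (0);	/* another alias still has it open */
	/* ... otherwise really shut the device down ... */
#endif
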
/*
 * Print out a description of a vnode.
 */
static char *typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};

void
vprint(label, vp)
	char *label;
	register struct vnode *vp;
{
	char buf[96];

	if (label != NULL)
		printf("%s: %p: ", label, (void *)vp);
	else
		printf("%p: ", (void *)vp);
	printf("type %s, usecount %d, writecount %d, refcount %d,",
	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
	    vp->v_holdcnt);
	buf[0] = '\0';
	if (vp->v_flag & VROOT)
		strcat(buf, "|VROOT");
	if (vp->v_flag & VTEXT)
		strcat(buf, "|VTEXT");
	if (vp->v_flag & VSYSTEM)
		strcat(buf, "|VSYSTEM");
	if (vp->v_flag & VXLOCK)
		strcat(buf, "|VXLOCK");
	if (vp->v_flag & VXWANT)
		strcat(buf, "|VXWANT");
	if (vp->v_flag & VBWAIT)
		strcat(buf, "|VBWAIT");
	if (vp->v_flag & VALIASED)
		strcat(buf, "|VALIASED");
	if (vp->v_flag & VDOOMED)
		strcat(buf, "|VDOOMED");
	if (vp->v_flag & VFREE)
		strcat(buf, "|VFREE");
	if (vp->v_flag & VOBJBUF)
		strcat(buf, "|VOBJBUF");
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DDB
#include <ddb/ddb.h>
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
{
	struct proc *p = curproc;	/* XXX */
	struct mount *mp, *nmp;
	struct vnode *vp;

	printf("Locked vnodes\n");
	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = mp->mnt_list.cqe_next;
			continue;
		}
		for (vp = mp->mnt_vnodelist.lh_first;
		     vp != NULL;
		     vp = vp->v_mntvnodes.le_next) {
			if (VOP_ISLOCKED(vp))
				vprint((char *)0, vp);
		}
		simple_lock(&mountlist_slock);
		nmp = mp->mnt_list.cqe_next;
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);
}
#endif

/*
 * Top level filesystem related information gathering.
 */
static int	sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS);

/*
 * Top level filesystem related information gathering.
 */
static int	sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS);

static int
vfs_sysctl SYSCTL_HANDLER_ARGS
{
	int *name = (int *)arg1 - 1;	/* XXX */
	u_int namelen = arg2 + 1;	/* XXX */
	struct vfsconf *vfsp;

#if 1 || defined(COMPAT_PRELITE2)
	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
	if (namelen == 1)
		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
#endif

#ifdef notyet
	/* all sysctl names at this level are at least name and field */
	if (namelen < 2)
		return (ENOTDIR);		/* overloaded */
	if (name[0] != VFS_GENERIC) {
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[0])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
		    oldp, oldlenp, newp, newlen, p));
	}
#endif
	switch (name[1]) {
	case VFS_MAXTYPENUM:
		if (namelen != 2)
			return (ENOTDIR);
		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
	case VFS_CONF:
		if (namelen != 3)
			return (ENOTDIR);	/* overloaded */
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[2])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
	}
	return (EOPNOTSUPP);
}

SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
	"Generic filesystem");

#if 1 || defined(COMPAT_PRELITE2)

static int
sysctl_ovfs_conf SYSCTL_HANDLER_ARGS
{
	int error;
	struct vfsconf *vfsp;
	struct ovfsconf ovfs;

	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
		strcpy(ovfs.vfc_name, vfsp->vfc_name);
		ovfs.vfc_index = vfsp->vfc_typenum;
		ovfs.vfc_refcount = vfsp->vfc_refcount;
		ovfs.vfc_flags = vfsp->vfc_flags;
		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
		if (error)
			return error;
	}
	return 0;
}

#endif /* 1 || COMPAT_PRELITE2 */

#if 0
#define KINFO_VNODESLOP	10
/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
/* ARGSUSED */
static int
sysctl_vnode SYSCTL_HANDLER_ARGS
{
	struct proc *p = curproc;	/* XXX */
	struct mount *mp, *nmp;
	struct vnode *nvp, *vp;
	int error;

#define VPTRSZ	sizeof (struct vnode *)
#define VNODESZ	sizeof (struct vnode)

	req->lock = 0;
	if (!req->oldptr) /* Make an estimate */
		return (SYSCTL_OUT(req, 0,
		    (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = mp->mnt_list.cqe_next;
			continue;
		}
again:
		simple_lock(&mntvnode_slock);
		for (vp = mp->mnt_vnodelist.lh_first;
		     vp != NULL;
		     vp = nvp) {
			/*
			 * Check that the vp is still associated with
			 * this filesystem.  RACE: could have been
			 * recycled onto the same filesystem.
			 */
			if (vp->v_mount != mp) {
				simple_unlock(&mntvnode_slock);
				goto again;
			}
			nvp = vp->v_mntvnodes.le_next;
			simple_unlock(&mntvnode_slock);
			if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
			    (error = SYSCTL_OUT(req, vp, VNODESZ)))
				return (error);
			simple_lock(&mntvnode_slock);
		}
		simple_unlock(&mntvnode_slock);
		simple_lock(&mountlist_slock);
		nmp = mp->mnt_list.cqe_next;
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);

	return (0);
}
#endif
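
/*
 * Illustrative userland sketch (assumptions: sysctl(3) access and a
 * valid filesystem type number "typenum"; not part of this file): the
 * VFS_CONF case in vfs_sysctl() above answers a three-component name,
 * which is how getvfsbyname(3)-style lookups work:
 *
 *	int mib[4] = { CTL_VFS, VFS_GENERIC, VFS_CONF, typenum };
 *	struct vfsconf vfc;
 *	size_t len = sizeof(vfc);
 *
 *	if (sysctl(mib, 4, &vfc, &len, NULL, 0) == -1)
 *		err(1, "sysctl");
 */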

/*
 * XXX
 * Exporting the vnode list on large systems causes them to crash.
 * Exporting the vnode list on medium systems causes sysctl to coredump.
 */
#if 0
SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
	0, 0, sysctl_vnode, "S,vnode", "");
#endif

/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(vp)
	struct vnode *vp;
{
	struct vnode *vq;
	int error = 0;

	if (vp->v_specmountpoint != NULL)
		return (EBUSY);
	if (vp->v_flag & VALIASED) {
		simple_lock(&spechash_slock);
		for (vq = vp->v_hashchain; vq; vq = vq->v_specnext) {
			if (vq->v_type != vp->v_type)
				continue;
			if (vq->v_specmountpoint != NULL) {
				error = EBUSY;
				break;
			}
		}
		simple_unlock(&spechash_slock);
	}
	return (error);
}

/*
 * Unmount all filesystems. The list is traversed in reverse order
 * of mounting to avoid dependencies.
 */
void
vfs_unmountall()
{
	struct mount *mp, *nmp;
	struct proc *p;
	int error;

	if (curproc != NULL)
		p = curproc;
	else
		p = initproc;	/* XXX XXX should this be proc0? */
	/*
	 * Since this only runs when rebooting, it is not interlocked.
	 */
	for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
		nmp = mp->mnt_list.cqe_prev;
		error = dounmount(mp, MNT_FORCE, p);
		if (error) {
			printf("unmount of %s failed (",
			    mp->mnt_stat.f_mntonname);
			if (error == EBUSY)
				printf("BUSY)\n");
			else
				printf("%d)\n", error);
		}
	}
}

/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by ufs_mount() to set up the lists of export addresses.
 */
static int
vfs_hang_addrlist(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	register int i;
	struct radix_node *rn;
	struct sockaddr *saddr, *smask = 0;
	struct domain *dom;
	int error;

	if (argp->ex_addrlen == 0) {
		if (mp->mnt_flag & MNT_DEFEXPORTED)
			return (EPERM);
		np = &nep->ne_defexported;
		np->netc_exflags = argp->ex_flags;
		np->netc_anon = argp->ex_anon;
		np->netc_anon.cr_ref = 1;
		mp->mnt_flag |= MNT_DEFEXPORTED;
		return (0);
	}
	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
	bzero((caddr_t) np, i);
	saddr = (struct sockaddr *) (np + 1);
	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
		goto out;
	if (saddr->sa_len > argp->ex_addrlen)
		saddr->sa_len = argp->ex_addrlen;
	if (argp->ex_masklen) {
		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
		error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
		if (error)
			goto out;
		if (smask->sa_len > argp->ex_masklen)
			smask->sa_len = argp->ex_masklen;
	}
	i = saddr->sa_family;
	if ((rnh = nep->ne_rtable[i]) == 0) {
		/*
		 * Seems silly to initialize every AF when most are not used,
		 * do so on demand here
		 */
		for (dom = domains; dom; dom = dom->dom_next)
			if (dom->dom_family == i && dom->dom_rtattach) {
				dom->dom_rtattach((void **) &nep->ne_rtable[i],
				    dom->dom_rtoffset);
				break;
			}
		if ((rnh = nep->ne_rtable[i]) == 0) {
			error = ENOBUFS;
			goto out;
		}
	}
	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
	    np->netc_rnodes);
	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
		error = EPERM;
		goto out;
	}
	np->netc_exflags = argp->ex_flags;
	np->netc_anon = argp->ex_anon;
	np->netc_anon.cr_ref = 1;
	return (0);
out:
	free(np, M_NETADDR);
	return (error);
}

/* ARGSUSED */
static int
vfs_free_netcred(rn, w)
	struct radix_node *rn;
	void *w;
{
	register struct radix_node_head *rnh = (struct radix_node_head *) w;

	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
	free((caddr_t) rn, M_NETADDR);
	return (0);
}

/*
 * Free the net address hash lists that are hanging off the mount points.
 */
static void
vfs_free_addrlist(nep)
	struct netexport *nep;
{
	register int i;
	register struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if ((rnh = nep->ne_rtable[i])) {
			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
			    (caddr_t) rnh);
			free((caddr_t) rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

int
vfs_export(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		if (mp->mnt_flag & MNT_EXPUBLIC) {
			vfs_setpublicfs(NULL, NULL, NULL);
			mp->mnt_flag &= ~MNT_EXPUBLIC;
		}
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if (argp->ex_flags & MNT_EXPUBLIC) {
			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
				return (error);
			mp->mnt_flag |= MNT_EXPUBLIC;
		}
		if ((error = vfs_hang_addrlist(mp, nep, argp)))
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}
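
/*
 * Illustrative sketch (assumed caller, not part of this file): a
 * filesystem's mount routine can forward the export_args from its
 * mount arguments when an update only changes exports; "ump" is a
 * hypothetical per-mount structure with a struct netexport member.
 *
 *	if ((mp->mnt_flag & MNT_UPDATE) && args.fspec == 0)
 *		return (vfs_export(mp, &ump->um_export, &args.export));
 */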

/*
 * Set the publicly exported filesystem (WebNFS). Currently, only
 * one public filesystem is possible in the spec (RFC 2054 and 2055)
 */
int
vfs_setpublicfs(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;
	struct vnode *rvp;
	char *cp;

	/*
	 * mp == NULL -> invalidate the current info, the FS is
	 * no longer exported. May be called from either vfs_export
	 * or unmount, so check if it hasn't already been done.
	 */
	if (mp == NULL) {
		if (nfs_pub.np_valid) {
			nfs_pub.np_valid = 0;
			if (nfs_pub.np_index != NULL) {
				FREE(nfs_pub.np_index, M_TEMP);
				nfs_pub.np_index = NULL;
			}
		}
		return (0);
	}

	/*
	 * Only one allowed at a time.
	 */
	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
		return (EBUSY);

	/*
	 * Get real filehandle for root of exported FS.
	 */
	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;

	if ((error = VFS_ROOT(mp, &rvp)))
		return (error);

	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
		return (error);

	vput(rvp);

	/*
	 * If an indexfile was specified, pull it in.
	 */
	if (argp->ex_indexfile != NULL) {
		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
		    M_WAITOK);
		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
		    MAXNAMLEN, (size_t *)0);
		if (!error) {
			/*
			 * Check for illegal filenames.
			 */
			for (cp = nfs_pub.np_index; *cp; cp++) {
				if (*cp == '/') {
					error = EINVAL;
					break;
				}
			}
		}
		if (error) {
			FREE(nfs_pub.np_index, M_TEMP);
			return (error);
		}
	}

	nfs_pub.np_mount = mp;
	nfs_pub.np_valid = 1;
	return (0);
}

struct netcred *
vfs_export_lookup(mp, nep, nam)
	register struct mount *mp;
	struct netexport *nep;
	struct sockaddr *nam;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
		 */
		if (nam != NULL) {
			saddr = nam;
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
					(*rnh->rnh_matchaddr)((caddr_t)saddr,
					    rnh);
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}
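
/*
 * Illustrative sketch (assumed caller, not part of this file): an NFS
 * server path validates a client roughly like this; "nam" is the
 * client's sockaddr and "ump" a hypothetical per-mount structure.
 *
 *	struct netcred *np;
 *
 *	np = vfs_export_lookup(mp, &ump->um_export, nam);
 *	if (np == NULL)
 *		return (EACCES);
 */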

/*
 * Perform msync on all vnodes under a mount point.
 * The mount point must be locked.
 */
void
vfs_msync(struct mount *mp, int flags)
{
	struct vnode *vp, *nvp;
	struct vm_object *obj;
	int anyio, tries;

	tries = 5;
loop:
	anyio = 0;
	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {

		nvp = vp->v_mntvnodes.le_next;

		if (vp->v_mount != mp) {
			goto loop;
		}

		if (vp->v_flag & VXLOCK)	/* XXX: what if MNT_WAIT? */
			continue;

		if (flags != MNT_WAIT) {
			obj = vp->v_object;
			if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
				continue;
			if (VOP_ISLOCKED(vp))
				continue;
		}

		simple_lock(&vp->v_interlock);
		if (vp->v_object &&
		    (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
			if (!vget(vp,
			    LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ,
			    curproc)) {
				if (vp->v_object) {
					vm_object_page_clean(vp->v_object, 0, 0,
					    flags == MNT_WAIT ? OBJPC_SYNC : 0);
					anyio = 1;
				}
				vput(vp);
			}
		} else {
			simple_unlock(&vp->v_interlock);
		}
	}
	if (anyio && (--tries > 0))
		goto loop;
}

/*
 * Create the VM object needed for VMIO and mmap support.  This
 * is done for all VREG files in the system.  Some filesystems can
 * also gain the additional metadata buffering capability of the
 * VMIO code by making the device node be VMIO mode as well.
 *
 * vp must be locked when vfs_object_create is called.
 */
int
vfs_object_create(vp, p, cred)
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
{
	struct vattr vat;
	vm_object_t object;
	int error = 0;

	if (vp->v_type != VBLK && vn_canvmio(vp) == FALSE)
		return 0;

retry:
	if ((object = vp->v_object) == NULL) {
		if (vp->v_type == VREG || vp->v_type == VDIR) {
			if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
				goto retn;
			object = vnode_pager_alloc(vp, vat.va_size, 0, 0);
		} else if (devsw(vp->v_rdev) != NULL) {
			/*
			 * This simply allocates the biggest object possible
			 * for a VBLK vnode.  This should be fixed, but doesn't
			 * cause any problems (yet).
			 */
			object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0);
		} else {
			goto retn;
		}
		/*
		 * Dereference the reference we just created.  This assumes
		 * that the object is associated with the vp.
		 */
		object->ref_count--;
		vp->v_usecount--;
	} else {
		if (object->flags & OBJ_DEAD) {
			VOP_UNLOCK(vp, 0, p);
			tsleep(object, PVM, "vodead", 0);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			goto retry;
		}
	}

	KASSERT(vp->v_object != NULL, ("vfs_object_create: NULL object"));
	vp->v_flag |= VOBJBUF;

retn:
	return error;
}

static void
vfree(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	if (vp->v_flag & VTBFREE) {
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		vp->v_flag &= ~VTBFREE;
	}
	if (vp->v_flag & VAGE) {
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	} else {
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	}
	freevnodes++;
	simple_unlock(&vnode_free_list_slock);
	vp->v_flag &= ~VAGE;
	vp->v_flag |= VFREE;
	splx(s);
}

void
vbusy(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	if (vp->v_flag & VTBFREE) {
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		vp->v_flag &= ~VTBFREE;
	} else {
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		freevnodes--;
	}
	simple_unlock(&vnode_free_list_slock);
	vp->v_flag &= ~(VFREE|VAGE);
	splx(s);
}

/*
 * Record a process's interest in events which might happen to
 * a vnode.  Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions.  (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
int
vn_pollrecord(vp, p, events)
	struct vnode *vp;
	struct proc *p;
	short events;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_revents & events) {
		/*
		 * This leaves events we are not interested
		 * in available for the other process which
		 * presumably had requested them
		 * (otherwise they would never have been
		 * recorded).
		 */
		events &= vp->v_pollinfo.vpi_revents;
		vp->v_pollinfo.vpi_revents &= ~events;

		simple_unlock(&vp->v_pollinfo.vpi_lock);
		return events;
	}
	vp->v_pollinfo.vpi_events |= events;
	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
	simple_unlock(&vp->v_pollinfo.vpi_lock);
	return 0;
}
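
/*
 * Illustrative sketch (assumed filesystem code, not part of this file):
 * a filesystem's VOP_POLL routine can usually just delegate here, e.g.:
 *
 *	static int
 *	xxx_poll(ap)
 *		struct vop_poll_args *ap;
 *	{
 *
 *		return (vn_pollrecord(ap->a_vp, ap->a_p, ap->a_events));
 *	}
 */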

/*
 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
 * it is possible for us to miss an event due to race conditions, but
 * that condition is expected to be rare, so for the moment it is the
 * preferred interface.
 */
void
vn_pollevent(vp, events)
	struct vnode *vp;
	short events;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events & events) {
		/*
		 * We clear vpi_events so that we don't
		 * call selwakeup() twice if two events are
		 * posted before the polling process(es) is
		 * awakened.  This also ensures that we take at
		 * most one selwakeup() if the polling process
		 * is no longer interested.  However, it does
		 * mean that only one event can be noticed at
		 * a time.  (Perhaps we should only clear those
		 * event bits which we note?) XXX
		 */
		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
		vp->v_pollinfo.vpi_revents |= events;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	simple_unlock(&vp->v_pollinfo.vpi_lock);
}

/*
 * Wake up anyone polling on vp because it is being revoked.
 * This depends on dead_poll() returning POLLHUP for correct
 * behavior.
 */
void
vn_pollgone(vp)
	struct vnode *vp;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events) {
		vp->v_pollinfo.vpi_events = 0;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	simple_unlock(&vp->v_pollinfo.vpi_lock);
}

/*
 * Routine to create and manage a filesystem syncer vnode.
 */
#define sync_close ((int (*) __P((struct vop_close_args *)))nullop)
static int	sync_fsync __P((struct vop_fsync_args *));
static int	sync_inactive __P((struct vop_inactive_args *));
static int	sync_reclaim __P((struct vop_reclaim_args *));
#define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock)
#define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock)
static int	sync_print __P((struct vop_print_args *));
#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)

static vop_t **sync_vnodeop_p;
static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
	{ NULL, NULL }
};
static struct vnodeopv_desc sync_vnodeop_opv_desc =
	{ &sync_vnodeop_p, sync_vnodeop_entries };

VNODEOP_SET(sync_vnodeop_opv_desc);

/*
 * Create a new filesystem syncer vnode for the specified mount point.
 */
int
vfs_allocate_syncvnode(mp)
	struct mount *mp;
{
	struct vnode *vp;
	static long start, incr, next;
	int error;

	/* Allocate a new vnode */
	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
		mp->mnt_syncer = NULL;
		return (error);
	}
	vp->v_type = VNON;
	/*
	 * Place the vnode onto the syncer worklist. We attempt to
	 * scatter them about on the list so that they will go off
	 * at evenly distributed times even if all the filesystems
	 * are mounted at once.
	 */
	next += incr;
	if (next == 0 || next > syncer_maxdelay) {
		start /= 2;
		incr /= 2;
		if (start == 0) {
			start = syncer_maxdelay / 2;
			incr = syncer_maxdelay;
		}
		next = start;
	}
	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
	mp->mnt_syncer = vp;
	return (0);
}
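
/*
 * Illustrative walk-through (assumed value: syncer_maxdelay == 32):
 * successive calls above compute next == 16, 8, 24, 4, 12, 20, 28,
 * 2, 6, ..., a binary subdivision of the delay range, so syncer
 * vnodes spread across the worklist instead of clustering in one
 * slot.
 */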

/*
 * Do a lazy sync of the filesystem.
 */
static int
sync_fsync(ap)
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		struct ucred *a_cred;
		int a_waitfor;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *syncvp = ap->a_vp;
	struct mount *mp = syncvp->v_mount;
	struct proc *p = ap->a_p;
	int asyncflag;

	/*
	 * We only need to do something if this is a lazy evaluation.
	 */
	if (ap->a_waitfor != MNT_LAZY)
		return (0);

	/*
	 * Move ourselves to the back of the sync list.
	 */
	vn_syncer_add_to_worklist(syncvp, syncdelay);

	/*
	 * Walk the list of vnodes pushing all that are dirty and
	 * not already on the sync list.
	 */
	simple_lock(&mountlist_slock);
	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
		simple_unlock(&mountlist_slock);
		return (0);
	}
	asyncflag = mp->mnt_flag & MNT_ASYNC;
	mp->mnt_flag &= ~MNT_ASYNC;
	vfs_msync(mp, MNT_NOWAIT);
	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
	if (asyncflag)
		mp->mnt_flag |= MNT_ASYNC;
	vfs_unbusy(mp, p);
	return (0);
}

/*
 * The syncer vnode is no longer referenced.
 */
static int
sync_inactive(ap)
	struct vop_inactive_args /* {
		struct vnode *a_vp;
		struct proc *a_p;
	} */ *ap;
{

	vgone(ap->a_vp);
	return (0);
}

/*
 * The syncer vnode is no longer needed and is being decommissioned.
 *
 * Modifications to the worklist must be protected at splbio().
 */
static int
sync_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	int s;

	s = splbio();
	vp->v_mount->mnt_syncer = NULL;
	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
		vp->v_flag &= ~VONWORKLST;
	}
	splx(s);

	return (0);
}

/*
 * Print out a syncer vnode.
 */
static int
sync_print(ap)
	struct vop_print_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;

	printf("syncer vnode");
	if (vp->v_vnlock != NULL)
		lockmgr_printinfo(vp->v_vnlock);
	printf("\n");
	return (0);
}

/*
 * Extract the dev_t from a VBLK or VCHR vnode.
 */
dev_t
vn_todev(vp)
	struct vnode *vp;
{
	if (vp->v_type != VBLK && vp->v_type != VCHR)
		return (NODEV);
	return (vp->v_rdev);
}
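
/*
 * Illustrative sketch (assumed caller, not part of this file): code
 * holding a vnode that should name a device can fail gracefully when
 * it does not, e.g.:
 *
 *	dev_t dev;
 *
 *	if ((dev = vn_todev(vp)) == NODEV)
 *		return (ENXIO);
 */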