/* vfs_subr.c revision 56949 */
1/* 2 * Copyright (c) 1989, 1993 3 * The Regents of the University of California. All rights reserved. 4 * (c) UNIX System Laboratories, Inc. 5 * All or some portions of this file are derived from material licensed 6 * to the University of California by American Telephone and Telegraph 7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 8 * the permission of UNIX System Laboratories, Inc. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. All advertising materials mentioning features or use of this software 19 * must display the following acknowledgement: 20 * This product includes software developed by the University of 21 * California, Berkeley and its contributors. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 39 * $FreeBSD: head/sys/kern/vfs_subr.c 56949 2000-02-02 07:07:17Z rwatson $ 40 */ 41 42/* 43 * External virtual filesystem routines 44 */ 45#include "opt_ddb.h" 46 47#include <sys/param.h> 48#include <sys/systm.h> 49#include <sys/buf.h> 50#include <sys/conf.h> 51#include <sys/dirent.h> 52#include <sys/domain.h> 53#include <sys/eventhandler.h> 54#include <sys/fcntl.h> 55#include <sys/kernel.h> 56#include <sys/kthread.h> 57#include <sys/malloc.h> 58#include <sys/mount.h> 59#include <sys/namei.h> 60#include <sys/proc.h> 61#include <sys/reboot.h> 62#include <sys/socket.h> 63#include <sys/stat.h> 64#include <sys/sysctl.h> 65#include <sys/vmmeter.h> 66#include <sys/vnode.h> 67 68#include <machine/limits.h> 69 70#include <vm/vm.h> 71#include <vm/vm_object.h> 72#include <vm/vm_extern.h> 73#include <vm/pmap.h> 74#include <vm/vm_map.h> 75#include <vm/vm_page.h> 76#include <vm/vm_pager.h> 77#include <vm/vnode_pager.h> 78#include <vm/vm_zone.h> 79 80static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); 81 82static void insmntque __P((struct vnode *vp, struct mount *mp)); 83static void vclean __P((struct vnode *vp, int flags, struct proc *p)); 84static void vfree __P((struct vnode *)); 85static unsigned long numvnodes; 86SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); 87 88enum vtype iftovt_tab[16] = { 89 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, 
VNON, 90 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, 91}; 92int vttoif_tab[9] = { 93 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, 94 S_IFSOCK, S_IFIFO, S_IFMT, 95}; 96 97static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ 98struct tobefreelist vnode_tobefree_list; /* vnode free list */ 99 100static u_long wantfreevnodes = 25; 101SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); 102static u_long freevnodes = 0; 103SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, ""); 104 105static int reassignbufcalls; 106SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, ""); 107static int reassignbufloops; 108SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, ""); 109static int reassignbufsortgood; 110SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, ""); 111static int reassignbufsortbad; 112SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, ""); 113static int reassignbufmethod = 1; 114SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, ""); 115 116#ifdef ENABLE_VFS_IOOPT 117int vfs_ioopt = 0; 118SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, ""); 119#endif 120 121struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* mounted fs */ 122struct simplelock mountlist_slock; 123struct simplelock mntvnode_slock; 124int nfs_mount_type = -1; 125#ifndef NULL_SIMPLELOCKS 126static struct simplelock mntid_slock; 127static struct simplelock vnode_free_list_slock; 128static struct simplelock spechash_slock; 129#endif 130struct nfs_public nfs_pub; /* publicly exported FS */ 131static vm_zone_t vnode_zone; 132 133/* 134 * The workitem queue. 
 */
#define SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
time_t syncdelay = 30;		/* max time to delay syncing data */
time_t filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
time_t dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
time_t metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
static int rushjob;			/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");

/*
 * Ring of SYNCER_MAXDELAY work queues; syncer_delayno is the queue
 * currently being drained and syncer_mask is the (power-of-2 - 1)
 * modulus produced by hashinit() in vntblinit() below.
 */
static int syncer_delayno = 0;
static long syncer_mask;
LIST_HEAD(synclist, vnode);
static struct synclist *syncer_workitem_pending;

int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
    &desiredvnodes, 0, "Maximum number of vnodes");

static void	vfs_free_addrlist __P((struct netexport *nep));
static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
static int	vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
				       struct export_args *argp));

/*
 * Initialize the vnode management data structures.
 *
 * Called once at boot.  Sizes the desired vnode count from maxproc and
 * the page count, initializes the global simple locks and free lists,
 * creates the vnode zone, and sets up the syncer work queue hash.
 */
void
vntblinit()
{

	desiredvnodes = maxproc + cnt.v_page_count / 4;
	simple_lock_init(&mntvnode_slock);
	simple_lock_init(&mntid_slock);
	simple_lock_init(&spechash_slock);
	TAILQ_INIT(&vnode_free_list);
	TAILQ_INIT(&vnode_tobefree_list);
	simple_lock_init(&vnode_free_list_slock);
	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	/* hashinit() rounds down to a power of 2; resync maxdelay to it. */
	syncer_maxdelay = syncer_mask + 1;
}

/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting. Interlock is not released on failure.
 *
 * Returns 0 with a shared lock on mnt_lock held, or ENOENT if the
 * filesystem is being (or has been) unmounted.  When interlkp is
 * non-NULL it is dropped while sleeping and re-taken before return
 * (via LK_INTERLOCK on the success path).
 */
int
vfs_busy(mp, flags, interlkp, p)
	struct mount *mp;
	int flags;
	struct simplelock *interlkp;
	struct proc *p;
{
	int lkflags;

	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & LK_NOWAIT)
			return (ENOENT);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		if (interlkp) {
			simple_unlock(interlkp);
		}
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
		if (interlkp) {
			simple_lock(interlkp);
		}
		return (ENOENT);
	}
	lkflags = LK_SHARED | LK_NOPAUSE;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 *
 * Releases the shared mnt_lock taken by vfs_busy().
 */
void
vfs_unbusy(mp, p)
	struct mount *mp;
	struct proc *p;
{

	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
}

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(fstypename, devname, mpp)
	char	*fstypename;
	char	*devname;
	struct mount **mpp;
{
	struct proc *p = curproc;	/* XXX */
	struct vfsconf *vfsp;
	struct mount *mp;

	/* Returns ENODEV if the named filesystem type is not configured. */
	if (fstypename == NULL)
		return (ENODEV);
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	bzero((char *)mp, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
	/* Hand the mount back busied; the caller unbusies after mountroot. */
	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
	LIST_INIT(&mp->mnt_vnodelist);
	mp->mnt_vfc = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY;	/* root is mounted read-only first */
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfc_refcount++;
	mp->mnt_iosize_max = DFLTPHYS;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = 0;
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Find an appropriate filesystem to use for the root. If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 */
#ifdef notdef /* XXX JH */
int
lite2_vfs_mountroot()
{
	struct vfsconf *vfsp;
	extern int (*lite2_mountroot) __P((void));
	int error;

	if (lite2_mountroot != NULL)
		return ((*lite2_mountroot)());
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL)
			continue;
		if ((error = (*vfsp->vfc_mountroot)()) == 0)
			return (0);
		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	}
	return (ENODEV);
}
#endif

/*
 * Lookup a mount point by filesystem identifier.
 *
 * Scans the global mount list under mountlist_slock; returns the
 * matching mount or NULL if no mounted filesystem has this fsid.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	simple_lock(&mountlist_slock);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			simple_unlock(&mountlist_slock);
			return (mp);
		}
	}
	simple_unlock(&mountlist_slock);
	return ((struct mount *) 0);
}

/*
 * Get a new unique fsid
 *
 * Keep in mind that several mounts may be running in parallel,
 * so always increment mntid_base even if lower numbers are available.
 */

static u_short mntid_base;

void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	fsid_t tfsid;
	int mtype;

	/* mntid_slock serializes mntid_base across concurrent mounts. */
	simple_lock(&mntid_slock);

	mtype = mp->mnt_vfc->vfc_typenum;
	for (;;) {
		/*
		 * Candidate fsid: a fake device number encoding the fs type
		 * and a rolling counter.  Loop until vfs_getvfs() confirms
		 * no existing mount already uses it.
		 */
		tfsid.val[0] = makeudev(255, mtype + (mntid_base << 16));
		tfsid.val[1] = mtype;
		++mntid_base;
		if (vfs_getvfs(&tfsid) == NULL)
			break;
	}

	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];

	simple_unlock(&mntid_slock);
}

/*
 * Knob to control the precision of file timestamps:
 *
 * 0 = seconds only; nanoseconds zeroed.
 * 1 = seconds and nanoseconds, accurate within 1/HZ.
 * 2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_SEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "");

/*
 * Get a current timestamp.
 *
 * Fills *tsp according to the vfs.timestamp_precision sysctl above;
 * unknown values fall through to full nanotime() precision.
 */
void
vfs_timestamp(tsp)
	struct timespec *tsp;
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 *
 * Marks every attribute "unspecified" so that a subsequent VOP_SETATTR
 * only acts on fields the caller then fills in.  va_vaflags is cleared
 * to 0 rather than VNOVAL.
 */
void
vattr_null(vap)
	register struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern vop_t **dead_vnodeop_p;

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	int s;
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *tvp, *nvp;
	vm_object_t object;
	TAILQ_HEAD(freelst, vnode) vnode_tmp_list;

	/*
	 * We take the least recently used vnode from the freelist
	 * if we can get it and it has no cached pages, and no
	 * namecache entries are relative to it.
	 * Otherwise we allocate a new vnode
	 */

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	TAILQ_INIT(&vnode_tmp_list);

	/*
	 * First migrate everything parked on the to-be-freed list onto
	 * the real free list (VAGE vnodes go to the head so they are
	 * recycled first), converting VTBFREE to VFREE.
	 */
	for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) {
		nvp = TAILQ_NEXT(vp, v_freelist);
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		if (vp->v_flag & VAGE) {
			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		} else {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		}
		vp->v_flag &= ~(VTBFREE|VAGE);
		vp->v_flag |= VFREE;
		if (vp->v_usecount)
			panic("tobe free vnode isn't");
		freevnodes++;
	}

	if (wantfreevnodes && freevnodes < wantfreevnodes) {
		vp = NULL;
	} else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
		/*
		 * XXX: this is only here to be backwards compatible
		 */
		vp = NULL;
	} else {
		/*
		 * Hunt for a recyclable vnode: skip any whose interlock we
		 * cannot take immediately, park vnodes with resident pages
		 * on a temporary list, and skip namecache-active ones.
		 */
		for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) {
			nvp = TAILQ_NEXT(vp, v_freelist);
			if (!simple_lock_try(&vp->v_interlock))
				continue;
			if (vp->v_usecount)
				panic("free vnode isn't");

			object = vp->v_object;
			if (object && (object->resident_page_count || object->ref_count)) {
				/* (sic) historical message spelling */
				printf("object inconsistant state: RPC: %d, RC: %d\n",
					object->resident_page_count, object->ref_count);
				/* Don't recycle if it's caching some pages */
				TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
				TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist);
				continue;
			} else if (LIST_FIRST(&vp->v_cache_src)) {
				/* Don't recycle if active in the namecache */
				simple_unlock(&vp->v_interlock);
				continue;
			} else {
				break;
			}
		}
	}

	/*
	 * Return the page-caching vnodes we set aside to the tail of the
	 * free list and drop their interlocks.
	 */
	for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) {
		nvp = TAILQ_NEXT(tvp, v_freelist);
		TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist);
		simple_unlock(&tvp->v_interlock);
	}

	if (vp) {
		/* Recycle the chosen vnode: vgonel() disassociates it. */
		vp->v_flag |= VDOOMED;
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		freevnodes--;
		simple_unlock(&vnode_free_list_slock);
		cache_purge(vp);
		vp->v_lease = NULL;
		if (vp->v_type != VBAD) {
			vgonel(vp, p);	/* consumes the interlock */
		} else {
			simple_unlock(&vp->v_interlock);
		}

#ifdef INVARIANTS
		{
			int s;

			if (vp->v_data)
				panic("cleaned vnode isn't");
			s = splbio();
			if (vp->v_numoutput)
				panic("Clean vnode has pending I/O's");
			splx(s);
		}
#endif
		vp->v_flag = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
		vp->v_writecount = 0;	/* XXX */
	} else {
		/* Nothing recyclable: allocate a fresh vnode from the zone. */
		simple_unlock(&vnode_free_list_slock);
		vp = (struct vnode *) zalloc(vnode_zone);
		bzero((char *) vp, sizeof *vp);
		simple_lock_init(&vp->v_interlock);
		vp->v_dd = vp;
		cache_purge(vp);
		LIST_INIT(&vp->v_cache_src);
		TAILQ_INIT(&vp->v_cache_dst);
		numvnodes++;
	}

	TAILQ_INIT(&vp->v_cleanblkhd);
	TAILQ_INIT(&vp->v_dirtyblkhd);
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	splx(s);

	vfs_object_create(vp, p, p->p_ucred);
	return (0);
}

/*
 * Move a vnode from one mount queue to another.
 *
 * A NULL mp just removes the vnode from its current mount's list.
 * Protected by mntvnode_slock.
 */
static void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL) {
		simple_unlock(&mntvnode_slock);
		return;
	}
	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 *
 * Called on write completion; wakes any thread sleeping in VBWAIT
 * (e.g. vinvalbuf/vtruncbuf) once the vnode's I/O count drains to 0.
 */
void
vwakeup(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp)) {
		vp->v_numoutput--;
		if (vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t) &vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 *
 * With V_SAVE, dirty data is first synced to disk (VOP_FSYNC) before
 * the buffers are invalidated; otherwise everything is discarded.
 * Returns 0 or a tsleep/fsync error.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;
	vm_object_t object;

	if (flags & V_SAVE) {
		s = splbio();
		/* Wait for in-flight writes, then push remaining dirty bufs. */
		while (vp->v_numoutput) {
			vp->v_flag |= VBWAIT;
			error = tsleep((caddr_t)&vp->v_numoutput,
			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
			if (error) {
				splx(s);
				return (error);
			}
		}
		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
			splx(s);
			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
				return (error);
			s = splbio();
			if (vp->v_numoutput > 0 ||
			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
				panic("vinvalbuf: dirty bufs");
		}
		splx(s);
	}
	s = splbio();
	for (;;) {
		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
		if (!blist)
			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
				error = BUF_TIMELOCK(bp,
				    LK_EXCLUSIVE | LK_SLEEPFAIL,
				    "vinvalbuf", slpflag, slptimeo);
				/* ENOLCK: buffer went away; rescan the list */
				if (error == ENOLCK)
					break;
				splx(s);
				return (error);
			}
			/*
			 * XXX Since there are no node locks for NFS, I
			 * believe there is a slight chance that a delayed
			 * write will occur while sleeping just above, so
			 * check for it.  Note that vfs_bio_awrite expects
			 * buffers to reside on a queue, while VOP_BWRITE and
			 * brelse do not.
			 */
			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
			    (flags & V_SAVE)) {

				if (bp->b_vp == vp) {
					if (bp->b_flags & B_CLUSTEROK) {
						BUF_UNLOCK(bp);
						vfs_bio_awrite(bp);
					} else {
						bremfree(bp);
						bp->b_flags |= B_ASYNC;
						VOP_BWRITE(bp->b_vp, bp);
					}
				} else {
					bremfree(bp);
					(void) VOP_BWRITE(bp->b_vp, bp);
				}
				break;
			}
			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
		}
	}

	/* Let any writes started above drain before declaring success. */
	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
	}

	splx(s);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	simple_lock(&vp->v_interlock);
	object = vp->v_object;
	if (object != NULL) {
		vm_object_page_remove(object, 0, 0,
		    (flags & V_SAVE) ? TRUE : FALSE);
	}
	simple_unlock(&vp->v_interlock);

	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
		panic("vinvalbuf: flush failed");
	return (0);
}

/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 */
int
vtruncbuf(vp, cred, p, length, blksize)
	register struct vnode *vp;
	struct ucred *cred;
	struct proc *p;
	off_t length;
	int blksize;
{
	register struct buf *bp;
	struct buf *nbp;
	int s, anyfreed;
	int trunclbn;

	/*
	 * Round up to the *next* lbn.
	 */
	trunclbn = (length + blksize - 1) / blksize;

	s = splbio();
restart:
	/*
	 * Repeatedly sweep the clean and dirty queues, invalidating every
	 * buffer at or beyond trunclbn.  Any sleep for a buffer lock, or
	 * any change to our cached "next" pointer's queue membership,
	 * forces a full restart since the lists may have been modified.
	 */
	anyfreed = 1;
	for (;anyfreed;) {
		anyfreed = 0;
		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp &&
				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
				    (nbp->b_vp != vp) ||
				    (nbp->b_flags & B_DELWRI))) {
					goto restart;
				}
			}
		}

		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp &&
				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
				    (nbp->b_vp != vp) ||
				    (nbp->b_flags & B_DELWRI) == 0)) {
					goto restart;
				}
			}
		}
	}

	if (length > 0) {
restartsync:
		/*
		 * Push out any remaining dirty metadata buffers (negative
		 * lblkno = indirect blocks) so the truncation is durable.
		 */
		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					if (bp->b_vp == vp) {
						bp->b_flags |= B_ASYNC;
					} else {
						bp->b_flags &= ~B_ASYNC;
					}
					VOP_BWRITE(bp->b_vp, bp);
				}
				goto restartsync;
			}

		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
	}

	splx(s);

	vnode_pager_setsize(vp, length);

	return (0);
}

/*
 * Associate a buffer with a vnode.
 *
 * Takes a hold reference on the vnode and places the buffer on the
 * vnode's clean queue.  Buffer must not already have a vnode.
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
	int s;

	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));

	vhold(vp);
	bp->b_vp = vp;
	bp->b_dev = vn_todev(vp);
	/*
	 * Insert onto list for new vnode.
	 */
	s = splbio();
	bp->b_xflags |= BX_VNCLEAN;
	bp->b_xflags &= ~BX_VNDIRTY;
	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
	splx(s);
}

/*
 * Disassociate a buffer from a vnode.
 *
 * Removes the buffer from whichever queue it is on, drops the vnode
 * off the syncer worklist if it no longer has dirty buffers, and
 * releases the hold reference taken by bgetvp().
 */
void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;
	struct buflists *listheadp;
	int s;

	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;
	s = splbio();
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			listheadp = &vp->v_dirtyblkhd;
		else
			listheadp = &vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
	}
	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}
	splx(s);
	bp->b_vp = (struct vnode *) 0;
	vdrop(vp);
}

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed. To realize this,
 * we append vnodes to a "workitem" queue. When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds. Thus, mounted on block devices
 * are delayed only about a half the time that file data is delayed.
 * Similarly, directory updates are more critical, so are only delayed
 * about a third the time that file data is delayed. Thus, there are
 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 * one each second (driven off the filesystem syncer process). The
 * syncer_delayno variable indicates the next queue that is to be processed.
 * Items that need to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */

/*
 * Add an item to the syncer work queue.
 *
 * Re-queues the vnode (removing any previous worklist entry) into the
 * slot "delay" seconds ahead of the current drain position, clamped to
 * syncer_maxdelay - 2.
 */
static void
vn_syncer_add_to_worklist(struct vnode *vp, int delay)
{
	int s, slot;

	s = splbio();

	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
	}

	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	slot = (syncer_delayno + delay) & syncer_mask;

	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
	vp->v_flag |= VONWORKLST;
	splx(s);
}

struct  proc *updateproc;
static void sched_sync __P((void));
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)

/*
 * System filesystem synchronizer daemon.
 */
void
sched_sync(void)
{
	struct synclist *slp;
	struct vnode *vp;
	long starttime;
	int s;
	struct proc *p = updateproc;

	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, p,
	    SHUTDOWN_PRI_LAST);

	/* Main loop: drain one work queue slot per second, forever. */
	for (;;) {
		kproc_suspend_loop(p);

		starttime = time_second;

		/*
		 * Push files whose dirty time has expired. Be careful
		 * of interrupt race on slp queue.
		 */
		s = splbio();
		slp = &syncer_workitem_pending[syncer_delayno];
		syncer_delayno += 1;
		if (syncer_delayno == syncer_maxdelay)
			syncer_delayno = 0;
		splx(s);

		while ((vp = LIST_FIRST(slp)) != NULL) {
			if (VOP_ISLOCKED(vp, NULL) == 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
				(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
				VOP_UNLOCK(vp, 0, p);
			}
			s = splbio();
			if (LIST_FIRST(slp) == vp) {
				/*
				 * Note: v_tag VT_VFS vps can remain on the
				 * worklist too with no dirty blocks, but
				 * since sync_fsync() moves it to a different
				 * slot we are safe.
				 */
				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
				    !vn_isdisk(vp, NULL))
					panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
				/*
				 * Put us back on the worklist.  The worklist
				 * routine will remove us from our current
				 * position and then add us back in at a later
				 * position.
				 */
				vn_syncer_add_to_worklist(vp, syncdelay);
			}
			splx(s);
		}

		/*
		 * Do soft update processing.
		 */
		if (bioops.io_sync)
			(*bioops.io_sync)(NULL);

		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process. A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP. Currently rushjob
		 * is used by the soft update code to speed up the filesystem
		 * syncer process when the incore state is getting so far
		 * ahead of the disk that the kernel memory pool is being
		 * threatened with exhaustion.
		 */
		if (rushjob > 0) {
			rushjob -= 1;
			continue;
		}
		/*
		 * If it has taken us less than a second to process the
		 * current work, then wait. Otherwise start right over
		 * again. We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (time_second == starttime)
			tsleep(&lbolt, PPAUSE, "syncer", 0);
	}
}

/*
 * Request the syncer daemon to speed up its work.
 * We never push it to speed up more than half of its
 * normal turn time, otherwise it could take over the cpu.
 *
 * Returns 1 if a speedup was scheduled, 0 if rushjob is already at
 * its cap.  Also wakes the syncer if it is sleeping on lbolt.
 */
int
speedup_syncer()
{
	int s;

	s = splhigh();
	if (updateproc->p_wchan == &lbolt)
		setrunnable(updateproc);
	splx(s);
	if (rushjob < syncdelay / 2) {
		rushjob += 1;
		stat_rush_requests += 1;
		return (1);
	}
	return(0);
}

/*
 * Associate a p-buffer with a vnode.
 *
 * Also sets B_PAGING flag to indicate that vnode is not fully associated
 * with the buffer.  i.e. the bp has not been linked into the vnode or
 * ref-counted.
 */
void
pbgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{

	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));

	bp->b_vp = vp;
	bp->b_flags |= B_PAGING;
	bp->b_dev = vn_todev(vp);
}

/*
 * Disassociate a p-buffer from a vnode.
 */
void
pbrelvp(bp)
	register struct buf *bp;
{

	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));

#if !defined(MAX_PERF)
	/* XXX REMOVE ME */
	if (bp->b_vnbufs.tqe_next != NULL) {
		panic(
		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
		    bp,
		    (int)bp->b_flags
		);
	}
#endif
	bp->b_vp = (struct vnode *) 0;
	bp->b_flags &= ~B_PAGING;
}

/*
 * Reassign a paging (B_PAGING) buffer to a new vnode.  Paging buffers
 * carry no queue linkage or vnode refcount, so this is a bare pointer
 * swap; non-paging buffers must use reassignbuf() instead.
 */
void
pbreassignbuf(bp, newvp)
	struct buf *bp;
	struct vnode *newvp;
{
#if !defined(MAX_PERF)
	if ((bp->b_flags & B_PAGING) == 0) {
		panic(
		    "pbreassignbuf() on non phys bp %p",
		    bp
		);
	}
#endif
	bp->b_vp = newvp;
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 *
 * Also moves the buffer between the clean and dirty queues as its
 * B_DELWRI state dictates, keeping the dirty queue sorted by lblkno
 * (data blocks ascending, then negative-lblkno metadata at the tail)
 * and maintaining the vnode's syncer-worklist membership.
 */
void
reassignbuf(bp, newvp)
	register struct buf *bp;
	register struct vnode *newvp;
{
	struct buflists *listheadp;
	int delay;
	int s;

	if (newvp == NULL) {
		printf("reassignbuf: NULL");
		return;
	}
	++reassignbufcalls;

#if !defined(MAX_PERF)
	/*
	 * B_PAGING flagged buffers cannot be reassigned because their vp
	 * is not fully linked in.
	 */
	if (bp->b_flags & B_PAGING)
		panic("cannot reassign paging buffer");
#endif

	s = splbio();
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			listheadp = &bp->b_vp->v_dirtyblkhd;
		else
			listheadp = &bp->b_vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
		if (bp->b_vp != newvp) {
			vdrop(bp->b_vp);
			bp->b_vp = NULL;	/* for clarification */
		}
	}
	/*
	 * If dirty, put on list of dirty buffers; otherwise insert onto list
	 * of clean buffers.
	 */
	if (bp->b_flags & B_DELWRI) {
		struct buf *tbp;

		listheadp = &newvp->v_dirtyblkhd;
		if ((newvp->v_flag & VONWORKLST) == 0) {
			/*
			 * First dirty buffer on this vnode: schedule it with
			 * the syncer.  Directories and mounted devices get
			 * shorter delays than plain file data.
			 */
			switch (newvp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VCHR:
			case VBLK:
				if (newvp->v_specmountpoint != NULL) {
					delay = metadelay;
					break;
				}
				/* fall through */
			default:
				delay = filedelay;
			}
			vn_syncer_add_to_worklist(newvp, delay);
		}
		bp->b_xflags |= BX_VNDIRTY;
		tbp = TAILQ_FIRST(listheadp);
		if (tbp == NULL ||
		    bp->b_lblkno == 0 ||
		    (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
		    (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
			TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (bp->b_lblkno < 0) {
			TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (reassignbufmethod == 1) {
			/*
			 * New sorting algorithm, only handle sequential case,
			 * otherwise append to end (but before metadata)
			 */
			if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
			    (tbp->b_xflags & BX_VNDIRTY)) {
				/*
				 * Found the best place to insert the buffer
				 */
				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
				++reassignbufsortgood;
			} else {
				/*
				 * Missed, append to end, but before meta-data.
				 * We know that the head buffer in the list is
				 * not meta-data due to prior conditionals.
				 *
				 * Indirect effects:  NFS second stage write
				 * tends to wind up here, giving maximum
				 * distance between the unstable write and the
				 * commit rpc.
				 */
				tbp = TAILQ_LAST(listheadp, buflists);
				while (tbp && tbp->b_lblkno < 0)
					tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
				++reassignbufsortbad;
			}
		} else {
			/*
			 * Old sorting algorithm, scan queue and insert
			 */
			struct buf *ttbp;
			while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
			    (ttbp->b_lblkno < bp->b_lblkno)) {
				++reassignbufloops;
				tbp = ttbp;
			}
			TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
		}
	} else {
		bp->b_xflags |= BX_VNCLEAN;
		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
		if ((newvp->v_flag & VONWORKLST) &&
		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
			newvp->v_flag &= ~VONWORKLST;
			LIST_REMOVE(newvp, v_synclist);
		}
	}
	if (bp->b_vp != newvp) {
		bp->b_vp = newvp;
		vhold(bp->b_vp);	/* hold reference for the new queue link */
	}
	splx(s);
}

/*
 * Create a vnode for a block device.
 * Used for mounting the root file system.
 *
 * Returns ENXIO for NODEV, otherwise 0 with *vpp set to a new VBLK
 * vnode aliased to dev, or a getnewvnode() error with *vpp = NULLVP.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (ENXIO);
	}
	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = VBLK;
	addalias(vp, dev);
	*vpp = vp;
	return (0);
}

/*
 * Add vnode to the alias list hung off the dev_t.
 *
 * The reason for this gunk is that multiple vnodes can reference
 * the same physical device, so checking vp->v_usecount to see
 * how many users there are is inadequate; the v_usecount for
 * the vnodes need to be accumulated.  vcount() does that.
 */
void
addaliasu(nvp, nvp_rdev)
	struct vnode *nvp;
	udev_t nvp_rdev;
{

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		panic("addaliasu on non-special vnode");
	/* Convert the user-space udev_t to a kernel dev_t, then alias. */
	addalias(nvp, udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0));
}

void
addalias(nvp, dev)
	struct vnode *nvp;
	dev_t dev;
{

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		panic("addalias on non-special vnode");

	/*
	 * Hang the vnode off the dev_t's alias list; spechash_slock
	 * protects the per-device list of aliased vnodes.
	 */
	nvp->v_rdev = dev;
	simple_lock(&spechash_slock);
	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
	simple_unlock(&spechash_slock);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. The vnode lock bit is set if the
 * vnode is being eliminated in vgone. The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 *
 * If LK_INTERLOCK is set in flags, the caller already holds
 * vp->v_interlock; in all cases the interlock is released on return.
 */
int
vget(vp, flags, p)
	register struct vnode *vp;
	int flags;
	struct proc *p;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure. Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */
	if ((flags & LK_INTERLOCK) == 0) {
		simple_lock(&vp->v_interlock);
	}
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vget", 0);
		return (ENOENT);
	}

	vp->v_usecount++;

	/* Now that it is referenced, pull the vnode off the free list. */
	if (VSHOULDBUSY(vp))
		vbusy(vp);
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active. We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			simple_lock(&vp->v_interlock);
			vp->v_usecount--;
			if (VSHOULDFREE(vp))
				vfree(vp);
			simple_unlock(&vp->v_interlock);
		}
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Add an additional reference to a vnode that is already referenced.
 */
void
vref(struct vnode *vp)
{
	simple_lock(&vp->v_interlock);
	vp->v_usecount++;
	simple_unlock(&vp->v_interlock);
}

/*
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vrele: null vp"));

	simple_lock(&vp->v_interlock);

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		simple_unlock(&vp->v_interlock);

		return;
	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
		/*
		 * If we are doing a vput, the node is already locked, and we must
		 * call VOP_INACTIVE with the node locked.  So, in the case of
		 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
		 */
		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
			VOP_INACTIVE(vp, p);
		}

	} else {
#ifdef DIAGNOSTIC
		vprint("vrele: negative ref count", vp);
		simple_unlock(&vp->v_interlock);
#endif
		panic("vrele: negative ref cnt");
	}
}

/*
 * Release a reference on a vnode that the caller has locked.
 * Unlocks the vnode; if the count drops to zero the inactive
 * routine is called with the node still locked.
 */
void
vput(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vput: null vp"));

	simple_lock(&vp->v_interlock);

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		VOP_UNLOCK(vp, LK_INTERLOCK, p);
		return;

	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
		/*
		 * Since this is a vput, the node is already locked, so we
		 * just drop the interlock and call VOP_INACTIVE with the
		 * node locked; VOP_INACTIVE will unlock the vnode.
		 */
		simple_unlock(&vp->v_interlock);
		VOP_INACTIVE(vp, p);

	} else {
#ifdef DIAGNOSTIC
		vprint("vput: negative ref count", vp);
#endif
		panic("vput: negative ref cnt");
	}
}

/*
 * Somebody doesn't want the vnode recycled.
 */
void
vhold(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	vp->v_holdcnt++;
	/* A held vnode must not sit on the free list. */
	if (VSHOULDBUSY(vp))
		vbusy(vp);
	splx(s);
}

/*
 * One less who cares about this vnode.
 */
void
vdrop(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	if (vp->v_holdcnt <= 0)
		panic("vdrop: holdcnt");
	vp->v_holdcnt--;
	/* With the last hold gone the vnode may now be freed. */
	if (VSHOULDFREE(vp))
		vfree(vp);
	splx(s);
}

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error).
If MNT_FORCE is specified, detach any active vnodes
 * that are found.
 *
 * skipvp, if non-NULL, names one vnode to leave untouched.
 * Returns 0 on success or EBUSY if busy vnodes remain.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (vp->v_mount != mp)
			goto loop;
		nvp = LIST_NEXT(vp, v_mntvnodes);
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;

		simple_lock(&vp->v_interlock);
		/*
		 * Skip over a vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file vnodes
		 * open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}

		/*
		 * With v_usecount == 0, all we need to do is clear out the
		 * vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			/* vgonel sleeps, so drop the list lock around it. */
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}

		/*
		 * If FORCECLOSE is set, forcibly close the vnode. For block
		 * or character devices, revert to an anonymous device. For
		 * all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *) 0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DIAGNOSTIC
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
 *
 * Called with the vnode interlock held; flags may include DOCLOSE
 * to force a VOP_CLOSE of an active vnode.
 */
static void
vclean(vp, flags, p)
	struct vnode *vp;
	int flags;
	struct proc *p;
{
	int active;
	vm_object_t obj;

	/*
	 * Check to see if the vnode is in use. If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	if ((active = vp->v_usecount))
		vp->v_usecount++;

	/*
	 * Prevent the vnode from being recycled or brought into use while we
	 * clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock");
	vp->v_flag |= VXLOCK;
	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);

	/*
	 * Clean out any buffers associated with the vnode.
	 */
	vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
	if ((obj = vp->v_object) != NULL) {
		if (obj->ref_count == 0) {
			/*
			 * vclean() may be called twice. The first time removes the
			 * primary reference to the object, the second time goes
			 * one further and is a special-case to terminate the object.
			 */
			vm_object_terminate(obj);
		} else {
			/*
			 * Woe to the process that tries to page now :-).
			 */
			vm_pager_deallocate(obj);
		}
	}

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0, p);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim");

	if (active) {
		/*
		 * Inline copy of vrele() since VOP_INACTIVE
		 * has already been called.
		 */
		simple_lock(&vp->v_interlock);
		if (--vp->v_usecount <= 0) {
#ifdef DIAGNOSTIC
			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
				vprint("vclean: bad ref count", vp);
				panic("vclean: ref cnt");
			}
#endif
			vfree(vp);
		}
		simple_unlock(&vp->v_interlock);
	}

	cache_purge(vp);
	if (vp->v_vnlock) {
		FREE(vp->v_vnlock, M_VNODE);
		vp->v_vnlock = NULL;
	}

	if (VSHOULDFREE(vp))
		vfree(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vn_pollgone(vp);
	vp->v_tag = VT_NON;
	vp->v_flag &= ~VXLOCK;
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		wakeup((caddr_t) vp);
	}
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
int
vop_revoke(ap)
	struct vop_revoke_args /* {
		struct vnode *a_vp;
		int a_flags;
	} */ *ap;
{
	struct vnode *vp, *vq;
	dev_t dev;

	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));

	vp = ap->a_vp;
	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
		return (0);
	}
	/*
	 * Destroy every alias of the device until the hash chain is
	 * empty; vgone() removes the vnode from the chain.
	 */
	dev = vp->v_rdev;
	for (;;) {
		simple_lock(&spechash_slock);
		vq = SLIST_FIRST(&dev->si_hlist);
		simple_unlock(&spechash_slock);
		if (!vq)
			break;
		vgone(vq);
	}
	return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 * Returns 1 if the vnode was recycled, 0 if it was in use.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct simplelock *inter_lkp;
	struct proc *p;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		if (inter_lkp) {
			simple_unlock(inter_lkp);
		}
		vgonel(vp, p);
		return (1);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	/* vgonel() consumes (releases) the interlock acquired here. */
	simple_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 */
void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	int s;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vgone", 0);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 */
	vclean(vp, DOCLOSE, p);
	simple_lock(&vp->v_interlock);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) {
		simple_lock(&spechash_slock);
		SLIST_REMOVE(&vp->v_hashchain, vp, vnode, v_specnext);
		freedev(vp->v_rdev);
		simple_unlock(&spechash_slock);
		vp->v_rdev = NULL;
	}

	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list. The test of the back
	 * pointer and the reference count of zero is because
	 * it will be removed from the free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 */
	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
		s = splbio();
		simple_lock(&vnode_free_list_slock);
		if (vp->v_flag & VFREE) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		} else if (vp->v_flag & VTBFREE) {
			TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
			vp->v_flag &= ~VTBFREE;
			freevnodes++;
		} else
			freevnodes++;
		vp->v_flag |= VFREE;
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
		splx(s);
	}

	vp->v_type = VBAD;
	simple_unlock(&vp->v_interlock);
}

/*
 * Lookup a vnode by device number.
1896 */ 1897int 1898vfinddev(dev, type, vpp) 1899 dev_t dev; 1900 enum vtype type; 1901 struct vnode **vpp; 1902{ 1903 struct vnode *vp; 1904 1905 simple_lock(&spechash_slock); 1906 SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) { 1907 if (type == vp->v_type) { 1908 *vpp = vp; 1909 simple_unlock(&spechash_slock); 1910 return (1); 1911 } 1912 } 1913 simple_unlock(&spechash_slock); 1914 return (0); 1915} 1916 1917/* 1918 * Calculate the total number of references to a special device. 1919 */ 1920int 1921vcount(vp) 1922 struct vnode *vp; 1923{ 1924 struct vnode *vq; 1925 int count; 1926 1927 count = 0; 1928 simple_lock(&spechash_slock); 1929 SLIST_FOREACH(vq, &vp->v_hashchain, v_specnext) 1930 count += vq->v_usecount; 1931 simple_unlock(&spechash_slock); 1932 return (count); 1933} 1934 1935/* 1936 * Print out a description of a vnode. 1937 */ 1938static char *typename[] = 1939{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; 1940 1941void 1942vprint(label, vp) 1943 char *label; 1944 struct vnode *vp; 1945{ 1946 char buf[96]; 1947 1948 if (label != NULL) 1949 printf("%s: %p: ", label, (void *)vp); 1950 else 1951 printf("%p: ", (void *)vp); 1952 printf("type %s, usecount %d, writecount %d, refcount %d,", 1953 typename[vp->v_type], vp->v_usecount, vp->v_writecount, 1954 vp->v_holdcnt); 1955 buf[0] = '\0'; 1956 if (vp->v_flag & VROOT) 1957 strcat(buf, "|VROOT"); 1958 if (vp->v_flag & VTEXT) 1959 strcat(buf, "|VTEXT"); 1960 if (vp->v_flag & VSYSTEM) 1961 strcat(buf, "|VSYSTEM"); 1962 if (vp->v_flag & VXLOCK) 1963 strcat(buf, "|VXLOCK"); 1964 if (vp->v_flag & VXWANT) 1965 strcat(buf, "|VXWANT"); 1966 if (vp->v_flag & VBWAIT) 1967 strcat(buf, "|VBWAIT"); 1968 if (vp->v_flag & VDOOMED) 1969 strcat(buf, "|VDOOMED"); 1970 if (vp->v_flag & VFREE) 1971 strcat(buf, "|VFREE"); 1972 if (vp->v_flag & VOBJBUF) 1973 strcat(buf, "|VOBJBUF"); 1974 if (buf[0] != '\0') 1975 printf(" flags (%s)", &buf[1]); 1976 if (vp->v_data == NULL) { 1977 printf("\n"); 1978 } 
else { 1979 printf("\n\t"); 1980 VOP_PRINT(vp); 1981 } 1982} 1983 1984#ifdef DDB 1985#include <ddb/ddb.h> 1986/* 1987 * List all of the locked vnodes in the system. 1988 * Called when debugging the kernel. 1989 */ 1990DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) 1991{ 1992 struct proc *p = curproc; /* XXX */ 1993 struct mount *mp, *nmp; 1994 struct vnode *vp; 1995 1996 printf("Locked vnodes\n"); 1997 simple_lock(&mountlist_slock); 1998 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 1999 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { 2000 nmp = TAILQ_NEXT(mp, mnt_list); 2001 continue; 2002 } 2003 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { 2004 if (VOP_ISLOCKED(vp, NULL)) 2005 vprint((char *)0, vp); 2006 } 2007 simple_lock(&mountlist_slock); 2008 nmp = TAILQ_NEXT(mp, mnt_list); 2009 vfs_unbusy(mp, p); 2010 } 2011 simple_unlock(&mountlist_slock); 2012} 2013#endif 2014 2015/* 2016 * Top level filesystem related information gathering. 2017 */ 2018static int sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS); 2019 2020static int 2021vfs_sysctl SYSCTL_HANDLER_ARGS 2022{ 2023 int *name = (int *)arg1 - 1; /* XXX */ 2024 u_int namelen = arg2 + 1; /* XXX */ 2025 struct vfsconf *vfsp; 2026 2027#if 1 || defined(COMPAT_PRELITE2) 2028 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ 2029 if (namelen == 1) 2030 return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); 2031#endif 2032 2033#ifdef notyet 2034 /* all sysctl names at this level are at least name and field */ 2035 if (namelen < 2) 2036 return (ENOTDIR); /* overloaded */ 2037 if (name[0] != VFS_GENERIC) { 2038 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2039 if (vfsp->vfc_typenum == name[0]) 2040 break; 2041 if (vfsp == NULL) 2042 return (EOPNOTSUPP); 2043 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, 2044 oldp, oldlenp, newp, newlen, p)); 2045 } 2046#endif 2047 switch (name[1]) { 2048 case VFS_MAXTYPENUM: 2049 if (namelen != 2) 2050 return (ENOTDIR); 2051 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); 2052 case VFS_CONF: 2053 if (namelen != 3) 2054 return (ENOTDIR); /* overloaded */ 2055 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) 2056 if (vfsp->vfc_typenum == name[2]) 2057 break; 2058 if (vfsp == NULL) 2059 return (EOPNOTSUPP); 2060 return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); 2061 } 2062 return (EOPNOTSUPP); 2063} 2064 2065SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, 2066 "Generic filesystem"); 2067 2068#if 1 || defined(COMPAT_PRELITE2) 2069 2070static int 2071sysctl_ovfs_conf SYSCTL_HANDLER_ARGS 2072{ 2073 int error; 2074 struct vfsconf *vfsp; 2075 struct ovfsconf ovfs; 2076 2077 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { 2078 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ 2079 strcpy(ovfs.vfc_name, vfsp->vfc_name); 2080 ovfs.vfc_index = vfsp->vfc_typenum; 2081 ovfs.vfc_refcount = vfsp->vfc_refcount; 2082 ovfs.vfc_flags = vfsp->vfc_flags; 2083 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); 2084 if (error) 2085 return error; 2086 } 2087 return 0; 2088} 2089 2090#endif /* 1 || COMPAT_PRELITE2 */ 2091 2092#if 0 2093#define KINFO_VNODESLOP 10 2094/* 2095 * Dump vnode list (via sysctl). 2096 * Copyout address of vnode followed by vnode. 
2097 */ 2098/* ARGSUSED */ 2099static int 2100sysctl_vnode SYSCTL_HANDLER_ARGS 2101{ 2102 struct proc *p = curproc; /* XXX */ 2103 struct mount *mp, *nmp; 2104 struct vnode *nvp, *vp; 2105 int error; 2106 2107#define VPTRSZ sizeof (struct vnode *) 2108#define VNODESZ sizeof (struct vnode) 2109 2110 req->lock = 0; 2111 if (!req->oldptr) /* Make an estimate */ 2112 return (SYSCTL_OUT(req, 0, 2113 (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); 2114 2115 simple_lock(&mountlist_slock); 2116 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { 2117 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { 2118 nmp = TAILQ_NEXT(mp, mnt_list); 2119 continue; 2120 } 2121again: 2122 simple_lock(&mntvnode_slock); 2123 for (vp = LIST_FIRST(&mp->mnt_vnodelist); 2124 vp != NULL; 2125 vp = nvp) { 2126 /* 2127 * Check that the vp is still associated with 2128 * this filesystem. RACE: could have been 2129 * recycled onto the same filesystem. 2130 */ 2131 if (vp->v_mount != mp) { 2132 simple_unlock(&mntvnode_slock); 2133 goto again; 2134 } 2135 nvp = LIST_NEXT(vp, v_mntvnodes); 2136 simple_unlock(&mntvnode_slock); 2137 if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || 2138 (error = SYSCTL_OUT(req, vp, VNODESZ))) 2139 return (error); 2140 simple_lock(&mntvnode_slock); 2141 } 2142 simple_unlock(&mntvnode_slock); 2143 simple_lock(&mountlist_slock); 2144 nmp = TAILQ_NEXT(mp, mnt_list); 2145 vfs_unbusy(mp, p); 2146 } 2147 simple_unlock(&mountlist_slock); 2148 2149 return (0); 2150} 2151#endif 2152 2153/* 2154 * XXX 2155 * Exporting the vnode list on large systems causes them to crash. 2156 * Exporting the vnode list on medium systems causes sysctl to coredump. 2157 */ 2158#if 0 2159SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 2160 0, 0, sysctl_vnode, "S,vnode", ""); 2161#endif 2162 2163/* 2164 * Check to see if a filesystem is mounted on a block device. 
 */
int
vfs_mountedon(vp)
	struct vnode *vp;
{

	/* A non-NULL v_specmountpoint means a filesystem is mounted here. */
	if (vp->v_specmountpoint != NULL)
		return (EBUSY);
	return (0);
}

/*
 * Unmount all filesystems. The list is traversed in reverse order
 * of mounting to avoid dependencies.
 */
void
vfs_unmountall()
{
	struct mount *mp;
	struct proc *p;
	int error;

	if (curproc != NULL)
		p = curproc;
	else
		p = initproc;	/* XXX XXX should this be proc0? */
	/*
	 * Since this only runs when rebooting, it is not interlocked.
	 */
	while(!TAILQ_EMPTY(&mountlist)) {
		mp = TAILQ_LAST(&mountlist, mntlist);
		error = dounmount(mp, MNT_FORCE, p);
		if (error) {
			/*
			 * Drop the failed mount from the list ourselves so
			 * the loop is guaranteed to terminate.
			 */
			TAILQ_REMOVE(&mountlist, mp, mnt_list);
			printf("unmount of %s failed (",
			    mp->mnt_stat.f_mntonname);
			if (error == EBUSY)
				printf("BUSY)\n");
			else
				printf("%d)\n", error);
		} else {
			/* The unmount has removed mp from the mountlist */
		}
	}
}

/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by ufs_mount() to set up the lists of export addresses.
2214 */ 2215static int 2216vfs_hang_addrlist(mp, nep, argp) 2217 struct mount *mp; 2218 struct netexport *nep; 2219 struct export_args *argp; 2220{ 2221 register struct netcred *np; 2222 register struct radix_node_head *rnh; 2223 register int i; 2224 struct radix_node *rn; 2225 struct sockaddr *saddr, *smask = 0; 2226 struct domain *dom; 2227 int error; 2228 2229 if (argp->ex_addrlen == 0) { 2230 if (mp->mnt_flag & MNT_DEFEXPORTED) 2231 return (EPERM); 2232 np = &nep->ne_defexported; 2233 np->netc_exflags = argp->ex_flags; 2234 np->netc_anon = argp->ex_anon; 2235 np->netc_anon.cr_ref = 1; 2236 mp->mnt_flag |= MNT_DEFEXPORTED; 2237 return (0); 2238 } 2239 i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; 2240 np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK); 2241 bzero((caddr_t) np, i); 2242 saddr = (struct sockaddr *) (np + 1); 2243 if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) 2244 goto out; 2245 if (saddr->sa_len > argp->ex_addrlen) 2246 saddr->sa_len = argp->ex_addrlen; 2247 if (argp->ex_masklen) { 2248 smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); 2249 error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen); 2250 if (error) 2251 goto out; 2252 if (smask->sa_len > argp->ex_masklen) 2253 smask->sa_len = argp->ex_masklen; 2254 } 2255 i = saddr->sa_family; 2256 if ((rnh = nep->ne_rtable[i]) == 0) { 2257 /* 2258 * Seems silly to initialize every AF when most are not used, 2259 * do so on demand here 2260 */ 2261 for (dom = domains; dom; dom = dom->dom_next) 2262 if (dom->dom_family == i && dom->dom_rtattach) { 2263 dom->dom_rtattach((void **) &nep->ne_rtable[i], 2264 dom->dom_rtoffset); 2265 break; 2266 } 2267 if ((rnh = nep->ne_rtable[i]) == 0) { 2268 error = ENOBUFS; 2269 goto out; 2270 } 2271 } 2272 rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, 2273 np->netc_rnodes); 2274 if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ 2275 error = EPERM; 2276 goto 
out; 2277 } 2278 np->netc_exflags = argp->ex_flags; 2279 np->netc_anon = argp->ex_anon; 2280 np->netc_anon.cr_ref = 1; 2281 return (0); 2282out: 2283 free(np, M_NETADDR); 2284 return (error); 2285} 2286 2287/* ARGSUSED */ 2288static int 2289vfs_free_netcred(rn, w) 2290 struct radix_node *rn; 2291 void *w; 2292{ 2293 register struct radix_node_head *rnh = (struct radix_node_head *) w; 2294 2295 (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); 2296 free((caddr_t) rn, M_NETADDR); 2297 return (0); 2298} 2299 2300/* 2301 * Free the net address hash lists that are hanging off the mount points. 2302 */ 2303static void 2304vfs_free_addrlist(nep) 2305 struct netexport *nep; 2306{ 2307 register int i; 2308 register struct radix_node_head *rnh; 2309 2310 for (i = 0; i <= AF_MAX; i++) 2311 if ((rnh = nep->ne_rtable[i])) { 2312 (*rnh->rnh_walktree) (rnh, vfs_free_netcred, 2313 (caddr_t) rnh); 2314 free((caddr_t) rnh, M_RTABLE); 2315 nep->ne_rtable[i] = 0; 2316 } 2317} 2318 2319int 2320vfs_export(mp, nep, argp) 2321 struct mount *mp; 2322 struct netexport *nep; 2323 struct export_args *argp; 2324{ 2325 int error; 2326 2327 if (argp->ex_flags & MNT_DELEXPORT) { 2328 if (mp->mnt_flag & MNT_EXPUBLIC) { 2329 vfs_setpublicfs(NULL, NULL, NULL); 2330 mp->mnt_flag &= ~MNT_EXPUBLIC; 2331 } 2332 vfs_free_addrlist(nep); 2333 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); 2334 } 2335 if (argp->ex_flags & MNT_EXPORTED) { 2336 if (argp->ex_flags & MNT_EXPUBLIC) { 2337 if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) 2338 return (error); 2339 mp->mnt_flag |= MNT_EXPUBLIC; 2340 } 2341 if ((error = vfs_hang_addrlist(mp, nep, argp))) 2342 return (error); 2343 mp->mnt_flag |= MNT_EXPORTED; 2344 } 2345 return (0); 2346} 2347 2348 2349/* 2350 * Set the publicly exported filesystem (WebNFS). 
Currently, only 2351 * one public filesystem is possible in the spec (RFC 2054 and 2055) 2352 */ 2353int 2354vfs_setpublicfs(mp, nep, argp) 2355 struct mount *mp; 2356 struct netexport *nep; 2357 struct export_args *argp; 2358{ 2359 int error; 2360 struct vnode *rvp; 2361 char *cp; 2362 2363 /* 2364 * mp == NULL -> invalidate the current info, the FS is 2365 * no longer exported. May be called from either vfs_export 2366 * or unmount, so check if it hasn't already been done. 2367 */ 2368 if (mp == NULL) { 2369 if (nfs_pub.np_valid) { 2370 nfs_pub.np_valid = 0; 2371 if (nfs_pub.np_index != NULL) { 2372 FREE(nfs_pub.np_index, M_TEMP); 2373 nfs_pub.np_index = NULL; 2374 } 2375 } 2376 return (0); 2377 } 2378 2379 /* 2380 * Only one allowed at a time. 2381 */ 2382 if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) 2383 return (EBUSY); 2384 2385 /* 2386 * Get real filehandle for root of exported FS. 2387 */ 2388 bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle)); 2389 nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; 2390 2391 if ((error = VFS_ROOT(mp, &rvp))) 2392 return (error); 2393 2394 if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) 2395 return (error); 2396 2397 vput(rvp); 2398 2399 /* 2400 * If an indexfile was specified, pull it in. 2401 */ 2402 if (argp->ex_indexfile != NULL) { 2403 MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP, 2404 M_WAITOK); 2405 error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, 2406 MAXNAMLEN, (size_t *)0); 2407 if (!error) { 2408 /* 2409 * Check for illegal filenames. 
2410 */ 2411 for (cp = nfs_pub.np_index; *cp; cp++) { 2412 if (*cp == '/') { 2413 error = EINVAL; 2414 break; 2415 } 2416 } 2417 } 2418 if (error) { 2419 FREE(nfs_pub.np_index, M_TEMP); 2420 return (error); 2421 } 2422 } 2423 2424 nfs_pub.np_mount = mp; 2425 nfs_pub.np_valid = 1; 2426 return (0); 2427} 2428 2429struct netcred * 2430vfs_export_lookup(mp, nep, nam) 2431 register struct mount *mp; 2432 struct netexport *nep; 2433 struct sockaddr *nam; 2434{ 2435 register struct netcred *np; 2436 register struct radix_node_head *rnh; 2437 struct sockaddr *saddr; 2438 2439 np = NULL; 2440 if (mp->mnt_flag & MNT_EXPORTED) { 2441 /* 2442 * Lookup in the export list first. 2443 */ 2444 if (nam != NULL) { 2445 saddr = nam; 2446 rnh = nep->ne_rtable[saddr->sa_family]; 2447 if (rnh != NULL) { 2448 np = (struct netcred *) 2449 (*rnh->rnh_matchaddr)((caddr_t)saddr, 2450 rnh); 2451 if (np && np->netc_rnodes->rn_flags & RNF_ROOT) 2452 np = NULL; 2453 } 2454 } 2455 /* 2456 * If no address match, use the default if it exists. 2457 */ 2458 if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) 2459 np = &nep->ne_defexported; 2460 } 2461 return (np); 2462} 2463 2464/* 2465 * perform msync on all vnodes under a mount point 2466 * the mount point must be locked. 2467 */ 2468void 2469vfs_msync(struct mount *mp, int flags) { 2470 struct vnode *vp, *nvp; 2471 struct vm_object *obj; 2472 int anyio, tries; 2473 2474 tries = 5; 2475loop: 2476 anyio = 0; 2477 for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) { 2478 2479 nvp = LIST_NEXT(vp, v_mntvnodes); 2480 2481 if (vp->v_mount != mp) { 2482 goto loop; 2483 } 2484 2485 if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? 
*/ 2486 continue; 2487 2488 if (flags != MNT_WAIT) { 2489 obj = vp->v_object; 2490 if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0) 2491 continue; 2492 if (VOP_ISLOCKED(vp, NULL)) 2493 continue; 2494 } 2495 2496 simple_lock(&vp->v_interlock); 2497 if (vp->v_object && 2498 (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) { 2499 if (!vget(vp, 2500 LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) { 2501 if (vp->v_object) { 2502 vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); 2503 anyio = 1; 2504 } 2505 vput(vp); 2506 } 2507 } else { 2508 simple_unlock(&vp->v_interlock); 2509 } 2510 } 2511 if (anyio && (--tries > 0)) 2512 goto loop; 2513} 2514 2515/* 2516 * Create the VM object needed for VMIO and mmap support. This 2517 * is done for all VREG files in the system. Some filesystems might 2518 * afford the additional metadata buffering capability of the 2519 * VMIO code by making the device node be VMIO mode also. 2520 * 2521 * vp must be locked when vfs_object_create is called. 2522 */ 2523int 2524vfs_object_create(vp, p, cred) 2525 struct vnode *vp; 2526 struct proc *p; 2527 struct ucred *cred; 2528{ 2529 struct vattr vat; 2530 vm_object_t object; 2531 int error = 0; 2532 2533 if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE) 2534 return 0; 2535 2536retry: 2537 if ((object = vp->v_object) == NULL) { 2538 if (vp->v_type == VREG || vp->v_type == VDIR) { 2539 if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0) 2540 goto retn; 2541 object = vnode_pager_alloc(vp, vat.va_size, 0, 0); 2542 } else if (devsw(vp->v_rdev) != NULL) { 2543 /* 2544 * This simply allocates the biggest object possible 2545 * for a disk vnode. This should be fixed, but doesn't 2546 * cause any problems (yet). 2547 */ 2548 object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0); 2549 } else { 2550 goto retn; 2551 } 2552 /* 2553 * Dereference the reference we just created. This assumes 2554 * that the object is associated with the vp. 
2555 */ 2556 object->ref_count--; 2557 vp->v_usecount--; 2558 } else { 2559 if (object->flags & OBJ_DEAD) { 2560 VOP_UNLOCK(vp, 0, p); 2561 tsleep(object, PVM, "vodead", 0); 2562 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); 2563 goto retry; 2564 } 2565 } 2566 2567 KASSERT(vp->v_object != NULL, ("vfs_object_create: NULL object")); 2568 vp->v_flag |= VOBJBUF; 2569 2570retn: 2571 return error; 2572} 2573 2574static void 2575vfree(vp) 2576 struct vnode *vp; 2577{ 2578 int s; 2579 2580 s = splbio(); 2581 simple_lock(&vnode_free_list_slock); 2582 if (vp->v_flag & VTBFREE) { 2583 TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); 2584 vp->v_flag &= ~VTBFREE; 2585 } 2586 if (vp->v_flag & VAGE) { 2587 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); 2588 } else { 2589 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); 2590 } 2591 freevnodes++; 2592 simple_unlock(&vnode_free_list_slock); 2593 vp->v_flag &= ~VAGE; 2594 vp->v_flag |= VFREE; 2595 splx(s); 2596} 2597 2598void 2599vbusy(vp) 2600 struct vnode *vp; 2601{ 2602 int s; 2603 2604 s = splbio(); 2605 simple_lock(&vnode_free_list_slock); 2606 if (vp->v_flag & VTBFREE) { 2607 TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); 2608 vp->v_flag &= ~VTBFREE; 2609 } else { 2610 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); 2611 freevnodes--; 2612 } 2613 simple_unlock(&vnode_free_list_slock); 2614 vp->v_flag &= ~(VFREE|VAGE); 2615 splx(s); 2616} 2617 2618/* 2619 * Record a process's interest in events which might happen to 2620 * a vnode. Because poll uses the historic select-style interface 2621 * internally, this routine serves as both the ``check for any 2622 * pending events'' and the ``record my interest in future events'' 2623 * functions. (These are done together, while the lock is held, 2624 * to avoid race conditions.) 
2625 */ 2626int 2627vn_pollrecord(vp, p, events) 2628 struct vnode *vp; 2629 struct proc *p; 2630 short events; 2631{ 2632 simple_lock(&vp->v_pollinfo.vpi_lock); 2633 if (vp->v_pollinfo.vpi_revents & events) { 2634 /* 2635 * This leaves events we are not interested 2636 * in available for the other process which 2637 * which presumably had requested them 2638 * (otherwise they would never have been 2639 * recorded). 2640 */ 2641 events &= vp->v_pollinfo.vpi_revents; 2642 vp->v_pollinfo.vpi_revents &= ~events; 2643 2644 simple_unlock(&vp->v_pollinfo.vpi_lock); 2645 return events; 2646 } 2647 vp->v_pollinfo.vpi_events |= events; 2648 selrecord(p, &vp->v_pollinfo.vpi_selinfo); 2649 simple_unlock(&vp->v_pollinfo.vpi_lock); 2650 return 0; 2651} 2652 2653/* 2654 * Note the occurrence of an event. If the VN_POLLEVENT macro is used, 2655 * it is possible for us to miss an event due to race conditions, but 2656 * that condition is expected to be rare, so for the moment it is the 2657 * preferred interface. 2658 */ 2659void 2660vn_pollevent(vp, events) 2661 struct vnode *vp; 2662 short events; 2663{ 2664 simple_lock(&vp->v_pollinfo.vpi_lock); 2665 if (vp->v_pollinfo.vpi_events & events) { 2666 /* 2667 * We clear vpi_events so that we don't 2668 * call selwakeup() twice if two events are 2669 * posted before the polling process(es) is 2670 * awakened. This also ensures that we take at 2671 * most one selwakeup() if the polling process 2672 * is no longer interested. However, it does 2673 * mean that only one event can be noticed at 2674 * a time. (Perhaps we should only clear those 2675 * event bits which we note?) XXX 2676 */ 2677 vp->v_pollinfo.vpi_events = 0; /* &= ~events ??? */ 2678 vp->v_pollinfo.vpi_revents |= events; 2679 selwakeup(&vp->v_pollinfo.vpi_selinfo); 2680 } 2681 simple_unlock(&vp->v_pollinfo.vpi_lock); 2682} 2683 2684/* 2685 * Wake up anyone polling on vp because it is being revoked. 
 * This depends on dead_poll() returning POLLHUP for correct
 * behavior.
 */
void
vn_pollgone(vp)
	struct vnode *vp;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events) {
		/* Drop all registered interest and wake every poller. */
		vp->v_pollinfo.vpi_events = 0;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	simple_unlock(&vp->v_pollinfo.vpi_lock);
}



/*
 * Routine to create and manage a filesystem syncer vnode.
 *
 * The syncer vnode implements only the operations it needs:
 * close/lock/unlock/islocked are aliased to the generic no-op
 * routines below, and every other operation fails with
 * EOPNOTSUPP through the default entry.
 */
#define sync_close ((int (*) __P((struct  vop_close_args *)))nullop)
static int	sync_fsync __P((struct  vop_fsync_args *));
static int	sync_inactive __P((struct  vop_inactive_args *));
static int	sync_reclaim  __P((struct  vop_reclaim_args *));
#define sync_lock ((int (*) __P((struct  vop_lock_args *)))vop_nolock)
#define sync_unlock ((int (*) __P((struct  vop_unlock_args *)))vop_nounlock)
static int	sync_print __P((struct vop_print_args *));
#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)

/* Operations vector for the syncer vnode, registered via VNODEOP_SET. */
static vop_t **sync_vnodeop_p;
static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
	{ NULL, NULL }
};
static struct vnodeopv_desc sync_vnodeop_opv_desc =
	{ &sync_vnodeop_p, sync_vnodeop_entries };

VNODEOP_SET(sync_vnodeop_opv_desc);

/*
 * Create a new filesystem syncer vnode for the specified mount point.
2735 */ 2736int 2737vfs_allocate_syncvnode(mp) 2738 struct mount *mp; 2739{ 2740 struct vnode *vp; 2741 static long start, incr, next; 2742 int error; 2743 2744 /* Allocate a new vnode */ 2745 if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) { 2746 mp->mnt_syncer = NULL; 2747 return (error); 2748 } 2749 vp->v_type = VNON; 2750 /* 2751 * Place the vnode onto the syncer worklist. We attempt to 2752 * scatter them about on the list so that they will go off 2753 * at evenly distributed times even if all the filesystems 2754 * are mounted at once. 2755 */ 2756 next += incr; 2757 if (next == 0 || next > syncer_maxdelay) { 2758 start /= 2; 2759 incr /= 2; 2760 if (start == 0) { 2761 start = syncer_maxdelay / 2; 2762 incr = syncer_maxdelay; 2763 } 2764 next = start; 2765 } 2766 vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); 2767 mp->mnt_syncer = vp; 2768 return (0); 2769} 2770 2771/* 2772 * Do a lazy sync of the filesystem. 2773 */ 2774static int 2775sync_fsync(ap) 2776 struct vop_fsync_args /* { 2777 struct vnode *a_vp; 2778 struct ucred *a_cred; 2779 int a_waitfor; 2780 struct proc *a_p; 2781 } */ *ap; 2782{ 2783 struct vnode *syncvp = ap->a_vp; 2784 struct mount *mp = syncvp->v_mount; 2785 struct proc *p = ap->a_p; 2786 int asyncflag; 2787 2788 /* 2789 * We only need to do something if this is a lazy evaluation. 2790 */ 2791 if (ap->a_waitfor != MNT_LAZY) 2792 return (0); 2793 2794 /* 2795 * Move ourselves to the back of the sync list. 2796 */ 2797 vn_syncer_add_to_worklist(syncvp, syncdelay); 2798 2799 /* 2800 * Walk the list of vnodes pushing all that are dirty and 2801 * not already on the sync list. 
2802 */ 2803 simple_lock(&mountlist_slock); 2804 if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) { 2805 simple_unlock(&mountlist_slock); 2806 return (0); 2807 } 2808 asyncflag = mp->mnt_flag & MNT_ASYNC; 2809 mp->mnt_flag &= ~MNT_ASYNC; 2810 vfs_msync(mp, MNT_NOWAIT); 2811 VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p); 2812 if (asyncflag) 2813 mp->mnt_flag |= MNT_ASYNC; 2814 vfs_unbusy(mp, p); 2815 return (0); 2816} 2817 2818/* 2819 * The syncer vnode is no referenced. 2820 */ 2821static int 2822sync_inactive(ap) 2823 struct vop_inactive_args /* { 2824 struct vnode *a_vp; 2825 struct proc *a_p; 2826 } */ *ap; 2827{ 2828 2829 vgone(ap->a_vp); 2830 return (0); 2831} 2832 2833/* 2834 * The syncer vnode is no longer needed and is being decommissioned. 2835 * 2836 * Modifications to the worklist must be protected at splbio(). 2837 */ 2838static int 2839sync_reclaim(ap) 2840 struct vop_reclaim_args /* { 2841 struct vnode *a_vp; 2842 } */ *ap; 2843{ 2844 struct vnode *vp = ap->a_vp; 2845 int s; 2846 2847 s = splbio(); 2848 vp->v_mount->mnt_syncer = NULL; 2849 if (vp->v_flag & VONWORKLST) { 2850 LIST_REMOVE(vp, v_synclist); 2851 vp->v_flag &= ~VONWORKLST; 2852 } 2853 splx(s); 2854 2855 return (0); 2856} 2857 2858/* 2859 * Print out a syncer vnode. 
2860 */ 2861static int 2862sync_print(ap) 2863 struct vop_print_args /* { 2864 struct vnode *a_vp; 2865 } */ *ap; 2866{ 2867 struct vnode *vp = ap->a_vp; 2868 2869 printf("syncer vnode"); 2870 if (vp->v_vnlock != NULL) 2871 lockmgr_printinfo(vp->v_vnlock); 2872 printf("\n"); 2873 return (0); 2874} 2875 2876/* 2877 * extract the dev_t from a VBLK or VCHR 2878 */ 2879dev_t 2880vn_todev(vp) 2881 struct vnode *vp; 2882{ 2883 if (vp->v_type != VBLK && vp->v_type != VCHR) 2884 return (NODEV); 2885 return (vp->v_rdev); 2886} 2887 2888/* 2889 * Check if vnode represents a disk device 2890 */ 2891int 2892vn_isdisk(vp, errp) 2893 struct vnode *vp; 2894 int *errp; 2895{ 2896 if (vp->v_type != VBLK && vp->v_type != VCHR) { 2897 if (errp != NULL) 2898 *errp = ENOTBLK; 2899 return (0); 2900 } 2901 if (!devsw(vp->v_rdev)) { 2902 if (errp != NULL) 2903 *errp = ENXIO; 2904 return (0); 2905 } 2906 if (!(devsw(vp->v_rdev)->d_flags & D_DISK)) { 2907 if (errp != NULL) 2908 *errp = ENOTBLK; 2909 return (0); 2910 } 2911 if (errp != NULL) 2912 *errp = 0; 2913 return (1); 2914} 2915 2916void 2917NDFREE(ndp, flags) 2918 struct nameidata *ndp; 2919 const uint flags; 2920{ 2921 if (!(flags & NDF_NO_FREE_PNBUF) && 2922 (ndp->ni_cnd.cn_flags & HASBUF)) { 2923 zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); 2924 ndp->ni_cnd.cn_flags &= ~HASBUF; 2925 } 2926 if (!(flags & NDF_NO_DVP_UNLOCK) && 2927 (ndp->ni_cnd.cn_flags & LOCKPARENT) && 2928 ndp->ni_dvp != ndp->ni_vp) 2929 VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_proc); 2930 if (!(flags & NDF_NO_DVP_RELE) && 2931 (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) { 2932 vrele(ndp->ni_dvp); 2933 ndp->ni_dvp = NULL; 2934 } 2935 if (!(flags & NDF_NO_VP_UNLOCK) && 2936 (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) 2937 VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_proc); 2938 if (!(flags & NDF_NO_VP_RELE) && 2939 ndp->ni_vp) { 2940 vrele(ndp->ni_vp); 2941 ndp->ni_vp = NULL; 2942 } 2943 if (!(flags & NDF_NO_STARTDIR_RELE) && 2944 (ndp->ni_cnd.cn_flags & 
SAVESTART)) { 2945 vrele(ndp->ni_startdir); 2946 ndp->ni_startdir = NULL; 2947 } 2948} 2949