vfs_subr.c revision 65557
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
 * $FreeBSD: head/sys/kern/vfs_subr.c 65557 2000-09-07 01:33:02Z jasone $
 */

/*
 * External virtual filesystem routines
 */
#include "opt_ddb.h"
#include "opt_ffs.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/limits.h>
#include <machine/mutex.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_zone.h>

static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");

static void	insmntque __P((struct vnode *vp, struct mount *mp));
static void	vclean __P((struct vnode *vp, int flags, struct proc *p));
static unsigned long	numvnodes;
SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");

enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

static TAILQ_HEAD(freelst, vnode) vnode_free_list;	/* vnode free list */

static u_long wantfreevnodes = 25;
SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
static u_long freevnodes = 0;
SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");

static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
static int reassignbufloops;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
static int reassignbufsortgood;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
static int reassignbufsortbad;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
static int reassignbufmethod = 1;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");

#ifdef ENABLE_VFS_IOOPT
int vfs_ioopt = 0;
SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
#endif

struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* mounted fs */
struct simplelock mountlist_slock;
struct simplelock mntvnode_slock;
int	nfs_mount_type = -1;
#ifndef NULL_SIMPLELOCKS
static struct simplelock mntid_slock;
static struct simplelock vnode_free_list_slock;
static struct simplelock spechash_slock;
#endif
struct nfs_public nfs_pub;	/* publicly exported FS */
static vm_zone_t vnode_zone;
int prtactive = 0;		/* 1 => print out reclaim of active vnodes */

/*
 * The workitem queue.
 */
#define SYNCER_MAXDELAY		32
static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
time_t syncdelay = 30;		/* max time to delay syncing data */
time_t filedelay = 30;		/* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
time_t dirdelay = 29;		/* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
time_t metadelay = 28;		/* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
static int rushjob;		/* number of slots to run ASAP */
static int stat_rush_requests;	/* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");

static int syncer_delayno = 0;
static long syncer_mask;
LIST_HEAD(synclist, vnode);
static struct synclist *syncer_workitem_pending;

int desiredvnodes;
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
    &desiredvnodes, 0, "Maximum number of vnodes");

static void	vfs_free_addrlist __P((struct netexport *nep));
static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
static int	vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
				       struct export_args *argp));

/*
 * Initialize the vnode management data structures.
 */
void
vntblinit()
{

	desiredvnodes = maxproc + cnt.v_page_count / 4;
	simple_lock_init(&mntvnode_slock);
	simple_lock_init(&mntid_slock);
	simple_lock_init(&spechash_slock);
	TAILQ_INIT(&vnode_free_list);
	simple_lock_init(&vnode_free_list_slock);
	vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
	/*
	 * Initialize the filesystem syncer.
	 */
	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
	    &syncer_mask);
	syncer_maxdelay = syncer_mask + 1;
}

/*
 * Mark a mount point as busy. Used to synchronize access and to delay
 * unmounting. Interlock is not released on failure.
 */
int
vfs_busy(mp, flags, interlkp, p)
	struct mount *mp;
	int flags;
	struct simplelock *interlkp;
	struct proc *p;
{
	int lkflags;

	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		if (flags & LK_NOWAIT)
			return (ENOENT);
		mp->mnt_kern_flag |= MNTK_MWAIT;
		if (interlkp) {
			simple_unlock(interlkp);
		}
		/*
		 * Since all busy locks are shared except the exclusive
		 * lock granted when unmounting, the only place that a
		 * wakeup needs to be done is at the release of the
		 * exclusive lock at the end of dounmount.
		 */
		tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
		if (interlkp) {
			simple_lock(interlkp);
		}
		return (ENOENT);
	}
	lkflags = LK_SHARED | LK_NOPAUSE;
	if (interlkp)
		lkflags |= LK_INTERLOCK;
	if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
		panic("vfs_busy: unexpected lock failure");
	return (0);
}

/*
 * Free a busy filesystem.
 */
void
vfs_unbusy(mp, p)
	struct mount *mp;
	struct proc *p;
{

	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
}
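
/*
 * Illustrative sketch (assumed caller pattern, mirrored by the mountlist
 * walkers later in this file): each mount is busied with the mountlist
 * interlock held, so a mount in the middle of being unmounted is skipped
 * rather than referenced:
 *
 *	simple_lock(&mountlist_slock);
 *	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 *		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
 *			nmp = TAILQ_NEXT(mp, mnt_list);
 *			continue;
 *		}
 *		... work on the busied mount ...
 *		simple_lock(&mountlist_slock);
 *		nmp = TAILQ_NEXT(mp, mnt_list);
 *		vfs_unbusy(mp, p);
 *	}
 *	simple_unlock(&mountlist_slock);
 */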

/*
 * Lookup a filesystem type, and if found allocate and initialize
 * a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(fstypename, devname, mpp)
	char *fstypename;
	char *devname;
	struct mount **mpp;
{
	struct proc *p = curproc;	/* XXX */
	struct vfsconf *vfsp;
	struct mount *mp;

	if (fstypename == NULL)
		return (ENODEV);
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	bzero((char *)mp, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
	LIST_INIT(&mp->mnt_vnodelist);
	mp->mnt_vfc = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfc_refcount++;
	mp->mnt_iosize_max = DFLTPHYS;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = 0;
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Find an appropriate filesystem to use for the root. If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 */
#ifdef notdef	/* XXX JH */
int
lite2_vfs_mountroot()
{
	struct vfsconf *vfsp;
	extern int (*lite2_mountroot) __P((void));
	int error;

	if (lite2_mountroot != NULL)
		return ((*lite2_mountroot)());
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL)
			continue;
		if ((error = (*vfsp->vfc_mountroot)()) == 0)
			return (0);
		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	}
	return (ENODEV);
}
#endif

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	simple_lock(&mountlist_slock);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			simple_unlock(&mountlist_slock);
			return (mp);
		}
	}
	simple_unlock(&mountlist_slock);
	return ((struct mount *) 0);
}
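
/*
 * Illustrative sketch (assumed caller): a filesystem identifier that
 * came from userland, e.g. inside an NFS-style file handle, is mapped
 * back to its mount point before use (fhp is hypothetical here):
 *
 *	struct mount *mp;
 *
 *	if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL)
 *		return (ESTALE);
 */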

/*
 * Get a new unique fsid. Try to make its val[0] unique, since this value
 * will be used to create fake device numbers for stat(). Also try (but
 * not so hard) to make its val[0] unique mod 2^16, since some emulators
 * only support 16-bit device numbers. We end up with unique val[0]'s for
 * the first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8
 * calls.
 *
 * Keep in mind that several mounts may be running in parallel. Starting
 * the search one past where the previous search terminated is both a
 * micro-optimization and a defense against returning the same fsid to
 * different mounts.
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_int16_t mntid_base;
	fsid_t tfsid;
	int mtype;

	simple_lock(&mntid_slock);
	mtype = mp->mnt_vfc->vfc_typenum;
	tfsid.val[1] = mtype;
	mtype = (mtype & 0xFF) << 24;
	for (;;) {
		tfsid.val[0] = makeudev(255,
		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
		mntid_base++;
		if (vfs_getvfs(&tfsid) == NULL)
			break;
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
	simple_unlock(&mntid_slock);
}
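
/*
 * Worked example of the val[0] packing above: with vfc_typenum 1 and
 * mntid_base 0, the minor argument passed to makeudev(255, ...) is
 *
 *	(1 << 24) | ((0 & 0xFF00) << 8) | (0 & 0xFF) == 0x01000000
 *
 * and successive mounts vary only the low byte until mntid_base passes
 * 0xFF, which is why val[0] stays unique mod 2^16 only for the first
 * 2^8 calls.
 */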

/*
 * Knob to control the precision of file timestamps:
 *
 *   0 = seconds only; nanoseconds zeroed.
 *   1 = seconds and nanoseconds, accurate within 1/HZ.
 *   2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

static int timestamp_precision = TSP_SEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
    &timestamp_precision, 0, "");

/*
 * Get a current timestamp.
 */
void
vfs_timestamp(tsp)
	struct timespec *tsp;
{
	struct timeval tv;

	switch (timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
	register struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	int s, count;
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp = NULL;
	struct mount *vnmp;
	vm_object_t object;

	/*
	 * We take the least recently used vnode from the freelist
	 * if we can get it and it has no cached pages, and no
	 * namecache entries are relative to it.
	 * Otherwise we allocate a new vnode.
	 */

	s = splbio();
	simple_lock(&vnode_free_list_slock);

	if (wantfreevnodes && freevnodes < wantfreevnodes) {
		vp = NULL;
	} else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
		/*
		 * XXX: this is only here to be backwards compatible
		 */
		vp = NULL;
	} else for (count = 0; count < freevnodes; count++) {
		vp = TAILQ_FIRST(&vnode_free_list);
		if (vp == NULL || vp->v_usecount)
			panic("getnewvnode: free vnode isn't");
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		/*
		 * Don't recycle if active in the namecache or
		 * if it still has cached pages or we cannot get
		 * its interlock.
		 */
		object = vp->v_object;
		if (LIST_FIRST(&vp->v_cache_src) != NULL ||
		    (object && (object->resident_page_count ||
		     object->ref_count)) ||
		    !simple_lock_try(&vp->v_interlock)) {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			vp = NULL;
			continue;
		}
		/*
		 * Skip over it if its filesystem is being suspended.
		 */
		if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0)
			break;
		simple_unlock(&vp->v_interlock);
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		vp = NULL;
	}
	if (vp) {
		vp->v_flag |= VDOOMED;
		freevnodes--;
		simple_unlock(&vnode_free_list_slock);
		cache_purge(vp);
		vp->v_lease = NULL;
		if (vp->v_type != VBAD) {
			vgonel(vp, p);
		} else {
			simple_unlock(&vp->v_interlock);
		}
		vn_finished_write(vnmp);

#ifdef INVARIANTS
		{
			int s;

			if (vp->v_data)
				panic("cleaned vnode isn't");
			s = splbio();
			if (vp->v_numoutput)
				panic("Clean vnode has pending I/O's");
			splx(s);
			if (vp->v_writecount != 0)
				panic("Non-zero write count");
		}
#endif
		vp->v_flag = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
	} else {
		simple_unlock(&vnode_free_list_slock);
		vp = (struct vnode *) zalloc(vnode_zone);
		bzero((char *) vp, sizeof *vp);
		simple_lock_init(&vp->v_interlock);
		vp->v_dd = vp;
		cache_purge(vp);
		LIST_INIT(&vp->v_cache_src);
		TAILQ_INIT(&vp->v_cache_dst);
		numvnodes++;
	}

	TAILQ_INIT(&vp->v_cleanblkhd);
	TAILQ_INIT(&vp->v_dirtyblkhd);
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	splx(s);

	vfs_object_create(vp, p, p->p_ucred);
	return (0);
}

/*
 * Move a vnode from one mount queue to another.
 */
static void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL) {
		simple_unlock(&mntvnode_slock);
		return;
	}
	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp)) {
		vp->v_numoutput--;
		if (vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t) &vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;
	vm_object_t object;

	if (flags & V_SAVE) {
		s = splbio();
		while (vp->v_numoutput) {
			vp->v_flag |= VBWAIT;
			error = tsleep((caddr_t)&vp->v_numoutput,
			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
			if (error) {
				splx(s);
				return (error);
			}
		}
		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
			splx(s);
			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
				return (error);
			s = splbio();
			if (vp->v_numoutput > 0 ||
			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
				panic("vinvalbuf: dirty bufs");
		}
		splx(s);
	}
	s = splbio();
	for (;;) {
		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
		if (!blist)
			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
				error = BUF_TIMELOCK(bp,
				    LK_EXCLUSIVE | LK_SLEEPFAIL,
				    "vinvalbuf", slpflag, slptimeo);
				if (error == ENOLCK)
					break;
				splx(s);
				return (error);
			}
			/*
			 * XXX Since there are no node locks for NFS, I
			 * believe there is a slight chance that a delayed
			 * write will occur while sleeping just above, so
			 * check for it. Note that vfs_bio_awrite expects
			 * buffers to reside on a queue, while VOP_BWRITE and
			 * brelse do not.
			 */
			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
			    (flags & V_SAVE)) {

				if (bp->b_vp == vp) {
					if (bp->b_flags & B_CLUSTEROK) {
						BUF_UNLOCK(bp);
						vfs_bio_awrite(bp);
					} else {
						bremfree(bp);
						bp->b_flags |= B_ASYNC;
						BUF_WRITE(bp);
					}
				} else {
					bremfree(bp);
					(void) BUF_WRITE(bp);
				}
				break;
			}
			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
	}

	splx(s);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	simple_lock(&vp->v_interlock);
	object = vp->v_object;
	if (object != NULL) {
		vm_object_page_remove(object, 0, 0,
		    (flags & V_SAVE) ? TRUE : FALSE);
	}
	simple_unlock(&vp->v_interlock);

	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
		panic("vinvalbuf: flush failed");
	return (0);
}
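
/*
 * Illustrative sketch (mirrors the caller in vclean() below): on reclaim,
 * buffers are first flushed with V_SAVE so dirty data reaches the disk,
 * and only discarded outright if that fails:
 *
 *	if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0)
 *		vinvalbuf(vp, 0, NOCRED, p, 0, 0);
 */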

/*
 * Truncate a file's buffer and pages to a specified length. This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 */
int
vtruncbuf(vp, cred, p, length, blksize)
	register struct vnode *vp;
	struct ucred *cred;
	struct proc *p;
	off_t length;
	int blksize;
{
	register struct buf *bp;
	struct buf *nbp;
	int s, anyfreed;
	int trunclbn;

	/*
	 * Round up to the *next* lbn.
	 */
	trunclbn = (length + blksize - 1) / blksize;

	s = splbio();
restart:
	anyfreed = 1;
	for (;anyfreed;) {
		anyfreed = 0;
		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp &&
				    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
				     (nbp->b_vp != vp) ||
				     (nbp->b_flags & B_DELWRI))) {
					goto restart;
				}
			}
		}

		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp &&
				    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
				     (nbp->b_vp != vp) ||
				     (nbp->b_flags & B_DELWRI) == 0)) {
					goto restart;
				}
			}
		}
	}

	if (length > 0) {
restartsync:
		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					if (bp->b_vp == vp) {
						bp->b_flags |= B_ASYNC;
					} else {
						bp->b_flags &= ~B_ASYNC;
					}
					BUF_WRITE(bp);
				}
				goto restartsync;
			}

		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
	}

	splx(s);

	vnode_pager_setsize(vp, length);

	return (0);
}

/*
 * Associate a buffer with a vnode.
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
	int s;

	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));

	vhold(vp);
	bp->b_vp = vp;
	bp->b_dev = vn_todev(vp);
	/*
	 * Insert onto list for new vnode.
	 */
	s = splbio();
	bp->b_xflags |= BX_VNCLEAN;
	bp->b_xflags &= ~BX_VNDIRTY;
	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
	splx(s);
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;
	struct buflists *listheadp;
	int s;

	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;
	s = splbio();
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			listheadp = &vp->v_dirtyblkhd;
		else
			listheadp = &vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
	}
	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}
	splx(s);
	bp->b_vp = (struct vnode *) 0;
	vdrop(vp);
}

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed. To realize this,
 * we append vnodes to a "workitem" queue. When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds. Thus, metadata updates for
 * filesystems mounted on block devices are delayed only about half the
 * time that file data is delayed. Similarly, directory updates are
 * more critical, so they are delayed only about a third of the time
 * that file data is delayed. Thus, there are SYNCER_MAXDELAY queues
 * that are processed round-robin at a rate of one each second (driven
 * off the filesystem syncer process). The syncer_delayno variable
 * indicates the next queue that is to be processed. Items that need
 * to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */

/*
 * Add an item to the syncer work queue.
 */
static void
vn_syncer_add_to_worklist(struct vnode *vp, int delay)
{
	int s, slot;

	s = splbio();

	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
	}

	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	slot = (syncer_delayno + delay) & syncer_mask;

	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
	vp->v_flag |= VONWORKLST;
	splx(s);
}
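
/*
 * Worked example of the slot arithmetic above: with SYNCER_MAXDELAY 32,
 * hashinit() rounds the table to a power of two, so syncer_mask is 31.
 * A vnode added with delay 15 while syncer_delayno is 20 lands in slot
 * (20 + 15) & 31 == 3; the queue is circular, so that slot is visited
 * 15 iterations (about 15 seconds) later.
 */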

struct proc *updateproc;
static void sched_sync __P((void));
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)

/*
 * System filesystem synchronizer daemon.
 */
void
sched_sync(void)
{
	struct synclist *slp;
	struct vnode *vp;
	struct mount *mp;
	long starttime;
	int s;
	struct proc *p = updateproc;

	mtx_enter(&Giant, MTX_DEF);

	EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, p,
	    SHUTDOWN_PRI_LAST);

	for (;;) {
		kproc_suspend_loop(p);

		starttime = time_second;

		/*
		 * Push files whose dirty time has expired. Be careful
		 * of interrupt race on slp queue.
		 */
		s = splbio();
		slp = &syncer_workitem_pending[syncer_delayno];
		syncer_delayno += 1;
		if (syncer_delayno == syncer_maxdelay)
			syncer_delayno = 0;
		splx(s);

		while ((vp = LIST_FIRST(slp)) != NULL) {
			if (VOP_ISLOCKED(vp, NULL) == 0 &&
			    vn_start_write(vp, &mp, V_NOWAIT) == 0) {
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
				(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
				VOP_UNLOCK(vp, 0, p);
				vn_finished_write(mp);
			}
			s = splbio();
			if (LIST_FIRST(slp) == vp) {
				/*
				 * Note: v_tag VT_VFS vps can remain on the
				 * worklist too with no dirty blocks, but
				 * since sync_fsync() moves it to a different
				 * slot we are safe.
				 */
				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
				    !vn_isdisk(vp, NULL))
					panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
				/*
				 * Put us back on the worklist. The worklist
				 * routine will remove us from our current
				 * position and then add us back in at a later
				 * position.
				 */
				vn_syncer_add_to_worklist(vp, syncdelay);
			}
			splx(s);
		}

		/*
		 * Do soft update processing.
		 */
#ifdef SOFTUPDATES
		softdep_process_worklist(NULL);
#endif

		/*
		 * The variable rushjob allows the kernel to speed up the
		 * processing of the filesystem syncer process. A rushjob
		 * value of N tells the filesystem syncer to process the next
		 * N seconds worth of work on its queue ASAP. Currently rushjob
		 * is used by the soft update code to speed up the filesystem
		 * syncer process when the incore state is getting so far
		 * ahead of the disk that the kernel memory pool is being
		 * threatened with exhaustion.
		 */
		if (rushjob > 0) {
			rushjob -= 1;
			continue;
		}
		/*
		 * If it has taken us less than a second to process the
		 * current work, then wait. Otherwise start right over
		 * again. We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (time_second == starttime)
			tsleep(&lbolt, PPAUSE, "syncer", 0);
	}
}

/*
 * Request the syncer daemon to speed up its work.
 * We never push it to speed up more than half of its
 * normal turn time, otherwise it could take over the cpu.
 */
int
speedup_syncer()
{
	int s;

	s = splhigh();
	if (updateproc->p_wchan == &lbolt)
		setrunnable(updateproc);
	splx(s);
	if (rushjob < syncdelay / 2) {
		rushjob += 1;
		stat_rush_requests += 1;
		return (1);
	}
	return (0);
}
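
/*
 * Illustrative sketch (assumed caller, per the rushjob comment in
 * sched_sync() above): the soft updates code requests faster syncer
 * turns when its in-core state runs too far ahead of the disk
 * (softdep_backlog and softdep_limit are hypothetical names):
 *
 *	if (softdep_backlog > softdep_limit / 2)
 *		(void) speedup_syncer();
 *
 * A return of 1 indicates that rushjob was actually incremented.
 */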

/*
 * Associate a p-buffer with a vnode.
 *
 * Also sets B_PAGING flag to indicate that vnode is not fully associated
 * with the buffer, i.e. the bp has not been linked into the vnode or
 * ref-counted.
 */
void
pbgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{

	KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));

	bp->b_vp = vp;
	bp->b_flags |= B_PAGING;
	bp->b_dev = vn_todev(vp);
}

/*
 * Disassociate a p-buffer from a vnode.
 */
void
pbrelvp(bp)
	register struct buf *bp;
{

	KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));

	/* XXX REMOVE ME */
	if (bp->b_vnbufs.tqe_next != NULL) {
		panic(
		    "relpbuf(): b_vp was probably reassignbuf()d %p %x",
		    bp,
		    (int)bp->b_flags
		);
	}
	bp->b_vp = (struct vnode *) 0;
	bp->b_flags &= ~B_PAGING;
}

void
pbreassignbuf(bp, newvp)
	struct buf *bp;
	struct vnode *newvp;
{
	if ((bp->b_flags & B_PAGING) == 0) {
		panic(
		    "pbreassignbuf() on non phys bp %p",
		    bp
		);
	}
	bp->b_vp = newvp;
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
void
reassignbuf(bp, newvp)
	register struct buf *bp;
	register struct vnode *newvp;
{
	struct buflists *listheadp;
	int delay;
	int s;

	if (newvp == NULL) {
		printf("reassignbuf: NULL");
		return;
	}
	++reassignbufcalls;

	/*
	 * B_PAGING flagged buffers cannot be reassigned because their vp
	 * is not fully linked in.
	 */
	if (bp->b_flags & B_PAGING)
		panic("cannot reassign paging buffer");

	s = splbio();
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
		if (bp->b_xflags & BX_VNDIRTY)
			listheadp = &bp->b_vp->v_dirtyblkhd;
		else
			listheadp = &bp->b_vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
		if (bp->b_vp != newvp) {
			vdrop(bp->b_vp);
			bp->b_vp = NULL;	/* for clarification */
		}
	}
	/*
	 * If dirty, put on list of dirty buffers; otherwise insert onto list
	 * of clean buffers.
	 */
	if (bp->b_flags & B_DELWRI) {
		struct buf *tbp;

		listheadp = &newvp->v_dirtyblkhd;
		if ((newvp->v_flag & VONWORKLST) == 0) {
			switch (newvp->v_type) {
			case VDIR:
				delay = dirdelay;
				break;
			case VCHR:
			case VBLK:
				if (newvp->v_specmountpoint != NULL) {
					delay = metadelay;
					break;
				}
				/* fall through */
			default:
				delay = filedelay;
			}
			vn_syncer_add_to_worklist(newvp, delay);
		}
		bp->b_xflags |= BX_VNDIRTY;
		tbp = TAILQ_FIRST(listheadp);
		if (tbp == NULL ||
		    bp->b_lblkno == 0 ||
		    (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
		    (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
			TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (bp->b_lblkno < 0) {
			TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
			++reassignbufsortgood;
		} else if (reassignbufmethod == 1) {
			/*
			 * New sorting algorithm, only handle sequential case,
			 * otherwise append to end (but before metadata)
			 */
			if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
			    (tbp->b_xflags & BX_VNDIRTY)) {
				/*
				 * Found the best place to insert the buffer
				 */
				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
				++reassignbufsortgood;
			} else {
				/*
				 * Missed, append to end, but before meta-data.
				 * We know that the head buffer in the list is
				 * not meta-data due to prior conditionals.
				 *
				 * Indirect effects: NFS second stage write
				 * tends to wind up here, giving maximum
				 * distance between the unstable write and the
				 * commit rpc.
				 */
				tbp = TAILQ_LAST(listheadp, buflists);
				while (tbp && tbp->b_lblkno < 0)
					tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
				++reassignbufsortbad;
			}
		} else {
			/*
			 * Old sorting algorithm, scan queue and insert
			 */
			struct buf *ttbp;
			while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
			    (ttbp->b_lblkno < bp->b_lblkno)) {
				++reassignbufloops;
				tbp = ttbp;
			}
			TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
		}
	} else {
		bp->b_xflags |= BX_VNCLEAN;
		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
		if ((newvp->v_flag & VONWORKLST) &&
		    TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
			newvp->v_flag &= ~VONWORKLST;
			LIST_REMOVE(newvp, v_synclist);
		}
	}
	if (bp->b_vp != newvp) {
		bp->b_vp = newvp;
		vhold(bp->b_vp);
	}
	splx(s);
}

/*
 * Create a vnode for a block device.
 * Used for mounting the root file system.
 * XXX: This is now changed to a VCHR due to the block/char merging.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (ENXIO);
	}
	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = VCHR;
	addalias(vp, dev);
	*vpp = vp;
	return (0);
}

/*
 * Add vnode to the alias list hung off the dev_t.
 *
 * The reason for this gunk is that multiple vnodes can reference
 * the same physical device, so checking vp->v_usecount to see
 * how many users there are is inadequate; the v_usecount for
 * the vnodes need to be accumulated. vcount() does that.
 */
struct vnode *
addaliasu(nvp, nvp_rdev)
	struct vnode *nvp;
	udev_t nvp_rdev;
{
	struct vnode *ovp;
	vop_t **ops;
	dev_t dev;

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		panic("addaliasu on non-special vnode");
	dev = udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0);
	/*
	 * Check to see if we have a bdevvp vnode with no associated
	 * filesystem. If so, we want to associate the filesystem of
	 * the newly created vnode with the bdevvp vnode and
	 * discard the newly created vnode rather than leaving the
	 * bdevvp vnode lying around with no associated filesystem.
	 */
	if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
		addalias(nvp, dev);
		return (nvp);
	}
	/*
	 * Discard unneeded vnode, but save its node specific data.
	 * Note that if there is a lock, it is carried over in the
	 * node specific data to the replacement vnode.
	 */
	vref(ovp);
	ovp->v_data = nvp->v_data;
	ovp->v_tag = nvp->v_tag;
	nvp->v_data = NULL;
	ops = nvp->v_op;
	nvp->v_op = ovp->v_op;
	ovp->v_op = ops;
	insmntque(ovp, nvp->v_mount);
	vrele(nvp);
	vgone(nvp);
	return (ovp);
}

void
addalias(nvp, dev)
	struct vnode *nvp;
	dev_t dev;
{

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		panic("addalias on non-special vnode");

	nvp->v_rdev = dev;
	simple_lock(&spechash_slock);
	SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
	simple_unlock(&spechash_slock);
}
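
/*
 * Illustrative sketch (assumed caller): the root filesystem's mountroot
 * code typically creates the device vnode it mounts with bdevvp(), e.g.
 * something like
 *
 *	if (bdevvp(rootdev, &rootvp))
 *		panic("ffs_mountroot: can't setup rootvp");
 *
 * Any later vnode created for the same device hangs off dev->si_hlist
 * as an alias, which is what lets vcount() below total the references.
 */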

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. The vnode lock bit is set if the
 * vnode is being eliminated in vgone. The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, flags, p)
	register struct vnode *vp;
	int flags;
	struct proc *p;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure. Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */
	if ((flags & LK_INTERLOCK) == 0) {
		simple_lock(&vp->v_interlock);
	}
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vget", 0);
		return (ENOENT);
	}

	vp->v_usecount++;

	if (VSHOULDBUSY(vp))
		vbusy(vp);
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active. We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			simple_lock(&vp->v_interlock);
			vp->v_usecount--;
			if (VSHOULDFREE(vp))
				vfree(vp);
			simple_unlock(&vp->v_interlock);
		}
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

void
vref(struct vnode *vp)
{
	simple_lock(&vp->v_interlock);
	vp->v_usecount++;
	simple_unlock(&vp->v_interlock);
}

/*
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vrele: null vp"));
	KASSERT(vp->v_writecount < vp->v_usecount, ("vrele: missed vn_close"));

	simple_lock(&vp->v_interlock);

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		simple_unlock(&vp->v_interlock);

		return;
	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
		/*
		 * If we are doing a vput, the node is already locked, and we must
		 * call VOP_INACTIVE with the node locked. So, in the case of
		 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
		 */
		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
			VOP_INACTIVE(vp, p);
		}

	} else {
#ifdef DIAGNOSTIC
		vprint("vrele: negative ref count", vp);
		simple_unlock(&vp->v_interlock);
#endif
		panic("vrele: negative ref cnt");
	}
}

void
vput(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vput: null vp"));
	KASSERT(vp->v_writecount < vp->v_usecount, ("vput: missed vn_close"));

	simple_lock(&vp->v_interlock);

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		VOP_UNLOCK(vp, LK_INTERLOCK, p);
		return;

	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
		/*
		 * If we are doing a vput, the node is already locked, and we must
		 * call VOP_INACTIVE with the node locked. So, in the case of
		 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
		 */
		simple_unlock(&vp->v_interlock);
		VOP_INACTIVE(vp, p);

	} else {
#ifdef DIAGNOSTIC
		vprint("vput: negative ref count", vp);
#endif
		panic("vput: negative ref cnt");
	}
}
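
/*
 * Illustrative sketch (assumed caller pattern): the usual lifetime of a
 * reference is vget() with a lock type, use, then vput(), which unlocks
 * and releases in one step; vrele() is the counterpart for a vnode that
 * is not locked:
 *
 *	if ((error = vget(vp, LK_EXCLUSIVE, p)) != 0)
 *		return (error);
 *	... operate on the locked vnode ...
 *	vput(vp);
 */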

/*
 * Somebody doesn't want the vnode recycled.
 */
void
vhold(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	vp->v_holdcnt++;
	if (VSHOULDBUSY(vp))
		vbusy(vp);
	splx(s);
}

/*
 * One less who cares about this vnode.
 */
void
vdrop(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	if (vp->v_holdcnt <= 0)
		panic("vdrop: holdcnt");
	vp->v_holdcnt--;
	if (VSHOULDFREE(vp))
		vfree(vp);
	splx(s);
}

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones;
 * return an error if any are found (nb: this is a user error, not a
 * system error). If MNT_FORCE is specified, detach any active vnodes
 * that are found.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (vp->v_mount != mp)
			goto loop;
		nvp = LIST_NEXT(vp, v_mntvnodes);
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;

		simple_lock(&vp->v_interlock);
		/*
		 * Skip over vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file vnodes
		 * open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}

		/*
		 * With v_usecount == 0, all we need to do is clear out the
		 * vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}

		/*
		 * If FORCECLOSE is set, forcibly close the vnode. For block
		 * or character devices, revert to an anonymous device. For
		 * all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *) 0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DIAGNOSTIC
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
 */
static void
vclean(vp, flags, p)
	struct vnode *vp;
	int flags;
	struct proc *p;
{
	int active;
	vm_object_t obj;

	/*
	 * Check to see if the vnode is in use. If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	if ((active = vp->v_usecount))
		vp->v_usecount++;

	/*
	 * Prevent the vnode from being recycled or brought into use while we
	 * clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock");
	vp->v_flag |= VXLOCK;
	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);

	/*
	 * Clean out any buffers associated with the vnode.
	 * If the flush fails, just toss the buffers.
	 */
	if (flags & DOCLOSE) {
		if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL)
			(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
		if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0)
			vinvalbuf(vp, 0, NOCRED, p, 0, 0);
	}

	if ((obj = vp->v_object) != NULL) {
		if (obj->ref_count == 0) {
			/*
			 * vclean() may be called twice. The first time
			 * removes the primary reference to the object,
			 * the second time goes one further and is a
			 * special-case to terminate the object.
			 */
			vm_object_terminate(obj);
		} else {
			/*
			 * Woe to the process that tries to page now :-).
			 */
			vm_pager_deallocate(obj);
		}
	}

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0, p);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim");

	if (active) {
		/*
		 * Inline copy of vrele() since VOP_INACTIVE
		 * has already been called.
		 */
		simple_lock(&vp->v_interlock);
		if (--vp->v_usecount <= 0) {
#ifdef DIAGNOSTIC
			if (vp->v_usecount < 0 || vp->v_writecount != 0) {
				vprint("vclean: bad ref count", vp);
				panic("vclean: ref cnt");
			}
#endif
			vfree(vp);
		}
		simple_unlock(&vp->v_interlock);
	}

	cache_purge(vp);
	if (vp->v_vnlock) {
		FREE(vp->v_vnlock, M_VNODE);
		vp->v_vnlock = NULL;
	}

	if (VSHOULDFREE(vp))
		vfree(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vn_pollgone(vp);
	vp->v_tag = VT_NON;
	vp->v_flag &= ~VXLOCK;
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		wakeup((caddr_t) vp);
	}
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
int
vop_revoke(ap)
	struct vop_revoke_args /* {
		struct vnode *a_vp;
		int a_flags;
	} */ *ap;
{
	struct vnode *vp, *vq;
	dev_t dev;

	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));

	vp = ap->a_vp;
	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
		return (0);
	}
	dev = vp->v_rdev;
	for (;;) {
		simple_lock(&spechash_slock);
		vq = SLIST_FIRST(&dev->si_hlist);
		simple_unlock(&spechash_slock);
		if (!vq)
			break;
		vgone(vq);
	}
	return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct simplelock *inter_lkp;
	struct proc *p;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		if (inter_lkp) {
			simple_unlock(inter_lkp);
		}
		vgonel(vp, p);
		return (1);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	simple_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 */
void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	int s;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vgone", 0);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 */
	vclean(vp, DOCLOSE, p);
	simple_lock(&vp->v_interlock);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) {
		simple_lock(&spechash_slock);
		SLIST_REMOVE(&vp->v_hashchain, vp, vnode, v_specnext);
		freedev(vp->v_rdev);
		simple_unlock(&spechash_slock);
		vp->v_rdev = NULL;
	}

	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list. The test of the
	 * VDOOMED flag and the reference count of zero is because
	 * it will be removed from the free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 */
	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
		s = splbio();
		simple_lock(&vnode_free_list_slock);
		if (vp->v_flag & VFREE)
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		else
			freevnodes++;
		vp->v_flag |= VFREE;
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
		splx(s);
	}

	vp->v_type = VBAD;
	simple_unlock(&vp->v_interlock);
}

/*
 * Lookup a vnode by device number.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	struct vnode *vp;

	simple_lock(&spechash_slock);
	SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
		if (type == vp->v_type) {
			*vpp = vp;
			simple_unlock(&spechash_slock);
			return (1);
		}
	}
	simple_unlock(&spechash_slock);
	return (0);
}

/*
 * Calculate the total number of references to a special device.
 */
int
vcount(vp)
	struct vnode *vp;
{
	struct vnode *vq;
	int count;

	count = 0;
	simple_lock(&spechash_slock);
	SLIST_FOREACH(vq, &vp->v_hashchain, v_specnext)
		count += vq->v_usecount;
	simple_unlock(&spechash_slock);
	return (count);
}

/*
 * Same as above, but using the dev_t as argument
 */
int
count_dev(dev)
	dev_t dev;
{
	struct vnode *vp;

	vp = SLIST_FIRST(&dev->si_hlist);
	if (vp == NULL)
		return (0);
	return (vcount(vp));
}

/*
 * Print out a description of a vnode.
 */
static char *typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};

void
vprint(label, vp)
	char *label;
	struct vnode *vp;
{
	char buf[96];

	if (label != NULL)
		printf("%s: %p: ", label, (void *)vp);
	else
		printf("%p: ", (void *)vp);
	printf("type %s, usecount %d, writecount %d, refcount %d,",
	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
	    vp->v_holdcnt);
	buf[0] = '\0';
	if (vp->v_flag & VROOT)
		strcat(buf, "|VROOT");
	if (vp->v_flag & VTEXT)
		strcat(buf, "|VTEXT");
	if (vp->v_flag & VSYSTEM)
		strcat(buf, "|VSYSTEM");
	if (vp->v_flag & VXLOCK)
		strcat(buf, "|VXLOCK");
	if (vp->v_flag & VXWANT)
		strcat(buf, "|VXWANT");
	if (vp->v_flag & VBWAIT)
		strcat(buf, "|VBWAIT");
	if (vp->v_flag & VDOOMED)
		strcat(buf, "|VDOOMED");
	if (vp->v_flag & VFREE)
		strcat(buf, "|VFREE");
	if (vp->v_flag & VOBJBUF)
		strcat(buf, "|VOBJBUF");
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DDB
#include <ddb/ddb.h>
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
{
	struct proc *p = curproc;	/* XXX */
	struct mount *mp, *nmp;
	struct vnode *vp;

	printf("Locked vnodes\n");
	simple_lock(&mountlist_slock);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
			if (VOP_ISLOCKED(vp, NULL))
				vprint((char *)0, vp);
		}
		simple_lock(&mountlist_slock);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);
}
#endif

/*
 * Top level filesystem related information gathering.
 */
static int	sysctl_ovfs_conf __P((SYSCTL_HANDLER_ARGS));

static int
vfs_sysctl(SYSCTL_HANDLER_ARGS)
{
	int *name = (int *)arg1 - 1;	/* XXX */
	u_int namelen = arg2 + 1;	/* XXX */
	struct vfsconf *vfsp;

#if 1 || defined(COMPAT_PRELITE2)
	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
	if (namelen == 1)
		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
#endif

#ifdef notyet
	/* all sysctl names at this level are at least name and field */
	if (namelen < 2)
		return (ENOTDIR);	/* overloaded */
	if (name[0] != VFS_GENERIC) {
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[0])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
		    oldp, oldlenp, newp, newlen, p));
	}
#endif
	switch (name[1]) {
	case VFS_MAXTYPENUM:
		if (namelen != 2)
			return (ENOTDIR);
		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
	case VFS_CONF:
		if (namelen != 3)
			return (ENOTDIR);	/* overloaded */
		for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
			if (vfsp->vfc_typenum == name[2])
				break;
		if (vfsp == NULL)
			return (EOPNOTSUPP);
		return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
	}
	return (EOPNOTSUPP);
}

SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
    "Generic filesystem");

#if 1 || defined(COMPAT_PRELITE2)

static int
sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct vfsconf *vfsp;
	struct ovfsconf ovfs;

	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
		strcpy(ovfs.vfc_name, vfsp->vfc_name);
		ovfs.vfc_index = vfsp->vfc_typenum;
		ovfs.vfc_refcount = vfsp->vfc_refcount;
		ovfs.vfc_flags = vfsp->vfc_flags;
		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
		if (error)
			return error;
	}
	return 0;
}

#endif /* 1 || COMPAT_PRELITE2 */

#if 0
#define KINFO_VNODESLOP	10
/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
/* ARGSUSED */
static int
sysctl_vnode(SYSCTL_HANDLER_ARGS)
{
	struct proc *p = curproc;	/* XXX */
	struct mount *mp, *nmp;
	struct vnode *nvp, *vp;
	int error;

#define VPTRSZ	sizeof (struct vnode *)
#define VNODESZ	sizeof (struct vnode)

	req->lock = 0;
	if (!req->oldptr) /* Make an estimate */
		return (SYSCTL_OUT(req, 0,
		    (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));

	simple_lock(&mountlist_slock);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
again:
		simple_lock(&mntvnode_slock);
		for (vp = LIST_FIRST(&mp->mnt_vnodelist);
		     vp != NULL;
		     vp = nvp) {
			/*
			 * Check that the vp is still associated with
			 * this filesystem. RACE: could have been
			 * recycled onto the same filesystem.
			 */
2167 */ 2168 if (vp->v_mount != mp) { 2169 simple_unlock(&mntvnode_slock); 2170 goto again; 2171 } 2172 nvp = LIST_NEXT(vp, v_mntvnodes); 2173 simple_unlock(&mntvnode_slock); 2174 if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || 2175 (error = SYSCTL_OUT(req, vp, VNODESZ))) 2176 return (error); 2177 simple_lock(&mntvnode_slock); 2178 } 2179 simple_unlock(&mntvnode_slock); 2180 simple_lock(&mountlist_slock); 2181 nmp = TAILQ_NEXT(mp, mnt_list); 2182 vfs_unbusy(mp, p); 2183 } 2184 simple_unlock(&mountlist_slock); 2185 2186 return (0); 2187} 2188#endif 2189 2190/* 2191 * XXX 2192 * Exporting the vnode list on large systems causes them to crash. 2193 * Exporting the vnode list on medium systems causes sysctl to coredump. 2194 */ 2195#if 0 2196SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 2197 0, 0, sysctl_vnode, "S,vnode", ""); 2198#endif 2199 2200/* 2201 * Check to see if a filesystem is mounted on a block device. 2202 */ 2203int 2204vfs_mountedon(vp) 2205 struct vnode *vp; 2206{ 2207 2208 if (vp->v_specmountpoint != NULL) 2209 return (EBUSY); 2210 return (0); 2211} 2212 2213/* 2214 * Unmount all filesystems. The list is traversed in reverse order 2215 * of mounting to avoid dependencies. 2216 */ 2217void 2218vfs_unmountall() 2219{ 2220 struct mount *mp; 2221 struct proc *p; 2222 int error; 2223 2224 if (curproc != NULL) 2225 p = curproc; 2226 else 2227 p = initproc; /* XXX XXX should this be proc0? */ 2228 /* 2229 * Since this only runs when rebooting, it is not interlocked. 2230 */ 2231 while(!TAILQ_EMPTY(&mountlist)) { 2232 mp = TAILQ_LAST(&mountlist, mntlist); 2233 error = dounmount(mp, MNT_FORCE, p); 2234 if (error) { 2235 TAILQ_REMOVE(&mountlist, mp, mnt_list); 2236 printf("unmount of %s failed (", 2237 mp->mnt_stat.f_mntonname); 2238 if (error == EBUSY) 2239 printf("BUSY)\n"); 2240 else 2241 printf("%d)\n", error); 2242 } else { 2243 /* The unmount has removed mp from the mountlist */ 2244 } 2245 } 2246} 2247 2248/* 2249 * Build hash lists of net addresses and hang them off the mount point. 2250 * Called by ufs_mount() to set up the lists of export addresses. 
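 *
 * A minimal sketch of the caller's side, assuming a read-only,
 * single-host NFS export roughly as mountd(8) would build it (the
 * address literal is hypothetical):
 *
 *	struct export_args ea;
 *	struct sockaddr_in sin;
 *
 *	bzero(&ea, sizeof(ea));
 *	bzero(&sin, sizeof(sin));
 *	sin.sin_len = sizeof(sin);
 *	sin.sin_family = AF_INET;
 *	sin.sin_addr.s_addr = inet_addr("192.0.2.5");
 *	ea.ex_flags = MNT_EXPORTED | MNT_EXRDONLY;
 *	ea.ex_addr = (struct sockaddr *)&sin;
 *	ea.ex_addrlen = sizeof(sin);	/* ex_masklen == 0: host, not net */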
2251 */
2252static int
2253vfs_hang_addrlist(mp, nep, argp)
2254	struct mount *mp;
2255	struct netexport *nep;
2256	struct export_args *argp;
2257{
2258	register struct netcred *np;
2259	register struct radix_node_head *rnh;
2260	register int i;
2261	struct radix_node *rn;
2262	struct sockaddr *saddr, *smask = 0;
2263	struct domain *dom;
2264	int error;
2265
2266	if (argp->ex_addrlen == 0) {
2267		if (mp->mnt_flag & MNT_DEFEXPORTED)
2268			return (EPERM);
2269		np = &nep->ne_defexported;
2270		np->netc_exflags = argp->ex_flags;
2271		np->netc_anon = argp->ex_anon;
2272		np->netc_anon.cr_ref = 1;
2273		mp->mnt_flag |= MNT_DEFEXPORTED;
2274		return (0);
2275	}
2276	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
2277	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
2278	bzero((caddr_t) np, i);
2279	saddr = (struct sockaddr *) (np + 1);
2280	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
2281		goto out;
2282	if (saddr->sa_len > argp->ex_addrlen)
2283		saddr->sa_len = argp->ex_addrlen;
2284	if (argp->ex_masklen) {
2285		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
2286		error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
2287		if (error)
2288			goto out;
2289		if (smask->sa_len > argp->ex_masklen)
2290			smask->sa_len = argp->ex_masklen;
2291	}
2292	i = saddr->sa_family;
2293	if ((rnh = nep->ne_rtable[i]) == 0) {
2294		/*
2295		 * Seems silly to initialize every AF when most are not used;
2296		 * do so on demand here.
2297		 */
2298		for (dom = domains; dom; dom = dom->dom_next)
2299			if (dom->dom_family == i && dom->dom_rtattach) {
2300				dom->dom_rtattach((void **) &nep->ne_rtable[i],
2301				    dom->dom_rtoffset);
2302				break;
2303			}
2304		if ((rnh = nep->ne_rtable[i]) == 0) {
2305			error = ENOBUFS;
2306			goto out;
2307		}
2308	}
2309	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
2310	    np->netc_rnodes);
2311	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
2312		error = EPERM;
2313		goto out;
2314	}
2315	np->netc_exflags = argp->ex_flags;
2316	np->netc_anon = argp->ex_anon;
2317	np->netc_anon.cr_ref = 1;
2318	return (0);
2319out:
2320	free(np, M_NETADDR);
2321	return (error);
2322}
2323
2324/* ARGSUSED */
2325static int
2326vfs_free_netcred(rn, w)
2327	struct radix_node *rn;
2328	void *w;
2329{
2330	register struct radix_node_head *rnh = (struct radix_node_head *) w;
2331
2332	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
2333	free((caddr_t) rn, M_NETADDR);
2334	return (0);
2335}
2336
2337/*
2338 * Free the net address hash lists that are hanging off the mount points.
2339 */
2340static void
2341vfs_free_addrlist(nep)
2342	struct netexport *nep;
2343{
2344	register int i;
2345	register struct radix_node_head *rnh;
2346
2347	for (i = 0; i <= AF_MAX; i++)
2348		if ((rnh = nep->ne_rtable[i])) {
2349			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
2350			    (caddr_t) rnh);
2351			free((caddr_t) rnh, M_RTABLE);
2352			nep->ne_rtable[i] = 0;
2353		}
2354}
2355
2356int
2357vfs_export(mp, nep, argp)
2358	struct mount *mp;
2359	struct netexport *nep;
2360	struct export_args *argp;
2361{
2362	int error;
2363
2364	if (argp->ex_flags & MNT_DELEXPORT) {
2365		if (mp->mnt_flag & MNT_EXPUBLIC) {
2366			vfs_setpublicfs(NULL, NULL, NULL);
2367			mp->mnt_flag &= ~MNT_EXPUBLIC;
2368		}
2369		vfs_free_addrlist(nep);
2370		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
2371	}
2372	if (argp->ex_flags & MNT_EXPORTED) {
2373		if (argp->ex_flags & MNT_EXPUBLIC) {
2374			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
2375				return (error);
2376			mp->mnt_flag |= MNT_EXPUBLIC;
2377		}
2378		if ((error = vfs_hang_addrlist(mp, nep, argp)))
2379			return (error);
2380		mp->mnt_flag |= MNT_EXPORTED;
2381	}
2382	return (0);
2383}
2384
2385
2386/*
2387 * Set the publicly exported filesystem (WebNFS). Currently, only
2388 * one public filesystem is possible in the spec (RFC 2054 and 2055).
2389 */
2390int
2391vfs_setpublicfs(mp, nep, argp)
2392	struct mount *mp;
2393	struct netexport *nep;
2394	struct export_args *argp;
2395{
2396	int error;
2397	struct vnode *rvp;
2398	char *cp;
2399
2400	/*
2401	 * mp == NULL -> invalidate the current info; the FS is
2402	 * no longer exported. May be called from either vfs_export
2403	 * or unmount, so check if it hasn't already been done.
2404	 */
2405	if (mp == NULL) {
2406		if (nfs_pub.np_valid) {
2407			nfs_pub.np_valid = 0;
2408			if (nfs_pub.np_index != NULL) {
2409				FREE(nfs_pub.np_index, M_TEMP);
2410				nfs_pub.np_index = NULL;
2411			}
2412		}
2413		return (0);
2414	}
2415
2416	/*
2417	 * Only one allowed at a time.
2418	 */
2419	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
2420		return (EBUSY);
2421
2422	/*
2423	 * Get real filehandle for root of exported FS.
2424	 */
2425	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
2426	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
2427
2428	if ((error = VFS_ROOT(mp, &rvp)))
2429		return (error);
2430
2431	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) {
		vput(rvp);
2432		return (error);
	}
2433
2434	vput(rvp);
2435
2436	/*
2437	 * If an indexfile was specified, pull it in.
2438	 */
2439	if (argp->ex_indexfile != NULL) {
2440		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
2441		    M_WAITOK);
2442		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
2443		    MAXNAMLEN, (size_t *)0);
2444		if (!error) {
2445			/*
2446			 * Check for illegal filenames.
2447			 */
2448			for (cp = nfs_pub.np_index; *cp; cp++) {
2449				if (*cp == '/') {
2450					error = EINVAL;
2451					break;
2452				}
2453			}
2454		}
2455		if (error) {
2456			FREE(nfs_pub.np_index, M_TEMP);
			nfs_pub.np_index = NULL;
2457			return (error);
2458		}
2459	}
2460
2461	nfs_pub.np_mount = mp;
2462	nfs_pub.np_valid = 1;
2463	return (0);
2464}
2465
2466struct netcred *
2467vfs_export_lookup(mp, nep, nam)
2468	register struct mount *mp;
2469	struct netexport *nep;
2470	struct sockaddr *nam;
2471{
2472	register struct netcred *np;
2473	register struct radix_node_head *rnh;
2474	struct sockaddr *saddr;
2475
2476	np = NULL;
2477	if (mp->mnt_flag & MNT_EXPORTED) {
2478		/*
2479		 * Lookup in the export list first.
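		 * The per-family radix tree returns the most specific
		 * matching entry; nodes flagged RNF_ROOT are the tree's
		 * internal root nodes rather than real exports, so they
		 * are rejected.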
2480		 */
2481		if (nam != NULL) {
2482			saddr = nam;
2483			rnh = nep->ne_rtable[saddr->sa_family];
2484			if (rnh != NULL) {
2485				np = (struct netcred *)
2486				    (*rnh->rnh_matchaddr)((caddr_t)saddr,
2487				    rnh);
2488				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
2489					np = NULL;
2490			}
2491		}
2492		/*
2493		 * If no address match, use the default if it exists.
2494		 */
2495		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
2496			np = &nep->ne_defexported;
2497	}
2498	return (np);
2499}
2500
2501/*
2502 * Perform msync on all vnodes under a mount point.
2503 * The mount point must be locked.
2504 */
2505void
2506vfs_msync(struct mount *mp, int flags) {
2507	struct vnode *vp, *nvp;
2508	struct vm_object *obj;
2509	int anyio, tries;
2510
2511	tries = 5;
2512loop:
2513	anyio = 0;
2514	for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) {
2515
2516		nvp = LIST_NEXT(vp, v_mntvnodes);
2517
2518		if (vp->v_mount != mp) {
2519			goto loop;
2520		}
2521
2522		if (vp->v_flag & VXLOCK)	/* XXX: what if MNT_WAIT? */
2523			continue;
2524
2525		if (flags != MNT_WAIT) {
2526			obj = vp->v_object;
2527			if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
2528				continue;
2529			if (VOP_ISLOCKED(vp, NULL))
2530				continue;
2531		}
2532
2533		simple_lock(&vp->v_interlock);
2534		if (vp->v_object &&
2535		   (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
2536			if (!vget(vp,
2537				LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
2538				if (vp->v_object) {
2539					vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
2540					anyio = 1;
2541				}
2542				vput(vp);
2543			}
2544		} else {
2545			simple_unlock(&vp->v_interlock);
2546		}
2547	}
2548	if (anyio && (--tries > 0))
2549		goto loop;
2550}
2551
2552/*
2553 * Create the VM object needed for VMIO and mmap support. This
2554 * is done for all VREG files in the system. Some filesystems can
2555 * also gain the additional metadata buffering capability of the
2556 * VMIO code by putting the device node into VMIO mode.
2557 *
2558 * vp must be locked when vfs_object_create is called.
2559 */
2560int
2561vfs_object_create(vp, p, cred)
2562	struct vnode *vp;
2563	struct proc *p;
2564	struct ucred *cred;
2565{
2566	struct vattr vat;
2567	vm_object_t object;
2568	int error = 0;
2569
2570	if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE)
2571		return (0);
2572
2573retry:
2574	if ((object = vp->v_object) == NULL) {
2575		if (vp->v_type == VREG || vp->v_type == VDIR) {
2576			if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
2577				goto retn;
2578			object = vnode_pager_alloc(vp, vat.va_size, 0, 0);
2579		} else if (devsw(vp->v_rdev) != NULL) {
2580			/*
2581			 * This simply allocates the biggest object possible
2582			 * for a disk vnode. This should be fixed, but doesn't
2583			 * cause any problems (yet).
2584			 */
2585			object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0);
2586		} else {
2587			goto retn;
2588		}
2589		/*
2590		 * Dereference the reference we just created. This assumes
2591		 * that the object is associated with the vp.
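		 * (vnode_pager_alloc() hands back the object with a
		 * reference and also bumps v_usecount, so both are
		 * dropped here and the object's lifetime simply
		 * follows the vnode's.)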
2592		 */
2593		object->ref_count--;
2594		vp->v_usecount--;
2595	} else {
2596		if (object->flags & OBJ_DEAD) {
2597			VOP_UNLOCK(vp, 0, p);
2598			tsleep(object, PVM, "vodead", 0);
2599			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
2600			goto retry;
2601		}
2602	}
2603
2604	KASSERT(vp->v_object != NULL, ("vfs_object_create: NULL object"));
2605	vp->v_flag |= VOBJBUF;
2606
2607retn:
2608	return (error);
2609}
2610
2611void
2612vfree(vp)
2613	struct vnode *vp;
2614{
2615	int s;
2616
2617	s = splbio();
2618	simple_lock(&vnode_free_list_slock);
2619	KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free"));
2620	if (vp->v_flag & VAGE) {
2621		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2622	} else {
2623		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
2624	}
2625	freevnodes++;
2626	simple_unlock(&vnode_free_list_slock);
2627	vp->v_flag &= ~VAGE;
2628	vp->v_flag |= VFREE;
2629	splx(s);
2630}
2631
2632void
2633vbusy(vp)
2634	struct vnode *vp;
2635{
2636	int s;
2637
2638	s = splbio();
2639	simple_lock(&vnode_free_list_slock);
2640	KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free"));
2641	TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2642	freevnodes--;
2643	simple_unlock(&vnode_free_list_slock);
2644	vp->v_flag &= ~(VFREE|VAGE);
2645	splx(s);
2646}
2647
2648/*
2649 * Record a process's interest in events which might happen to
2650 * a vnode. Because poll uses the historic select-style interface
2651 * internally, this routine serves as both the ``check for any
2652 * pending events'' and the ``record my interest in future events''
2653 * functions. (These are done together, while the lock is held,
2654 * to avoid race conditions.)
2655 */
2656int
2657vn_pollrecord(vp, p, events)
2658	struct vnode *vp;
2659	struct proc *p;
2660	short events;
2661{
2662	simple_lock(&vp->v_pollinfo.vpi_lock);
2663	if (vp->v_pollinfo.vpi_revents & events) {
2664		/*
2665		 * This leaves events we are not interested
2666		 * in available for the other process which
2667		 * presumably had requested them
2668		 * (otherwise they would never have been
2669		 * recorded).
2670		 */
2671		events &= vp->v_pollinfo.vpi_revents;
2672		vp->v_pollinfo.vpi_revents &= ~events;
2673
2674		simple_unlock(&vp->v_pollinfo.vpi_lock);
2675		return (events);
2676	}
2677	vp->v_pollinfo.vpi_events |= events;
2678	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
2679	simple_unlock(&vp->v_pollinfo.vpi_lock);
2680	return (0);
2681}
2682
2683/*
2684 * Note the occurrence of an event. If the VN_POLLEVENT macro is used,
2685 * it is possible for us to miss an event due to race conditions, but
2686 * that condition is expected to be rare, so for the moment it is the
2687 * preferred interface.
2688 */
2689void
2690vn_pollevent(vp, events)
2691	struct vnode *vp;
2692	short events;
2693{
2694	simple_lock(&vp->v_pollinfo.vpi_lock);
2695	if (vp->v_pollinfo.vpi_events & events) {
2696		/*
2697		 * We clear vpi_events so that we don't
2698		 * call selwakeup() twice if two events are
2699		 * posted before the polling process(es) is
2700		 * awakened. This also ensures that we take at
2701		 * most one selwakeup() if the polling process
2702		 * is no longer interested. However, it does
2703		 * mean that only one event can be noticed at
2704		 * a time. (Perhaps we should only clear those
2705		 * event bits which we note?) XXX
2706		 */
2707		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ???
*/ 2708 vp->v_pollinfo.vpi_revents |= events; 2709 selwakeup(&vp->v_pollinfo.vpi_selinfo); 2710 } 2711 simple_unlock(&vp->v_pollinfo.vpi_lock); 2712} 2713 2714/* 2715 * Wake up anyone polling on vp because it is being revoked. 2716 * This depends on dead_poll() returning POLLHUP for correct 2717 * behavior. 2718 */ 2719void 2720vn_pollgone(vp) 2721 struct vnode *vp; 2722{ 2723 simple_lock(&vp->v_pollinfo.vpi_lock); 2724 if (vp->v_pollinfo.vpi_events) { 2725 vp->v_pollinfo.vpi_events = 0; 2726 selwakeup(&vp->v_pollinfo.vpi_selinfo); 2727 } 2728 simple_unlock(&vp->v_pollinfo.vpi_lock); 2729} 2730 2731 2732 2733/* 2734 * Routine to create and manage a filesystem syncer vnode. 2735 */ 2736#define sync_close ((int (*) __P((struct vop_close_args *)))nullop) 2737static int sync_fsync __P((struct vop_fsync_args *)); 2738static int sync_inactive __P((struct vop_inactive_args *)); 2739static int sync_reclaim __P((struct vop_reclaim_args *)); 2740#define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock) 2741#define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock) 2742static int sync_print __P((struct vop_print_args *)); 2743#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked) 2744 2745static vop_t **sync_vnodeop_p; 2746static struct vnodeopv_entry_desc sync_vnodeop_entries[] = { 2747 { &vop_default_desc, (vop_t *) vop_eopnotsupp }, 2748 { &vop_close_desc, (vop_t *) sync_close }, /* close */ 2749 { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */ 2750 { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */ 2751 { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */ 2752 { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */ 2753 { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */ 2754 { &vop_print_desc, (vop_t *) sync_print }, /* print */ 2755 { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */ 2756 { NULL, NULL } 2757}; 2758static struct vnodeopv_desc sync_vnodeop_opv_desc = 2759 { &sync_vnodeop_p, sync_vnodeop_entries }; 2760 2761VNODEOP_SET(sync_vnodeop_opv_desc); 2762 2763/* 2764 * Create a new filesystem syncer vnode for the specified mount point. 2765 */ 2766int 2767vfs_allocate_syncvnode(mp) 2768 struct mount *mp; 2769{ 2770 struct vnode *vp; 2771 static long start, incr, next; 2772 int error; 2773 2774 /* Allocate a new vnode */ 2775 if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) { 2776 mp->mnt_syncer = NULL; 2777 return (error); 2778 } 2779 vp->v_type = VNON; 2780 /* 2781 * Place the vnode onto the syncer worklist. We attempt to 2782 * scatter them about on the list so that they will go off 2783 * at evenly distributed times even if all the filesystems 2784 * are mounted at once. 2785 */ 2786 next += incr; 2787 if (next == 0 || next > syncer_maxdelay) { 2788 start /= 2; 2789 incr /= 2; 2790 if (start == 0) { 2791 start = syncer_maxdelay / 2; 2792 incr = syncer_maxdelay; 2793 } 2794 next = start; 2795 } 2796 vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); 2797 mp->mnt_syncer = vp; 2798 return (0); 2799} 2800 2801/* 2802 * Do a lazy sync of the filesystem. 2803 */ 2804static int 2805sync_fsync(ap) 2806 struct vop_fsync_args /* { 2807 struct vnode *a_vp; 2808 struct ucred *a_cred; 2809 int a_waitfor; 2810 struct proc *a_p; 2811 } */ *ap; 2812{ 2813 struct vnode *syncvp = ap->a_vp; 2814 struct mount *mp = syncvp->v_mount; 2815 struct proc *p = ap->a_p; 2816 int asyncflag; 2817 2818 /* 2819 * We only need to do something if this is a lazy evaluation. 
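	 * (MNT_LAZY is what the syncer daemon passes; other a_waitfor
	 * values come from callers that flush the filesystem through
	 * other paths, so there is nothing extra for the syncer
	 * vnode to do.)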
2820	 */
2821	if (ap->a_waitfor != MNT_LAZY)
2822		return (0);
2823
2824	/*
2825	 * Move ourselves to the back of the sync list.
2826	 */
2827	vn_syncer_add_to_worklist(syncvp, syncdelay);
2828
2829	/*
2830	 * Walk the list of vnodes pushing all that are dirty and
2831	 * not already on the sync list.
2832	 */
2833	simple_lock(&mountlist_slock);
2834	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
2835		simple_unlock(&mountlist_slock);
2836		return (0);
2837	}
2838	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
2839		vfs_unbusy(mp, p);
2840		simple_unlock(&mountlist_slock);
2841		return (0);
2842	}
2843	asyncflag = mp->mnt_flag & MNT_ASYNC;
2844	mp->mnt_flag &= ~MNT_ASYNC;
2845	vfs_msync(mp, MNT_NOWAIT);
2846	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
2847	if (asyncflag)
2848		mp->mnt_flag |= MNT_ASYNC;
2849	vn_finished_write(mp);
2850	vfs_unbusy(mp, p);
2851	return (0);
2852}
2853
2854/*
2855 * The syncer vnode is no longer referenced.
2856 */
2857static int
2858sync_inactive(ap)
2859	struct vop_inactive_args /* {
2860		struct vnode *a_vp;
2861		struct proc *a_p;
2862	} */ *ap;
2863{
2864
2865	vgone(ap->a_vp);
2866	return (0);
2867}
2868
2869/*
2870 * The syncer vnode is no longer needed and is being decommissioned.
2871 *
2872 * Modifications to the worklist must be protected at splbio().
2873 */
2874static int
2875sync_reclaim(ap)
2876	struct vop_reclaim_args /* {
2877		struct vnode *a_vp;
2878	} */ *ap;
2879{
2880	struct vnode *vp = ap->a_vp;
2881	int s;
2882
2883	s = splbio();
2884	vp->v_mount->mnt_syncer = NULL;
2885	if (vp->v_flag & VONWORKLST) {
2886		LIST_REMOVE(vp, v_synclist);
2887		vp->v_flag &= ~VONWORKLST;
2888	}
2889	splx(s);
2890
2891	return (0);
2892}
2893
2894/*
2895 * Print out a syncer vnode.
2896 */
2897static int
2898sync_print(ap)
2899	struct vop_print_args /* {
2900		struct vnode *a_vp;
2901	} */ *ap;
2902{
2903	struct vnode *vp = ap->a_vp;
2904
2905	printf("syncer vnode");
2906	if (vp->v_vnlock != NULL)
2907		lockmgr_printinfo(vp->v_vnlock);
2908	printf("\n");
2909	return (0);
2910}
2911
2912/*
2913 * Extract the dev_t from a VBLK or VCHR vnode.
2914 */
2915dev_t
2916vn_todev(vp)
2917	struct vnode *vp;
2918{
2919	if (vp->v_type != VBLK && vp->v_type != VCHR)
2920		return (NODEV);
2921	return (vp->v_rdev);
2922}
2923
2924/*
2925 * Check if a vnode represents a disk device.
2926 */
2927int
2928vn_isdisk(vp, errp)
2929	struct vnode *vp;
2930	int *errp;
2931{
2932	struct cdevsw *cdevsw;
2933
2934	if (vp->v_type != VBLK && vp->v_type != VCHR) {
2935		if (errp != NULL)
2936			*errp = ENOTBLK;
2937		return (0);
2938	}
2939	if (vp->v_rdev == NULL) {
2940		if (errp != NULL)
2941			*errp = ENXIO;
2942		return (0);
2943	}
2944	cdevsw = devsw(vp->v_rdev);
2945	if (cdevsw == NULL) {
2946		if (errp != NULL)
2947			*errp = ENXIO;
2948		return (0);
2949	}
2950	if (!(cdevsw->d_flags & D_DISK)) {
2951		if (errp != NULL)
2952			*errp = ENOTBLK;
2953		return (0);
2954	}
2955	if (errp != NULL)
2956		*errp = 0;
2957	return (1);
2958}
2959
2960void
2961NDFREE(ndp, flags)
2962	struct nameidata *ndp;
2963	const uint flags;
2964{
2965	if (!(flags & NDF_NO_FREE_PNBUF) &&
2966	    (ndp->ni_cnd.cn_flags & HASBUF)) {
2967		zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
2968		ndp->ni_cnd.cn_flags &= ~HASBUF;
2969	}
2970	if (!(flags & NDF_NO_DVP_UNLOCK) &&
2971	    (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
2972	    ndp->ni_dvp != ndp->ni_vp)
2973		VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_proc);
2974	if (!(flags & NDF_NO_DVP_RELE) &&
2975	    (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
2976		vrele(ndp->ni_dvp);
2977		ndp->ni_dvp = NULL;
2978	}
2979	if
(!(flags & NDF_NO_VP_UNLOCK) && 2980 (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) 2981 VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_proc); 2982 if (!(flags & NDF_NO_VP_RELE) && 2983 ndp->ni_vp) { 2984 vrele(ndp->ni_vp); 2985 ndp->ni_vp = NULL; 2986 } 2987 if (!(flags & NDF_NO_STARTDIR_RELE) && 2988 (ndp->ni_cnd.cn_flags & SAVESTART)) { 2989 vrele(ndp->ni_startdir); 2990 ndp->ni_startdir = NULL; 2991 } 2992} 2993 2994int 2995vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused) 2996 enum vtype type; 2997 mode_t file_mode; 2998 uid_t file_uid; 2999 gid_t file_gid; 3000 mode_t acc_mode; 3001 struct ucred *cred; 3002 int *privused; 3003{ 3004 mode_t dac_granted; 3005#ifdef CAPABILITIES 3006 mode_t cap_granted; 3007#endif 3008 3009 /* 3010 * Look for a normal, non-privileged way to access the file/directory 3011 * as requested. If it exists, go with that. 3012 */ 3013 3014 if (privused != NULL) 3015 *privused = 0; 3016 3017 dac_granted = 0; 3018 3019 /* Check the owner. */ 3020 if (cred->cr_uid == file_uid) { 3021 if (file_mode & S_IXUSR) 3022 dac_granted |= VEXEC; 3023 if (file_mode & S_IRUSR) 3024 dac_granted |= VREAD; 3025 if (file_mode & S_IWUSR) 3026 dac_granted |= VWRITE; 3027 3028 if ((acc_mode & dac_granted) == acc_mode) 3029 return (0); 3030 3031 goto privcheck; 3032 } 3033 3034 /* Otherwise, check the groups (first match) */ 3035 if (groupmember(file_gid, cred)) { 3036 if (file_mode & S_IXGRP) 3037 dac_granted |= VEXEC; 3038 if (file_mode & S_IRGRP) 3039 dac_granted |= VREAD; 3040 if (file_mode & S_IWGRP) 3041 dac_granted |= VWRITE; 3042 3043 if ((acc_mode & dac_granted) == acc_mode) 3044 return (0); 3045 3046 goto privcheck; 3047 } 3048 3049 /* Otherwise, check everyone else. */ 3050 if (file_mode & S_IXOTH) 3051 dac_granted |= VEXEC; 3052 if (file_mode & S_IROTH) 3053 dac_granted |= VREAD; 3054 if (file_mode & S_IWOTH) 3055 dac_granted |= VWRITE; 3056 if ((acc_mode & dac_granted) == acc_mode) 3057 return (0); 3058 3059privcheck: 3060 if (!suser_xxx(cred, NULL, PRISON_ROOT)) { 3061 /* XXX audit: privilege used */ 3062 if (privused != NULL) 3063 *privused = 1; 3064 return (0); 3065 } 3066 3067#ifdef CAPABILITIES 3068 /* 3069 * Build a capability mask to determine if the set of capabilities 3070 * satisfies the requirements when combined with the granted mask 3071 * from above. 3072 * For each capability, if the capability is required, bitwise 3073 * or the request type onto the cap_granted mask. 3074 */ 3075 cap_granted = 0; 3076 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && 3077 !cap_check_xxx(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT)) 3078 cap_granted |= VEXEC; 3079 3080 if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) && 3081 !cap_check_xxx(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) 3082 cap_granted |= VREAD; 3083 3084 if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) && 3085 !cap_check_xxx(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT)) 3086 cap_granted |= VWRITE; 3087 3088 if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) { 3089 /* XXX audit: privilege used */ 3090 if (privused != NULL) 3091 *privused = 1; 3092 return (0); 3093 } 3094#endif 3095 3096 return (EACCES); 3097} 3098
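/*
 * An illustrative sketch of how a filesystem's VOP_ACCESS routine
 * typically finishes: it maps its own ownership and mode fields onto
 * vaccess().  The inode field names are ufs-style and hypothetical:
 *
 *	return (vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
 *	    ap->a_mode, ap->a_cred, NULL));
 */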