sys/kern/vfs_subr.c: revision 65770 (deleted lines) → revision 66067 (added lines)
1/*
2 * Copyright (c) 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
39 * $FreeBSD: head/sys/kern/vfs_subr.c 65770 2000-09-12 09:49:08Z bp $
39 * $FreeBSD: head/sys/kern/vfs_subr.c 66067 2000-09-19 10:28:44Z phk $
40 */
41
42/*
43 * External virtual filesystem routines
44 */
45#include "opt_ddb.h"
46#include "opt_ffs.h"
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/bio.h>
51#include <sys/buf.h>
52#include <sys/conf.h>
53#include <sys/dirent.h>
54#include <sys/domain.h>
55#include <sys/eventhandler.h>
56#include <sys/fcntl.h>
57#include <sys/kernel.h>
58#include <sys/kthread.h>
59#include <sys/ktr.h>
60#include <sys/malloc.h>
61#include <sys/mount.h>
62#include <sys/namei.h>
63#include <sys/proc.h>
64#include <sys/reboot.h>
65#include <sys/socket.h>
66#include <sys/stat.h>
67#include <sys/sysctl.h>
68#include <sys/vmmeter.h>
69#include <sys/vnode.h>
70
71#include <machine/limits.h>
72#include <machine/mutex.h>
73
74#include <vm/vm.h>
75#include <vm/vm_object.h>
76#include <vm/vm_extern.h>
77#include <vm/pmap.h>
78#include <vm/vm_map.h>
79#include <vm/vm_page.h>
80#include <vm/vm_pager.h>
81#include <vm/vnode_pager.h>
82#include <vm/vm_zone.h>
83
84static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
85
86static void insmntque __P((struct vnode *vp, struct mount *mp));
87static void vclean __P((struct vnode *vp, int flags, struct proc *p));
88static unsigned long numvnodes;
89SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
90
91enum vtype iftovt_tab[16] = {
92 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
93 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
94};
95int vttoif_tab[9] = {
96 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
97 S_IFSOCK, S_IFIFO, S_IFMT,
98};
99
100static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */
101
102static u_long wantfreevnodes = 25;
103SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
104static u_long freevnodes = 0;
105SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
106
107static int reassignbufcalls;
108SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
109static int reassignbufloops;
110SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
111static int reassignbufsortgood;
112SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
113static int reassignbufsortbad;
114SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
115static int reassignbufmethod = 1;
116SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
117
118#ifdef ENABLE_VFS_IOOPT
119int vfs_ioopt = 0;
120SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
121#endif
122
123struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* mounted fs */
124struct simplelock mountlist_slock;
125struct simplelock mntvnode_slock;
126int nfs_mount_type = -1;
127#ifndef NULL_SIMPLELOCKS
128static struct simplelock mntid_slock;
129static struct simplelock vnode_free_list_slock;
130static struct simplelock spechash_slock;
131#endif
132struct nfs_public nfs_pub; /* publicly exported FS */
133static vm_zone_t vnode_zone;
134int prtactive = 0; /* 1 => print out reclaim of active vnodes */
135
136/*
137 * The workitem queue.
138 */
139#define SYNCER_MAXDELAY 32
140static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
141time_t syncdelay = 30; /* max time to delay syncing data */
142time_t filedelay = 30; /* time to delay syncing files */
143SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
144time_t dirdelay = 29; /* time to delay syncing directories */
145SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
146time_t metadelay = 28; /* time to delay syncing metadata */
147SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
148static int rushjob; /* number of slots to run ASAP */
149static int stat_rush_requests; /* number of times I/O speeded up */
150SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
151
152static int syncer_delayno = 0;
153static long syncer_mask;
154LIST_HEAD(synclist, vnode);
155static struct synclist *syncer_workitem_pending;
156
157int desiredvnodes;
158SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
159 &desiredvnodes, 0, "Maximum number of vnodes");
160
161static void vfs_free_addrlist __P((struct netexport *nep));
162static int vfs_free_netcred __P((struct radix_node *rn, void *w));
163static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
164 struct export_args *argp));
165
166/*
167 * Initialize the vnode management data structures.
168 */
169void
170vntblinit()
171{
172
173 desiredvnodes = maxproc + cnt.v_page_count / 4;
174 simple_lock_init(&mntvnode_slock);
175 simple_lock_init(&mntid_slock);
176 simple_lock_init(&spechash_slock);
177 TAILQ_INIT(&vnode_free_list);
178 simple_lock_init(&vnode_free_list_slock);
179 vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
180 /*
181 * Initialize the filesystem syncer.
182 */
183 syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
184 &syncer_mask);
185 syncer_maxdelay = syncer_mask + 1;
186}
187
188/*
189 * Mark a mount point as busy. Used to synchronize access and to delay
190 * unmounting. Interlock is not released on failure.
191 */
192int
193vfs_busy(mp, flags, interlkp, p)
194 struct mount *mp;
195 int flags;
196 struct simplelock *interlkp;
197 struct proc *p;
198{
199 int lkflags;
200
201 if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
202 if (flags & LK_NOWAIT)
203 return (ENOENT);
204 mp->mnt_kern_flag |= MNTK_MWAIT;
205 if (interlkp) {
206 simple_unlock(interlkp);
207 }
208 /*
209 * Since all busy locks are shared except the exclusive
210 * lock granted when unmounting, the only place that a
211 * wakeup needs to be done is at the release of the
212 * exclusive lock at the end of dounmount.
213 */
214 tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
215 if (interlkp) {
216 simple_lock(interlkp);
217 }
218 return (ENOENT);
219 }
220 lkflags = LK_SHARED | LK_NOPAUSE;
221 if (interlkp)
222 lkflags |= LK_INTERLOCK;
223 if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
224 panic("vfs_busy: unexpected lock failure");
225 return (0);
226}
227
228/*
229 * Free a busy filesystem.
230 */
231void
232vfs_unbusy(mp, p)
233 struct mount *mp;
234 struct proc *p;
235{
236
237 lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
238}
239
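
A minimal sketch (not part of this file) of how callers such as the sync() path typically pair vfs_busy() and vfs_unbusy() while walking the mount list, handing in mountlist_slock as the interlock; p is the calling process supplied by the caller's context:

	struct mount *mp, *nmp;

	simple_lock(&mountlist_slock);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			/* Failure leaves the interlock held; just skip it. */
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		/* ... operate on the busied mount (interlock was dropped) ... */
		simple_lock(&mountlist_slock);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);
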
240/*
241 * Lookup a filesystem type, and if found allocate and initialize
242 * a mount structure for it.
243 *
244 * Devname is usually updated by mount(8) after booting.
245 */
246int
247vfs_rootmountalloc(fstypename, devname, mpp)
248 char *fstypename;
249 char *devname;
250 struct mount **mpp;
251{
252 struct proc *p = curproc; /* XXX */
253 struct vfsconf *vfsp;
254 struct mount *mp;
255
256 if (fstypename == NULL)
257 return (ENODEV);
258 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
259 if (!strcmp(vfsp->vfc_name, fstypename))
260 break;
261 if (vfsp == NULL)
262 return (ENODEV);
263 mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
264 bzero((char *)mp, (u_long)sizeof(struct mount));
265 lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
266 (void)vfs_busy(mp, LK_NOWAIT, 0, p);
267 LIST_INIT(&mp->mnt_vnodelist);
268 mp->mnt_vfc = vfsp;
269 mp->mnt_op = vfsp->vfc_vfsops;
270 mp->mnt_flag = MNT_RDONLY;
271 mp->mnt_vnodecovered = NULLVP;
272 vfsp->vfc_refcount++;
273 mp->mnt_iosize_max = DFLTPHYS;
274 mp->mnt_stat.f_type = vfsp->vfc_typenum;
275 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
276 strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
277 mp->mnt_stat.f_mntonname[0] = '/';
278 mp->mnt_stat.f_mntonname[1] = 0;
279 (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
280 *mpp = mp;
281 return (0);
282}
283
284/*
285 * Find an appropriate filesystem to use for the root. If a filesystem
286 * has not been preselected, walk through the list of known filesystems
287 * trying those that have mountroot routines, and try them until one
288 * works or we have tried them all.
289 */
290#ifdef notdef /* XXX JH */
291int
292lite2_vfs_mountroot()
293{
294 struct vfsconf *vfsp;
295 extern int (*lite2_mountroot) __P((void));
296 int error;
297
298 if (lite2_mountroot != NULL)
299 return ((*lite2_mountroot)());
300 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
301 if (vfsp->vfc_mountroot == NULL)
302 continue;
303 if ((error = (*vfsp->vfc_mountroot)()) == 0)
304 return (0);
305 printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
306 }
307 return (ENODEV);
308}
309#endif
310
311/*
312 * Lookup a mount point by filesystem identifier.
313 */
314struct mount *
315vfs_getvfs(fsid)
316 fsid_t *fsid;
317{
318 register struct mount *mp;
319
320 simple_lock(&mountlist_slock);
321 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
322 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
323 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
324 simple_unlock(&mountlist_slock);
325 return (mp);
326 }
327 }
328 simple_unlock(&mountlist_slock);
329 return ((struct mount *) 0);
330}
331
332/*
333 * Get a new unique fsid. Try to make its val[0] unique, since this value
334 * will be used to create fake device numbers for stat(). Also try (but
335 * not so hard) make its val[0] unique mod 2^16, since some emulators only
336 * support 16-bit device numbers. We end up with unique val[0]'s for the
337 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
338 *
339 * Keep in mind that several mounts may be running in parallel. Starting
340 * the search one past where the previous search terminated is both a
341 * micro-optimization and a defense against returning the same fsid to
342 * different mounts.
343 */
344void
345vfs_getnewfsid(mp)
346 struct mount *mp;
347{
348 static u_int16_t mntid_base;
349 fsid_t tfsid;
350 int mtype;
351
352 simple_lock(&mntid_slock);
353 mtype = mp->mnt_vfc->vfc_typenum;
354 tfsid.val[1] = mtype;
355 mtype = (mtype & 0xFF) << 24;
356 for (;;) {
357 tfsid.val[0] = makeudev(255,
358 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
359 mntid_base++;
360 if (vfs_getvfs(&tfsid) == NULL)
361 break;
362 }
363 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
364 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
365 simple_unlock(&mntid_slock);
366}
367
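
To make the bit layout above concrete: for a filesystem with vfc_typenum 1 and mntid_base currently 0x1234, val[1] is set to 1 and the minor number handed to makeudev(255, ...) evaluates to 0x01000000 | 0x00120000 | 0x00000034 = 0x01120034: the type sits in bits 24-31, the high byte of mntid_base in bits 16-23 and its low byte in bits 0-7, leaving bits 8-15 clear for the major number. That layout is why val[0] stays unique mod 2^16 only for the first 2^8 calls (the low byte of mntid_base) but fully unique for the first 2^16 calls.
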
368/*
369 * Knob to control the precision of file timestamps:
370 *
371 * 0 = seconds only; nanoseconds zeroed.
372 * 1 = seconds and nanoseconds, accurate within 1/HZ.
373 * 2 = seconds and nanoseconds, truncated to microseconds.
374 * >=3 = seconds and nanoseconds, maximum precision.
375 */
376enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
377
378static int timestamp_precision = TSP_SEC;
379SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
380 &timestamp_precision, 0, "");
381
382/*
383 * Get a current timestamp.
384 */
385void
386vfs_timestamp(tsp)
387 struct timespec *tsp;
388{
389 struct timeval tv;
390
391 switch (timestamp_precision) {
392 case TSP_SEC:
393 tsp->tv_sec = time_second;
394 tsp->tv_nsec = 0;
395 break;
396 case TSP_HZ:
397 getnanotime(tsp);
398 break;
399 case TSP_USEC:
400 microtime(&tv);
401 TIMEVAL_TO_TIMESPEC(&tv, tsp);
402 break;
403 case TSP_NSEC:
404 default:
405 nanotime(tsp);
406 break;
407 }
408}
409
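
As a hedged illustration of the consumer side (the vattr variable va below is hypothetical, not from this file), a filesystem stamping an inode update would simply do:

	struct timespec ts;

	vfs_timestamp(&ts);
	va.va_mtime = ts;	/* precision governed by the vfs.timestamp_precision sysctl */
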
410/*
411 * Set vnode attributes to VNOVAL
412 */
413void
414vattr_null(vap)
415 register struct vattr *vap;
416{
417
418 vap->va_type = VNON;
419 vap->va_size = VNOVAL;
420 vap->va_bytes = VNOVAL;
421 vap->va_mode = VNOVAL;
422 vap->va_nlink = VNOVAL;
423 vap->va_uid = VNOVAL;
424 vap->va_gid = VNOVAL;
425 vap->va_fsid = VNOVAL;
426 vap->va_fileid = VNOVAL;
427 vap->va_blocksize = VNOVAL;
428 vap->va_rdev = VNOVAL;
429 vap->va_atime.tv_sec = VNOVAL;
430 vap->va_atime.tv_nsec = VNOVAL;
431 vap->va_mtime.tv_sec = VNOVAL;
432 vap->va_mtime.tv_nsec = VNOVAL;
433 vap->va_ctime.tv_sec = VNOVAL;
434 vap->va_ctime.tv_nsec = VNOVAL;
435 vap->va_flags = VNOVAL;
436 vap->va_gen = VNOVAL;
437 vap->va_vaflags = 0;
438}
439
440/*
441 * Routines having to do with the management of the vnode table.
442 */
443
444/*
445 * Return the next vnode from the free list.
446 */
447int
448getnewvnode(tag, mp, vops, vpp)
449 enum vtagtype tag;
450 struct mount *mp;
451 vop_t **vops;
452 struct vnode **vpp;
453{
454 int s, count;
455 struct proc *p = curproc; /* XXX */
456 struct vnode *vp = NULL;
457 struct mount *vnmp;
458 vm_object_t object;
459
460 /*
461 * We take the least recently used vnode from the freelist
462 * if we can get it and it has no cached pages, and no
463 * namecache entries are relative to it.
464 * Otherwise we allocate a new vnode
465 */
466
467 s = splbio();
468 simple_lock(&vnode_free_list_slock);
469
470 if (wantfreevnodes && freevnodes < wantfreevnodes) {
471 vp = NULL;
472 } else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
473 /*
474 * XXX: this is only here to be backwards compatible
475 */
476 vp = NULL;
477 } else for (count = 0; count < freevnodes; count++) {
478 vp = TAILQ_FIRST(&vnode_free_list);
479 if (vp == NULL || vp->v_usecount)
480 panic("getnewvnode: free vnode isn't");
481 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
482 /*
483 * Don't recycle if active in the namecache or
484 * if it still has cached pages or we cannot get
485 * its interlock.
486 */
487 if (LIST_FIRST(&vp->v_cache_src) != NULL ||
488 (VOP_GETVOBJECT(vp, &object) == 0 &&
489 (object->resident_page_count || object->ref_count)) ||
490 !simple_lock_try(&vp->v_interlock)) {
491 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
492 vp = NULL;
493 continue;
494 }
495 /*
496 * Skip over it if its filesystem is being suspended.
497 */
498 if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0)
499 break;
500 simple_unlock(&vp->v_interlock);
501 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
502 vp = NULL;
503 }
504 if (vp) {
505 vp->v_flag |= VDOOMED;
506 freevnodes--;
507 simple_unlock(&vnode_free_list_slock);
508 cache_purge(vp);
509 vp->v_lease = NULL;
510 if (vp->v_type != VBAD) {
511 vgonel(vp, p);
512 } else {
513 simple_unlock(&vp->v_interlock);
514 }
515 vn_finished_write(vnmp);
516
517#ifdef INVARIANTS
518 {
519 int s;
520
521 if (vp->v_data)
522 panic("cleaned vnode isn't");
523 s = splbio();
524 if (vp->v_numoutput)
525 panic("Clean vnode has pending I/O's");
526 splx(s);
527 if (vp->v_writecount != 0)
528 panic("Non-zero write count");
529 }
530#endif
531 vp->v_flag = 0;
532 vp->v_lastw = 0;
533 vp->v_lasta = 0;
534 vp->v_cstart = 0;
535 vp->v_clen = 0;
536 vp->v_socket = 0;
537 } else {
538 simple_unlock(&vnode_free_list_slock);
539 vp = (struct vnode *) zalloc(vnode_zone);
540 bzero((char *) vp, sizeof *vp);
541 simple_lock_init(&vp->v_interlock);
542 vp->v_dd = vp;
543 cache_purge(vp);
544 LIST_INIT(&vp->v_cache_src);
545 TAILQ_INIT(&vp->v_cache_dst);
546 numvnodes++;
547 }
548
549 TAILQ_INIT(&vp->v_cleanblkhd);
550 TAILQ_INIT(&vp->v_dirtyblkhd);
551 vp->v_type = VNON;
552 vp->v_tag = tag;
553 vp->v_op = vops;
554 insmntque(vp, mp);
555 *vpp = vp;
556 vp->v_usecount = 1;
557 vp->v_data = 0;
558 splx(s);
559
560 vfs_object_create(vp, p, p->p_ucred);
561 return (0);
562}
563
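
An illustrative sketch of a typical caller, loosely modeled on an FFS-style vget routine (ip, vpp and the final vnode type are assumptions from the caller's context, not part of this diff):

	struct vnode *vp;
	int error;

	error = getnewvnode(VT_UFS, mp, ffs_vnodeop_p, &vp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp->v_data = ip;	/* hang the filesystem's private data off the vnode */
	vp->v_type = VREG;	/* set once the on-disk type is known */
	*vpp = vp;
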
564/*
565 * Move a vnode from one mount queue to another.
566 */
567static void
568insmntque(vp, mp)
569 register struct vnode *vp;
570 register struct mount *mp;
571{
572
573 simple_lock(&mntvnode_slock);
574 /*
575 * Delete from old mount point vnode list, if on one.
576 */
577 if (vp->v_mount != NULL)
578 LIST_REMOVE(vp, v_mntvnodes);
579 /*
580 * Insert into list of vnodes for the new mount point, if available.
581 */
582 if ((vp->v_mount = mp) == NULL) {
583 simple_unlock(&mntvnode_slock);
584 return;
585 }
586 LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
587 simple_unlock(&mntvnode_slock);
588}
589
590/*
591 * Update outstanding I/O count and do wakeup if requested.
592 */
593void
594vwakeup(bp)
595 register struct buf *bp;
596{
597 register struct vnode *vp;
598
599 bp->b_flags &= ~B_WRITEINPROG;
600 if ((vp = bp->b_vp)) {
601 vp->v_numoutput--;
602 if (vp->v_numoutput < 0)
603 panic("vwakeup: neg numoutput");
604 if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
605 vp->v_flag &= ~VBWAIT;
606 wakeup((caddr_t) &vp->v_numoutput);
607 }
608 }
609}
610
611/*
612 * Flush out and invalidate all buffers associated with a vnode.
613 * Called with the underlying object locked.
614 */
615int
616vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
617 register struct vnode *vp;
618 int flags;
619 struct ucred *cred;
620 struct proc *p;
621 int slpflag, slptimeo;
622{
623 register struct buf *bp;
624 struct buf *nbp, *blist;
625 int s, error;
626 vm_object_t object;
627
628 if (flags & V_SAVE) {
629 s = splbio();
630 while (vp->v_numoutput) {
631 vp->v_flag |= VBWAIT;
632 error = tsleep((caddr_t)&vp->v_numoutput,
633 slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
634 if (error) {
635 splx(s);
636 return (error);
637 }
638 }
639 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
640 splx(s);
641 if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
642 return (error);
643 s = splbio();
644 if (vp->v_numoutput > 0 ||
645 !TAILQ_EMPTY(&vp->v_dirtyblkhd))
646 panic("vinvalbuf: dirty bufs");
647 }
648 splx(s);
649 }
650 s = splbio();
651 for (;;) {
652 blist = TAILQ_FIRST(&vp->v_cleanblkhd);
653 if (!blist)
654 blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
655 if (!blist)
656 break;
657
658 for (bp = blist; bp; bp = nbp) {
659 nbp = TAILQ_NEXT(bp, b_vnbufs);
660 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
661 error = BUF_TIMELOCK(bp,
662 LK_EXCLUSIVE | LK_SLEEPFAIL,
663 "vinvalbuf", slpflag, slptimeo);
664 if (error == ENOLCK)
665 break;
666 splx(s);
667 return (error);
668 }
669 /*
670 * XXX Since there are no node locks for NFS, I
671 * believe there is a slight chance that a delayed
672 * write will occur while sleeping just above, so
673 * check for it. Note that vfs_bio_awrite expects
674 * buffers to reside on a queue, while VOP_BWRITE and
675 * brelse do not.
676 */
677 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
678 (flags & V_SAVE)) {
679
680 if (bp->b_vp == vp) {
681 if (bp->b_flags & B_CLUSTEROK) {
682 BUF_UNLOCK(bp);
683 vfs_bio_awrite(bp);
684 } else {
685 bremfree(bp);
686 bp->b_flags |= B_ASYNC;
687 BUF_WRITE(bp);
688 }
689 } else {
690 bremfree(bp);
691 (void) BUF_WRITE(bp);
692 }
693 break;
694 }
695 bremfree(bp);
696 bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
697 bp->b_flags &= ~B_ASYNC;
698 brelse(bp);
699 }
700 }
701
702 while (vp->v_numoutput > 0) {
703 vp->v_flag |= VBWAIT;
704 tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
705 }
706
707 splx(s);
708
709 /*
710 * Destroy the copy in the VM cache, too.
711 */
712 simple_lock(&vp->v_interlock);
713 if (VOP_GETVOBJECT(vp, &object) == 0) {
714 vm_object_page_remove(object, 0, 0,
715 (flags & V_SAVE) ? TRUE : FALSE);
716 }
717 simple_unlock(&vp->v_interlock);
718
719 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
720 panic("vinvalbuf: flush failed");
721 return (0);
722}
723
724/*
725 * Truncate a file's buffer and pages to a specified length. This
726 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
727 * sync activity.
728 */
729int
730vtruncbuf(vp, cred, p, length, blksize)
731 register struct vnode *vp;
732 struct ucred *cred;
733 struct proc *p;
734 off_t length;
735 int blksize;
736{
737 register struct buf *bp;
738 struct buf *nbp;
739 int s, anyfreed;
740 int trunclbn;
741
742 /*
743 * Round up to the *next* lbn.
744 */
745 trunclbn = (length + blksize - 1) / blksize;
746
747 s = splbio();
748restart:
749 anyfreed = 1;
750 for (;anyfreed;) {
751 anyfreed = 0;
752 for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
753 nbp = TAILQ_NEXT(bp, b_vnbufs);
754 if (bp->b_lblkno >= trunclbn) {
755 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
756 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
757 goto restart;
758 } else {
759 bremfree(bp);
760 bp->b_flags |= (B_INVAL | B_RELBUF);
761 bp->b_flags &= ~B_ASYNC;
762 brelse(bp);
763 anyfreed = 1;
764 }
765 if (nbp &&
766 (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
767 (nbp->b_vp != vp) ||
768 (nbp->b_flags & B_DELWRI))) {
769 goto restart;
770 }
771 }
772 }
773
774 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
775 nbp = TAILQ_NEXT(bp, b_vnbufs);
776 if (bp->b_lblkno >= trunclbn) {
777 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
778 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
779 goto restart;
780 } else {
781 bremfree(bp);
782 bp->b_flags |= (B_INVAL | B_RELBUF);
783 bp->b_flags &= ~B_ASYNC;
784 brelse(bp);
785 anyfreed = 1;
786 }
787 if (nbp &&
788 (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
789 (nbp->b_vp != vp) ||
790 (nbp->b_flags & B_DELWRI) == 0)) {
791 goto restart;
792 }
793 }
794 }
795 }
796
797 if (length > 0) {
798restartsync:
799 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
800 nbp = TAILQ_NEXT(bp, b_vnbufs);
801 if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
802 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
803 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
804 goto restart;
805 } else {
806 bremfree(bp);
807 if (bp->b_vp == vp) {
808 bp->b_flags |= B_ASYNC;
809 } else {
810 bp->b_flags &= ~B_ASYNC;
811 }
812 BUF_WRITE(bp);
813 }
814 goto restartsync;
815 }
816
817 }
818 }
819
820 while (vp->v_numoutput > 0) {
821 vp->v_flag |= VBWAIT;
822 tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
823 }
824
825 splx(s);
826
827 vnode_pager_setsize(vp, length);
828
829 return (0);
830}
831
832/*
833 * Associate a buffer with a vnode.
834 */
835void
836bgetvp(vp, bp)
837 register struct vnode *vp;
838 register struct buf *bp;
839{
840 int s;
841
842 KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
843
844 vhold(vp);
845 bp->b_vp = vp;
846 bp->b_dev = vn_todev(vp);
847 /*
848 * Insert onto list for new vnode.
849 */
850 s = splbio();
851 bp->b_xflags |= BX_VNCLEAN;
852 bp->b_xflags &= ~BX_VNDIRTY;
853 TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
854 splx(s);
855}
856
857/*
858 * Disassociate a buffer from a vnode.
859 */
860void
861brelvp(bp)
862 register struct buf *bp;
863{
864 struct vnode *vp;
865 struct buflists *listheadp;
866 int s;
867
868 KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
869
870 /*
871 * Delete from old vnode list, if on one.
872 */
873 vp = bp->b_vp;
874 s = splbio();
875 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
876 if (bp->b_xflags & BX_VNDIRTY)
877 listheadp = &vp->v_dirtyblkhd;
878 else
879 listheadp = &vp->v_cleanblkhd;
880 TAILQ_REMOVE(listheadp, bp, b_vnbufs);
881 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
882 }
883 if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
884 vp->v_flag &= ~VONWORKLST;
885 LIST_REMOVE(vp, v_synclist);
886 }
887 splx(s);
888 bp->b_vp = (struct vnode *) 0;
889 vdrop(vp);
890}
891
892/*
893 * The workitem queue.
894 *
895 * It is useful to delay writes of file data and filesystem metadata
896 * for tens of seconds so that quickly created and deleted files need
897 * not waste disk bandwidth being created and removed. To realize this,
898 * we append vnodes to a "workitem" queue. When running with a soft
899 * updates implementation, most pending metadata dependencies should
900 * not wait for more than a few seconds. Thus, mounted on block devices
901 * are delayed only about a half the time that file data is delayed.
902 * Similarly, directory updates are more critical, so are only delayed
903 * about a third the time that file data is delayed. Thus, there are
904 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
905 * one each second (driven off the filesystem syncer process). The
906 * syncer_delayno variable indicates the next queue that is to be processed.
907 * Items that need to be processed soon are placed in this queue:
908 *
909 * syncer_workitem_pending[syncer_delayno]
910 *
911 * A delay of fifteen seconds is done by placing the request fifteen
912 * entries later in the queue:
913 *
914 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
915 *
916 */
917
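
For example, with the default SYNCER_MAXDELAY of 32, hashinit() leaves syncer_mask at 31; if syncer_delayno currently stands at 20, a vnode queued with a delay of 15 lands in slot (20 + 15) & 31 = 3 and, since the syncer advances one slot per second, is visited roughly fifteen seconds later.
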
918/*
919 * Add an item to the syncer work queue.
920 */
921static void
922vn_syncer_add_to_worklist(struct vnode *vp, int delay)
923{
924 int s, slot;
925
926 s = splbio();
927
928 if (vp->v_flag & VONWORKLST) {
929 LIST_REMOVE(vp, v_synclist);
930 }
931
932 if (delay > syncer_maxdelay - 2)
933 delay = syncer_maxdelay - 2;
934 slot = (syncer_delayno + delay) & syncer_mask;
935
936 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
937 vp->v_flag |= VONWORKLST;
938 splx(s);
939}
940
941struct proc *updateproc;
942static void sched_sync __P((void));
943static struct kproc_desc up_kp = {
944 "syncer",
945 sched_sync,
946 &updateproc
947};
948SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
949
950/*
951 * System filesystem synchronizer daemon.
952 */
953void
954sched_sync(void)
955{
956 struct synclist *slp;
957 struct vnode *vp;
958 struct mount *mp;
959 long starttime;
960 int s;
961 struct proc *p = updateproc;
962
963 mtx_enter(&Giant, MTX_DEF);
964
965 EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, p,
966 SHUTDOWN_PRI_LAST);
967
968 for (;;) {
969 kproc_suspend_loop(p);
970
971 starttime = time_second;
972
973 /*
974 * Push files whose dirty time has expired. Be careful
975 * of interrupt race on slp queue.
976 */
977 s = splbio();
978 slp = &syncer_workitem_pending[syncer_delayno];
979 syncer_delayno += 1;
980 if (syncer_delayno == syncer_maxdelay)
981 syncer_delayno = 0;
982 splx(s);
983
984 while ((vp = LIST_FIRST(slp)) != NULL) {
985 if (VOP_ISLOCKED(vp, NULL) == 0 &&
986 vn_start_write(vp, &mp, V_NOWAIT) == 0) {
987 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
988 (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
989 VOP_UNLOCK(vp, 0, p);
990 vn_finished_write(mp);
991 }
992 s = splbio();
993 if (LIST_FIRST(slp) == vp) {
994 /*
995 * Note: v_tag VT_VFS vps can remain on the
996 * worklist too with no dirty blocks, but
997 * since sync_fsync() moves it to a different
998 * slot we are safe.
999 */
1000 if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
1001 !vn_isdisk(vp, NULL))
1002 panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
1003 /*
1004 * Put us back on the worklist. The worklist
1005 * routine will remove us from our current
1006 * position and then add us back in at a later
1007 * position.
1008 */
1009 vn_syncer_add_to_worklist(vp, syncdelay);
1010 }
1011 splx(s);
1012 }
1013
1014 /*
1015 * Do soft update processing.
1016 */
1017#ifdef SOFTUPDATES
1018 softdep_process_worklist(NULL);
1019#endif
1020
1021 /*
1022 * The variable rushjob allows the kernel to speed up the
1023 * processing of the filesystem syncer process. A rushjob
1024 * value of N tells the filesystem syncer to process the next
1025 * N seconds worth of work on its queue ASAP. Currently rushjob
1026 * is used by the soft update code to speed up the filesystem
1027 * syncer process when the incore state is getting so far
1028 * ahead of the disk that the kernel memory pool is being
1029 * threatened with exhaustion.
1030 */
1031 if (rushjob > 0) {
1032 rushjob -= 1;
1033 continue;
1034 }
1035 /*
1036 * If it has taken us less than a second to process the
1037 * current work, then wait. Otherwise start right over
1038 * again. We can still lose time if any single round
1039 * takes more than two seconds, but it does not really
1040 * matter as we are just trying to generally pace the
1041 * filesystem activity.
1042 */
1043 if (time_second == starttime)
1044 tsleep(&lbolt, PPAUSE, "syncer", 0);
1045 }
1046}
1047
1048/*
1049 * Request the syncer daemon to speed up its work.
1050 * We never push it to speed up more than half of its
1051 * normal turn time, otherwise it could take over the cpu.
1052 */
1053int
1054speedup_syncer()
1055{
1056 int s;
1057
1058 s = splhigh();
1059 if (updateproc->p_wchan == &lbolt)
1060 setrunnable(updateproc);
1061 splx(s);
1062 if (rushjob < syncdelay / 2) {
1063 rushjob += 1;
1064 stat_rush_requests += 1;
1065 return (1);
1066 }
1067 return(0);
1068}
1069
1070/*
1071 * Associate a p-buffer with a vnode.
1072 *
1073 * Also sets B_PAGING flag to indicate that vnode is not fully associated
1074 * with the buffer. i.e. the bp has not been linked into the vnode or
1075 * ref-counted.
1076 */
1077void
1078pbgetvp(vp, bp)
1079 register struct vnode *vp;
1080 register struct buf *bp;
1081{
1082
1083 KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1084
1085 bp->b_vp = vp;
1086 bp->b_flags |= B_PAGING;
1087 bp->b_dev = vn_todev(vp);
1088}
1089
1090/*
1091 * Disassociate a p-buffer from a vnode.
1092 */
1093void
1094pbrelvp(bp)
1095 register struct buf *bp;
1096{
1097
1098 KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1099
1100 /* XXX REMOVE ME */
1101 if (bp->b_vnbufs.tqe_next != NULL) {
1102 panic(
1103 "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1104 bp,
1105 (int)bp->b_flags
1106 );
1107 }
1108 bp->b_vp = (struct vnode *) 0;
1109 bp->b_flags &= ~B_PAGING;
1110}
1111
1112void
1113pbreassignbuf(bp, newvp)
1114 struct buf *bp;
1115 struct vnode *newvp;
1116{
1117 if ((bp->b_flags & B_PAGING) == 0) {
1118 panic(
1119 "pbreassignbuf() on non phys bp %p",
1120 bp
1121 );
1122 }
1123 bp->b_vp = newvp;
1124}
1125
1126/*
1127 * Reassign a buffer from one vnode to another.
1128 * Used to assign file specific control information
1129 * (indirect blocks) to the vnode to which they belong.
1130 */
1131void
1132reassignbuf(bp, newvp)
1133 register struct buf *bp;
1134 register struct vnode *newvp;
1135{
1136 struct buflists *listheadp;
1137 int delay;
1138 int s;
1139
1140 if (newvp == NULL) {
1141 printf("reassignbuf: NULL");
1142 return;
1143 }
1144 ++reassignbufcalls;
1145
1146 /*
1147 * B_PAGING flagged buffers cannot be reassigned because their vp
1148 * is not fully linked in.
1149 */
1150 if (bp->b_flags & B_PAGING)
1151 panic("cannot reassign paging buffer");
1152
1153 s = splbio();
1154 /*
1155 * Delete from old vnode list, if on one.
1156 */
1157 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
1158 if (bp->b_xflags & BX_VNDIRTY)
1159 listheadp = &bp->b_vp->v_dirtyblkhd;
1160 else
1161 listheadp = &bp->b_vp->v_cleanblkhd;
1162 TAILQ_REMOVE(listheadp, bp, b_vnbufs);
1163 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1164 if (bp->b_vp != newvp) {
1165 vdrop(bp->b_vp);
1166 bp->b_vp = NULL; /* for clarification */
1167 }
1168 }
1169 /*
1170 * If dirty, put on list of dirty buffers; otherwise insert onto list
1171 * of clean buffers.
1172 */
1173 if (bp->b_flags & B_DELWRI) {
1174 struct buf *tbp;
1175
1176 listheadp = &newvp->v_dirtyblkhd;
1177 if ((newvp->v_flag & VONWORKLST) == 0) {
1178 switch (newvp->v_type) {
1179 case VDIR:
1180 delay = dirdelay;
1181 break;
1182 case VCHR:
1183 case VBLK:
1184 if (newvp->v_specmountpoint != NULL) {
1185 delay = metadelay;
1186 break;
1187 }
1188 /* fall through */
1189 default:
1190 delay = filedelay;
1191 }
1192 vn_syncer_add_to_worklist(newvp, delay);
1193 }
1194 bp->b_xflags |= BX_VNDIRTY;
1195 tbp = TAILQ_FIRST(listheadp);
1196 if (tbp == NULL ||
1197 bp->b_lblkno == 0 ||
1198 (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
1199 (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
1200 TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
1201 ++reassignbufsortgood;
1202 } else if (bp->b_lblkno < 0) {
1203 TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
1204 ++reassignbufsortgood;
1205 } else if (reassignbufmethod == 1) {
1206 /*
1207 * New sorting algorithm, only handle sequential case,
1208 * otherwise append to end (but before metadata)
1209 */
1210 if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
1211 (tbp->b_xflags & BX_VNDIRTY)) {
1212 /*
1213 * Found the best place to insert the buffer
1214 */
1215 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1216 ++reassignbufsortgood;
1217 } else {
1218 /*
1219 * Missed, append to end, but before meta-data.
1220 * We know that the head buffer in the list is
1221 * not meta-data due to prior conditionals.
1222 *
1223 * Indirect effects: NFS second stage write
1224 * tends to wind up here, giving maximum
1225 * distance between the unstable write and the
1226 * commit rpc.
1227 */
1228 tbp = TAILQ_LAST(listheadp, buflists);
1229 while (tbp && tbp->b_lblkno < 0)
1230 tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
1231 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1232 ++reassignbufsortbad;
1233 }
1234 } else {
1235 /*
1236 * Old sorting algorithm, scan queue and insert
1237 */
1238 struct buf *ttbp;
1239 while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
1240 (ttbp->b_lblkno < bp->b_lblkno)) {
1241 ++reassignbufloops;
1242 tbp = ttbp;
1243 }
1244 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1245 }
1246 } else {
1247 bp->b_xflags |= BX_VNCLEAN;
1248 TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
1249 if ((newvp->v_flag & VONWORKLST) &&
1250 TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1251 newvp->v_flag &= ~VONWORKLST;
1252 LIST_REMOVE(newvp, v_synclist);
1253 }
1254 }
1255 if (bp->b_vp != newvp) {
1256 bp->b_vp = newvp;
1257 vhold(bp->b_vp);
1258 }
1259 splx(s);
1260}
1261
1262/*
1263 * Create a vnode for a block device.
1264 * Used for mounting the root file system.
1265 * XXX: This now changed to a VCHR due to the block/char merging.
1266 */
1267int
1268bdevvp(dev, vpp)
1269 dev_t dev;
1270 struct vnode **vpp;
1271{
1272 register struct vnode *vp;
1273 struct vnode *nvp;
1274 int error;
1275
1276 if (dev == NODEV) {
1277 *vpp = NULLVP;
1278 return (ENXIO);
1279 }
1280 error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
1281 if (error) {
1282 *vpp = NULLVP;
1283 return (error);
1284 }
1285 vp = nvp;
1286 vp->v_type = VCHR;
1287 addalias(vp, dev);
1288 *vpp = vp;
1289 return (0);
1290}
1291
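
Illustrative only: the root-mount path is essentially the sole caller, doing roughly the following with the rootdev and rootvp globals (error handling abbreviated):

	error = bdevvp(rootdev, &rootvp);
	if (error) {
		printf("bdevvp failed for root device\n");
		return (error);
	}
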
1292/*
1293 * Add vnode to the alias list hung off the dev_t.
1294 *
1295 * The reason for this gunk is that multiple vnodes can reference
1296 * the same physical device, so checking vp->v_usecount to see
1297 * how many users there are is inadequate; the v_usecount for
1298 * the vnodes need to be accumulated. vcount() does that.
1299 */
1300struct vnode *
1301addaliasu(nvp, nvp_rdev)
1302 struct vnode *nvp;
1303 udev_t nvp_rdev;
1304{
1305 struct vnode *ovp;
1306 vop_t **ops;
1307 dev_t dev;
1308
1309 if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1310 panic("addaliasu on non-special vnode");
1311 dev = udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0);
1312 /*
1313 * Check to see if we have a bdevvp vnode with no associated
1314 * filesystem. If so, we want to associate the filesystem of
1315 * the new newly instigated vnode with the bdevvp vnode and
1316 * discard the newly created vnode rather than leaving the
1317 * bdevvp vnode lying around with no associated filesystem.
1318 */
1319 if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
1320 addalias(nvp, dev);
1321 return (nvp);
1322 }
1323 /*
1324 * Discard unneeded vnode, but save its node specific data.
1325 * Note that if there is a lock, it is carried over in the
1326 * node specific data to the replacement vnode.
1327 */
1328 vref(ovp);
1329 ovp->v_data = nvp->v_data;
1330 ovp->v_tag = nvp->v_tag;
1331 nvp->v_data = NULL;
1332 ops = nvp->v_op;
1333 nvp->v_op = ovp->v_op;
1334 ovp->v_op = ops;
1335 insmntque(ovp, nvp->v_mount);
1336 vrele(nvp);
1337 vgone(nvp);
1338 return (ovp);
1339}
1340
1341void
1342addalias(nvp, dev)
1343 struct vnode *nvp;
1344 dev_t dev;
1345{
1346
1347 if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1348 panic("addalias on non-special vnode");
1349
1350 nvp->v_rdev = dev;
1351 simple_lock(&spechash_slock);
1352 SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
1353 simple_unlock(&spechash_slock);
1354}
1355
1356/*
1357 * Grab a particular vnode from the free list, increment its
1358 * reference count and lock it. The vnode lock bit is set if the
1359 * vnode is being eliminated in vgone. The process is awakened
1360 * when the transition is completed, and an error returned to
1361 * indicate that the vnode is no longer usable (possibly having
1362 * been changed to a new file system type).
1363 */
1364int
1365vget(vp, flags, p)
1366 register struct vnode *vp;
1367 int flags;
1368 struct proc *p;
1369{
1370 int error;
1371
1372 /*
1373 * If the vnode is in the process of being cleaned out for
1374 * another use, we wait for the cleaning to finish and then
1375 * return failure. Cleaning is determined by checking that
1376 * the VXLOCK flag is set.
1377 */
1378 if ((flags & LK_INTERLOCK) == 0) {
1379 simple_lock(&vp->v_interlock);
1380 }
1381 if (vp->v_flag & VXLOCK) {
1382 vp->v_flag |= VXWANT;
1383 simple_unlock(&vp->v_interlock);
1384 tsleep((caddr_t)vp, PINOD, "vget", 0);
1385 return (ENOENT);
1386 }
1387
1388 vp->v_usecount++;
1389
1390 if (VSHOULDBUSY(vp))
1391 vbusy(vp);
1392 if (flags & LK_TYPE_MASK) {
1393 if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
1394 /*
1395 * must expand vrele here because we do not want
1396 * to call VOP_INACTIVE if the reference count
1397 * drops back to zero since it was never really
1398 * active. We must remove it from the free list
1399 * before sleeping so that multiple processes do
1400 * not try to recycle it.
1401 */
1402 simple_lock(&vp->v_interlock);
1403 vp->v_usecount--;
1404 if (VSHOULDFREE(vp))
1405 vfree(vp);
1406 simple_unlock(&vp->v_interlock);
1407 }
1408 return (error);
1409 }
1410 simple_unlock(&vp->v_interlock);
1411 return (0);
1412}
1413
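
A sketch of the usual calling convention (the goto target and p come from the caller, not this file): list walkers take the interlock themselves, let vget() consume it via LK_INTERLOCK, and rescan when ENOENT reports that the vnode was reclaimed underneath them:

	simple_lock(&vp->v_interlock);
	if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) != 0)
		goto loop;		/* vnode was being cleaned out; rescan */
	/* ... use the referenced, locked vnode ... */
	vput(vp);
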
1414void
1415vref(struct vnode *vp)
1416{
1417 simple_lock(&vp->v_interlock);
1418 vp->v_usecount++;
1419 simple_unlock(&vp->v_interlock);
1420}
1421
1422/*
1423 * Vnode put/release.
1424 * If count drops to zero, call inactive routine and return to freelist.
1425 */
1426void
1427vrele(vp)
1428 struct vnode *vp;
1429{
1430 struct proc *p = curproc; /* XXX */
1431
1432 KASSERT(vp != NULL, ("vrele: null vp"));
1433 KASSERT(vp->v_writecount < vp->v_usecount, ("vrele: missed vn_close"));
1434
1435 simple_lock(&vp->v_interlock);
1436
1437 if (vp->v_usecount > 1) {
1438
1439 vp->v_usecount--;
1440 simple_unlock(&vp->v_interlock);
1441
1442 return;
1443 }
1444
1445 if (vp->v_usecount == 1) {
1446
1447 vp->v_usecount--;
1448 if (VSHOULDFREE(vp))
1449 vfree(vp);
1450 /*
1451 * If we are doing a vput, the node is already locked, and we must
1452 * call VOP_INACTIVE with the node locked. So, in the case of
1453 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1454 */
1455 if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
1456 VOP_INACTIVE(vp, p);
1457 }
1458
1459 } else {
1460#ifdef DIAGNOSTIC
1461 vprint("vrele: negative ref count", vp);
1462 simple_unlock(&vp->v_interlock);
1463#endif
1464 panic("vrele: negative ref cnt");
1465 }
1466}
1467
1468void
1469vput(vp)
1470 struct vnode *vp;
1471{
1472 struct proc *p = curproc; /* XXX */
1473
1474 KASSERT(vp != NULL, ("vput: null vp"));
1475 KASSERT(vp->v_writecount < vp->v_usecount, ("vput: missed vn_close"));
1476
1477 simple_lock(&vp->v_interlock);
1478
1479 if (vp->v_usecount > 1) {
1480
1481 vp->v_usecount--;
1482 VOP_UNLOCK(vp, LK_INTERLOCK, p);
1483 return;
1484
1485 }
1486
1487 if (vp->v_usecount == 1) {
1488
1489 vp->v_usecount--;
1490 if (VSHOULDFREE(vp))
1491 vfree(vp);
1492 /*
1493 * If we are doing a vput, the node is already locked, and we must
1494 * call VOP_INACTIVE with the node locked. So, in the case of
1495 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1496 */
1497 simple_unlock(&vp->v_interlock);
1498 VOP_INACTIVE(vp, p);
1499
1500 } else {
1501#ifdef DIAGNOSTIC
1502 vprint("vput: negative ref count", vp);
1503#endif
1504 panic("vput: negative ref cnt");
1505 }
1506}
1507
1508/*
1509 * Somebody doesn't want the vnode recycled.
1510 */
1511void
1512vhold(vp)
1513 register struct vnode *vp;
1514{
1515 int s;
1516
1517 s = splbio();
1518 vp->v_holdcnt++;
1519 if (VSHOULDBUSY(vp))
1520 vbusy(vp);
1521 splx(s);
1522}
1523
1524/*
1525 * One less who cares about this vnode.
1526 */
1527void
1528vdrop(vp)
1529 register struct vnode *vp;
1530{
1531 int s;
1532
1533 s = splbio();
1534 if (vp->v_holdcnt <= 0)
1535 panic("vdrop: holdcnt");
1536 vp->v_holdcnt--;
1537 if (VSHOULDFREE(vp))
1538 vfree(vp);
1539 splx(s);
1540}
1541
1542/*
1543 * Remove any vnodes in the vnode table belonging to mount point mp.
1544 *
1545 * If MNT_NOFORCE is specified, there should not be any active ones,
1546 * return error if any are found (nb: this is a user error, not a
1547 * system error). If MNT_FORCE is specified, detach any active vnodes
1548 * that are found.
1549 */
1550#ifdef DIAGNOSTIC
1551static int busyprt = 0; /* print out busy vnodes */
1552SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
1553#endif
1554
1555int
1556vflush(mp, skipvp, flags)
1557 struct mount *mp;
1558 struct vnode *skipvp;
1559 int flags;
1560{
1561 struct proc *p = curproc; /* XXX */
1562 struct vnode *vp, *nvp;
1563 int busy = 0;
1564
1565 simple_lock(&mntvnode_slock);
1566loop:
1567 for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
1568 /*
1569 * Make sure this vnode wasn't reclaimed in getnewvnode().
1570 * Start over if it has (it won't be on the list anymore).
1571 */
1572 if (vp->v_mount != mp)
1573 goto loop;
1574 nvp = LIST_NEXT(vp, v_mntvnodes);
1575 /*
1576 * Skip over a selected vnode.
1577 */
1578 if (vp == skipvp)
1579 continue;
1580
1581 simple_lock(&vp->v_interlock);
1582 /*
1583 * Skip over a vnodes marked VSYSTEM.
1584 */
1585 if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
1586 simple_unlock(&vp->v_interlock);
1587 continue;
1588 }
1589 /*
1590 * If WRITECLOSE is set, only flush out regular file vnodes
1591 * open for writing.
1592 */
1593 if ((flags & WRITECLOSE) &&
1594 (vp->v_writecount == 0 || vp->v_type != VREG)) {
1595 simple_unlock(&vp->v_interlock);
1596 continue;
1597 }
1598
1599 /*
1600 * With v_usecount == 0, all we need to do is clear out the
1601 * vnode data structures and we are done.
1602 */
1603 if (vp->v_usecount == 0) {
1604 simple_unlock(&mntvnode_slock);
1605 vgonel(vp, p);
1606 simple_lock(&mntvnode_slock);
1607 continue;
1608 }
1609
1610 /*
1611 * If FORCECLOSE is set, forcibly close the vnode. For block
1612 * or character devices, revert to an anonymous device. For
1613 * all other files, just kill them.
1614 */
1615 if (flags & FORCECLOSE) {
1616 simple_unlock(&mntvnode_slock);
1617 if (vp->v_type != VBLK && vp->v_type != VCHR) {
1618 vgonel(vp, p);
1619 } else {
1620 vclean(vp, 0, p);
1621 vp->v_op = spec_vnodeop_p;
1622 insmntque(vp, (struct mount *) 0);
1623 }
1624 simple_lock(&mntvnode_slock);
1625 continue;
1626 }
1627#ifdef DIAGNOSTIC
1628 if (busyprt)
1629 vprint("vflush: busy vnode", vp);
1630#endif
1631 simple_unlock(&vp->v_interlock);
1632 busy++;
1633 }
1634 simple_unlock(&mntvnode_slock);
1635 if (busy)
1636 return (EBUSY);
1637 return (0);
1638}
1639
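
For context, a hedged sketch of the classic consumer, a filesystem's unmount path (mntflags and error come from the caller):

	int flags = 0;

	if (mntflags & MNT_FORCE)
		flags |= FORCECLOSE;
	error = vflush(mp, NULLVP, flags);
	if (error)
		return (error);
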
1640/*
1641 * Disassociate the underlying file system from a vnode.
1642 */
1643static void
1644vclean(vp, flags, p)
1645 struct vnode *vp;
1646 int flags;
1647 struct proc *p;
1648{
1649 int active;
1650
1651 /*
1652 * Check to see if the vnode is in use. If so we have to reference it
1653 * before we clean it out so that its count cannot fall to zero and
1654 * generate a race against ourselves to recycle it.
1655 */
1656 if ((active = vp->v_usecount))
1657 vp->v_usecount++;
1658
1659 /*
1660 * Prevent the vnode from being recycled or brought into use while we
1661 * clean it out.
1662 */
1663 if (vp->v_flag & VXLOCK)
1664 panic("vclean: deadlock");
1665 vp->v_flag |= VXLOCK;
1666 /*
1667 * Even if the count is zero, the VOP_INACTIVE routine may still
1668 * have the object locked while it cleans it out. The VOP_LOCK
1669 * ensures that the VOP_INACTIVE routine is done with its work.
1670 * For active vnodes, it ensures that no other activity can
1671 * occur while the underlying object is being cleaned out.
1672 */
1673 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
1674
1675 /*
1676 * Clean out any buffers associated with the vnode.
1677 * If the flush fails, just toss the buffers.
1678 */
1679 if (flags & DOCLOSE) {
1680 if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL)
1681 (void) vn_write_suspend_wait(vp, NULL, V_WAIT);
1682 if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0)
1683 vinvalbuf(vp, 0, NOCRED, p, 0, 0);
1684 }
1685
1686 VOP_DESTROYVOBJECT(vp);
1687
1688 /*
1689 * If purging an active vnode, it must be closed and
1690 * deactivated before being reclaimed. Note that the
1691 * VOP_INACTIVE will unlock the vnode.
1692 */
1693 if (active) {
1694 if (flags & DOCLOSE)
1695 VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
1696 VOP_INACTIVE(vp, p);
1697 } else {
1698 /*
1699 * Any other processes trying to obtain this lock must first
1700 * wait for VXLOCK to clear, then call the new lock operation.
1701 */
1702 VOP_UNLOCK(vp, 0, p);
1703 }
1704 /*
1705 * Reclaim the vnode.
1706 */
1707 if (VOP_RECLAIM(vp, p))
1708 panic("vclean: cannot reclaim");
1709
1710 if (active) {
1711 /*
1712 * Inline copy of vrele() since VOP_INACTIVE
1713 * has already been called.
1714 */
1715 simple_lock(&vp->v_interlock);
1716 if (--vp->v_usecount <= 0) {
1717#ifdef DIAGNOSTIC
1718 if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1719 vprint("vclean: bad ref count", vp);
1720 panic("vclean: ref cnt");
1721 }
1722#endif
1723 vfree(vp);
1724 }
1725 simple_unlock(&vp->v_interlock);
1726 }
1727
1728 cache_purge(vp);
1729 if (vp->v_vnlock) {
1730 FREE(vp->v_vnlock, M_VNODE);
1731 vp->v_vnlock = NULL;
1732 }
1733
1734 if (VSHOULDFREE(vp))
1735 vfree(vp);
1736
1737 /*
1738 * Done with purge, notify sleepers of the grim news.
1739 */
1740 vp->v_op = dead_vnodeop_p;
1741 vn_pollgone(vp);
1742 vp->v_tag = VT_NON;
1743 vp->v_flag &= ~VXLOCK;
1744 if (vp->v_flag & VXWANT) {
1745 vp->v_flag &= ~VXWANT;
1746 wakeup((caddr_t) vp);
1747 }
1748}
1749
1750/*
1751 * Eliminate all activity associated with the requested vnode
1752 * and with all vnodes aliased to the requested vnode.
1753 */
1754int
1755vop_revoke(ap)
1756 struct vop_revoke_args /* {
1757 struct vnode *a_vp;
1758 int a_flags;
1759 } */ *ap;
1760{
1761 struct vnode *vp, *vq;
1762 dev_t dev;
1763
1764 KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
1765
1766 vp = ap->a_vp;
1767 /*
1768 * If a vgone (or vclean) is already in progress,
1769 * wait until it is done and return.
1770 */
1771 if (vp->v_flag & VXLOCK) {
1772 vp->v_flag |= VXWANT;
1773 simple_unlock(&vp->v_interlock);
1774 tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
1775 return (0);
1776 }
1777 dev = vp->v_rdev;
1778 for (;;) {
1779 simple_lock(&spechash_slock);
1780 vq = SLIST_FIRST(&dev->si_hlist);
1781 simple_unlock(&spechash_slock);
1782 if (!vq)
1783 break;
1784 vgone(vq);
1785 }
1786 return (0);
1787}
1788
1789/*
1790 * Recycle an unused vnode to the front of the free list.
1791 * Release the passed interlock if the vnode will be recycled.
1792 */
1793int
1794vrecycle(vp, inter_lkp, p)
1795 struct vnode *vp;
1796 struct simplelock *inter_lkp;
1797 struct proc *p;
1798{
1799
1800 simple_lock(&vp->v_interlock);
1801 if (vp->v_usecount == 0) {
1802 if (inter_lkp) {
1803 simple_unlock(inter_lkp);
1804 }
1805 vgonel(vp, p);
1806 return (1);
1807 }
1808 simple_unlock(&vp->v_interlock);
1809 return (0);
1810}
1811
1812/*
1813 * Eliminate all activity associated with a vnode
1814 * in preparation for reuse.
1815 */
1816void
1817vgone(vp)
1818 register struct vnode *vp;
1819{
1820 struct proc *p = curproc; /* XXX */
1821
1822 simple_lock(&vp->v_interlock);
1823 vgonel(vp, p);
1824}
1825
1826/*
1827 * vgone, with the vp interlock held.
1828 */
1829void
1830vgonel(vp, p)
1831 struct vnode *vp;
1832 struct proc *p;
1833{
1834 int s;
1835
1836 /*
1837 * If a vgone (or vclean) is already in progress,
1838 * wait until it is done and return.
1839 */
1840 if (vp->v_flag & VXLOCK) {
1841 vp->v_flag |= VXWANT;
1842 simple_unlock(&vp->v_interlock);
1843 tsleep((caddr_t)vp, PINOD, "vgone", 0);
1844 return;
1845 }
1846
1847 /*
1848 * Clean out the filesystem specific data.
1849 */
1850 vclean(vp, DOCLOSE, p);
1851 simple_lock(&vp->v_interlock);
1852
1853 /*
1854 * Delete from old mount point vnode list, if on one.
1855 */
1856 if (vp->v_mount != NULL)
1857 insmntque(vp, (struct mount *)0);
1858 /*
1859 * If special device, remove it from special device alias list
1860 * if it is on one.
1861 */
1862 if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) {
1863 simple_lock(&spechash_slock);
40 */
41
42/*
43 * External virtual filesystem routines
44 */
45#include "opt_ddb.h"
46#include "opt_ffs.h"
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/bio.h>
51#include <sys/buf.h>
52#include <sys/conf.h>
53#include <sys/dirent.h>
54#include <sys/domain.h>
55#include <sys/eventhandler.h>
56#include <sys/fcntl.h>
57#include <sys/kernel.h>
58#include <sys/kthread.h>
59#include <sys/ktr.h>
60#include <sys/malloc.h>
61#include <sys/mount.h>
62#include <sys/namei.h>
63#include <sys/proc.h>
64#include <sys/reboot.h>
65#include <sys/socket.h>
66#include <sys/stat.h>
67#include <sys/sysctl.h>
68#include <sys/vmmeter.h>
69#include <sys/vnode.h>
70
71#include <machine/limits.h>
72#include <machine/mutex.h>
73
74#include <vm/vm.h>
75#include <vm/vm_object.h>
76#include <vm/vm_extern.h>
77#include <vm/pmap.h>
78#include <vm/vm_map.h>
79#include <vm/vm_page.h>
80#include <vm/vm_pager.h>
81#include <vm/vnode_pager.h>
82#include <vm/vm_zone.h>
83
84static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
85
86static void insmntque __P((struct vnode *vp, struct mount *mp));
87static void vclean __P((struct vnode *vp, int flags, struct proc *p));
88static unsigned long numvnodes;
89SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
90
91enum vtype iftovt_tab[16] = {
92 VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
93 VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
94};
95int vttoif_tab[9] = {
96 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
97 S_IFSOCK, S_IFIFO, S_IFMT,
98};
99
100static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */
101
102static u_long wantfreevnodes = 25;
103SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
104static u_long freevnodes = 0;
105SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
106
107static int reassignbufcalls;
108SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
109static int reassignbufloops;
110SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
111static int reassignbufsortgood;
112SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
113static int reassignbufsortbad;
114SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
115static int reassignbufmethod = 1;
116SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
117
118#ifdef ENABLE_VFS_IOOPT
119int vfs_ioopt = 0;
120SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
121#endif
122
123struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist); /* mounted fs */
124struct simplelock mountlist_slock;
125struct simplelock mntvnode_slock;
126int nfs_mount_type = -1;
127#ifndef NULL_SIMPLELOCKS
128static struct simplelock mntid_slock;
129static struct simplelock vnode_free_list_slock;
130static struct simplelock spechash_slock;
131#endif
132struct nfs_public nfs_pub; /* publicly exported FS */
133static vm_zone_t vnode_zone;
134int prtactive = 0; /* 1 => print out reclaim of active vnodes */
135
136/*
137 * The workitem queue.
138 */
139#define SYNCER_MAXDELAY 32
140static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
141time_t syncdelay = 30; /* max time to delay syncing data */
142time_t filedelay = 30; /* time to delay syncing files */
143SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
144time_t dirdelay = 29; /* time to delay syncing directories */
145SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
146time_t metadelay = 28; /* time to delay syncing metadata */
147SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
148static int rushjob; /* number of slots to run ASAP */
149static int stat_rush_requests; /* number of times I/O speeded up */
150SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
151
152static int syncer_delayno = 0;
153static long syncer_mask;
154LIST_HEAD(synclist, vnode);
155static struct synclist *syncer_workitem_pending;
156
157int desiredvnodes;
158SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
159 &desiredvnodes, 0, "Maximum number of vnodes");
160
161static void vfs_free_addrlist __P((struct netexport *nep));
162static int vfs_free_netcred __P((struct radix_node *rn, void *w));
163static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
164 struct export_args *argp));
165
166/*
167 * Initialize the vnode management data structures.
168 */
169void
170vntblinit()
171{
172
173 desiredvnodes = maxproc + cnt.v_page_count / 4;
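	/*
	 * For example (illustrative arithmetic only): with maxproc at 532
	 * and 16384 physical pages of 4K each, this yields 532 + 4096 =
	 * 4628 vnodes.  The value can be raised at run time through the
	 * kern.maxvnodes sysctl declared above.
	 */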
174 simple_lock_init(&mntvnode_slock);
175 simple_lock_init(&mntid_slock);
176 simple_lock_init(&spechash_slock);
177 TAILQ_INIT(&vnode_free_list);
178 simple_lock_init(&vnode_free_list_slock);
179 vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
180 /*
181 * Initialize the filesystem syncer.
182 */
183 syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
184 &syncer_mask);
185 syncer_maxdelay = syncer_mask + 1;
186}
187
188/*
189 * Mark a mount point as busy. Used to synchronize access and to delay
190 * unmounting. Interlock is not released on failure.
191 */
192int
193vfs_busy(mp, flags, interlkp, p)
194 struct mount *mp;
195 int flags;
196 struct simplelock *interlkp;
197 struct proc *p;
198{
199 int lkflags;
200
201 if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
202 if (flags & LK_NOWAIT)
203 return (ENOENT);
204 mp->mnt_kern_flag |= MNTK_MWAIT;
205 if (interlkp) {
206 simple_unlock(interlkp);
207 }
208 /*
209 * Since all busy locks are shared except the exclusive
210 * lock granted when unmounting, the only place that a
211 * wakeup needs to be done is at the release of the
212 * exclusive lock at the end of dounmount.
213 */
214 tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
215 if (interlkp) {
216 simple_lock(interlkp);
217 }
218 return (ENOENT);
219 }
220 lkflags = LK_SHARED | LK_NOPAUSE;
221 if (interlkp)
222 lkflags |= LK_INTERLOCK;
223 if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
224 panic("vfs_busy: unexpected lock failure");
225 return (0);
226}
227
228/*
229 * Free a busy filesystem.
230 */
231void
232vfs_unbusy(mp, p)
233 struct mount *mp;
234 struct proc *p;
235{
236
237 lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
238}
239
240/*
241 * Lookup a filesystem type, and if found allocate and initialize
242 * a mount structure for it.
243 *
244 * Devname is usually updated by mount(8) after booting.
245 */
246int
247vfs_rootmountalloc(fstypename, devname, mpp)
248 char *fstypename;
249 char *devname;
250 struct mount **mpp;
251{
252 struct proc *p = curproc; /* XXX */
253 struct vfsconf *vfsp;
254 struct mount *mp;
255
256 if (fstypename == NULL)
257 return (ENODEV);
258 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
259 if (!strcmp(vfsp->vfc_name, fstypename))
260 break;
261 if (vfsp == NULL)
262 return (ENODEV);
263 mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
264 bzero((char *)mp, (u_long)sizeof(struct mount));
265 lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
266 (void)vfs_busy(mp, LK_NOWAIT, 0, p);
267 LIST_INIT(&mp->mnt_vnodelist);
268 mp->mnt_vfc = vfsp;
269 mp->mnt_op = vfsp->vfc_vfsops;
270 mp->mnt_flag = MNT_RDONLY;
271 mp->mnt_vnodecovered = NULLVP;
272 vfsp->vfc_refcount++;
273 mp->mnt_iosize_max = DFLTPHYS;
274 mp->mnt_stat.f_type = vfsp->vfc_typenum;
275 mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
276 strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
277 mp->mnt_stat.f_mntonname[0] = '/';
278 mp->mnt_stat.f_mntonname[1] = 0;
279 (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
280 *mpp = mp;
281 return (0);
282}
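
/*
 * A hypothetical sketch of how a root-mount path might use the routine
 * above (the filesystem name and device name here are examples only):
 *
 *	struct mount *mp;
 *	int error;
 *
 *	if ((error = vfs_rootmountalloc("ufs", "da0s1a", &mp)) == 0)
 *		error = VFS_MOUNT(mp, NULL, NULL, NULL, p);
 */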
283
284/*
285 * Find an appropriate filesystem to use for the root. If a filesystem
286 * has not been preselected, walk through the list of known filesystems,
287 * trying those that have mountroot routines, until one
288 * works or we have tried them all.
289 */
290#ifdef notdef /* XXX JH */
291int
292lite2_vfs_mountroot()
293{
294 struct vfsconf *vfsp;
295 extern int (*lite2_mountroot) __P((void));
296 int error;
297
298 if (lite2_mountroot != NULL)
299 return ((*lite2_mountroot)());
300 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
301 if (vfsp->vfc_mountroot == NULL)
302 continue;
303 if ((error = (*vfsp->vfc_mountroot)()) == 0)
304 return (0);
305 printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
306 }
307 return (ENODEV);
308}
309#endif
310
311/*
312 * Lookup a mount point by filesystem identifier.
313 */
314struct mount *
315vfs_getvfs(fsid)
316 fsid_t *fsid;
317{
318 register struct mount *mp;
319
320 simple_lock(&mountlist_slock);
321 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
322 if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
323 mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
324 simple_unlock(&mountlist_slock);
325 return (mp);
326 }
327 }
328 simple_unlock(&mountlist_slock);
329 return ((struct mount *) 0);
330}
331
332/*
333 * Get a new unique fsid. Try to make its val[0] unique, since this value
334 * will be used to create fake device numbers for stat(). Also try (but
335 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
336 * support 16-bit device numbers. We end up with unique val[0]'s for the
337 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
338 *
339 * Keep in mind that several mounts may be running in parallel. Starting
340 * the search one past where the previous search terminated is both a
341 * micro-optimization and a defense against returning the same fsid to
342 * different mounts.
343 */
344void
345vfs_getnewfsid(mp)
346 struct mount *mp;
347{
348 static u_int16_t mntid_base;
349 fsid_t tfsid;
350 int mtype;
351
352 simple_lock(&mntid_slock);
353 mtype = mp->mnt_vfc->vfc_typenum;
354 tfsid.val[1] = mtype;
355 mtype = (mtype & 0xFF) << 24;
356 for (;;) {
357 tfsid.val[0] = makeudev(255,
358 mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
359 mntid_base++;
360 if (vfs_getvfs(&tfsid) == NULL)
361 break;
362 }
363 mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
364 mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
365 simple_unlock(&mntid_slock);
366}
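
/*
 * For illustration, the second argument handed to makeudev() above is
 * laid out as follows (a sketch of the arithmetic, not used by the code):
 *
 *	bits 24-31	low byte of the filesystem type number
 *	bits 16-23	high byte of mntid_base
 *	bits  0- 7	low byte of mntid_base
 *
 * so a type number of 5 and an mntid_base of 0x1234, for example, give
 * the value 0x05120034.
 */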
367
368/*
369 * Knob to control the precision of file timestamps:
370 *
371 * 0 = seconds only; nanoseconds zeroed.
372 * 1 = seconds and nanoseconds, accurate within 1/HZ.
373 * 2 = seconds and nanoseconds, truncated to microseconds.
374 * >=3 = seconds and nanoseconds, maximum precision.
375 */
376enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
377
378static int timestamp_precision = TSP_SEC;
379SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
380 &timestamp_precision, 0, "");
381
382/*
383 * Get a current timestamp.
384 */
385void
386vfs_timestamp(tsp)
387 struct timespec *tsp;
388{
389 struct timeval tv;
390
391 switch (timestamp_precision) {
392 case TSP_SEC:
393 tsp->tv_sec = time_second;
394 tsp->tv_nsec = 0;
395 break;
396 case TSP_HZ:
397 getnanotime(tsp);
398 break;
399 case TSP_USEC:
400 microtime(&tv);
401 TIMEVAL_TO_TIMESPEC(&tv, tsp);
402 break;
403 case TSP_NSEC:
404 default:
405 nanotime(tsp);
406 break;
407 }
408}
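
/*
 * A typical use, sketched for illustration ("ip" is a hypothetical
 * per-filesystem inode, not something defined in this file):
 *
 *	struct timespec ts;
 *
 *	vfs_timestamp(&ts);
 *	ip->i_mtime = ts;
 *
 * The precision of the result is governed by vfs.timestamp_precision.
 */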
409
410/*
411 * Set vnode attributes to VNOVAL
412 */
413void
414vattr_null(vap)
415 register struct vattr *vap;
416{
417
418 vap->va_type = VNON;
419 vap->va_size = VNOVAL;
420 vap->va_bytes = VNOVAL;
421 vap->va_mode = VNOVAL;
422 vap->va_nlink = VNOVAL;
423 vap->va_uid = VNOVAL;
424 vap->va_gid = VNOVAL;
425 vap->va_fsid = VNOVAL;
426 vap->va_fileid = VNOVAL;
427 vap->va_blocksize = VNOVAL;
428 vap->va_rdev = VNOVAL;
429 vap->va_atime.tv_sec = VNOVAL;
430 vap->va_atime.tv_nsec = VNOVAL;
431 vap->va_mtime.tv_sec = VNOVAL;
432 vap->va_mtime.tv_nsec = VNOVAL;
433 vap->va_ctime.tv_sec = VNOVAL;
434 vap->va_ctime.tv_nsec = VNOVAL;
435 vap->va_flags = VNOVAL;
436 vap->va_gen = VNOVAL;
437 vap->va_vaflags = 0;
438}
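
/*
 * Callers normally reach this through the VATTR_NULL() macro so that a
 * subsequent VOP_SETATTR() only acts on the fields explicitly filled in,
 * e.g. (an illustrative sketch):
 *
 *	struct vattr va;
 *
 *	VATTR_NULL(&va);
 *	va.va_size = length;
 *	error = VOP_SETATTR(vp, &va, cred, p);
 */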
439
440/*
441 * Routines having to do with the management of the vnode table.
442 */
443
444/*
445 * Return the next vnode from the free list.
446 */
447int
448getnewvnode(tag, mp, vops, vpp)
449 enum vtagtype tag;
450 struct mount *mp;
451 vop_t **vops;
452 struct vnode **vpp;
453{
454 int s, count;
455 struct proc *p = curproc; /* XXX */
456 struct vnode *vp = NULL;
457 struct mount *vnmp;
458 vm_object_t object;
459
460 /*
461 * We take the least recently used vnode from the freelist
462 * if we can get it and it has no cached pages, and no
463 * namecache entries are relative to it.
464 * Otherwise we allocate a new vnode.
465 */
466
467 s = splbio();
468 simple_lock(&vnode_free_list_slock);
469
470 if (wantfreevnodes && freevnodes < wantfreevnodes) {
471 vp = NULL;
472 } else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
473 /*
474 * XXX: this is only here to be backwards compatible
475 */
476 vp = NULL;
477 } else for (count = 0; count < freevnodes; count++) {
478 vp = TAILQ_FIRST(&vnode_free_list);
479 if (vp == NULL || vp->v_usecount)
480 panic("getnewvnode: free vnode isn't");
481 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
482 /*
483 * Don't recycle if active in the namecache or
484 * if it still has cached pages or we cannot get
485 * its interlock.
486 */
487 if (LIST_FIRST(&vp->v_cache_src) != NULL ||
488 (VOP_GETVOBJECT(vp, &object) == 0 &&
489 (object->resident_page_count || object->ref_count)) ||
490 !simple_lock_try(&vp->v_interlock)) {
491 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
492 vp = NULL;
493 continue;
494 }
495 /*
496 * Skip over it if its filesystem is being suspended.
497 */
498 if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0)
499 break;
500 simple_unlock(&vp->v_interlock);
501 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
502 vp = NULL;
503 }
504 if (vp) {
505 vp->v_flag |= VDOOMED;
506 freevnodes--;
507 simple_unlock(&vnode_free_list_slock);
508 cache_purge(vp);
509 vp->v_lease = NULL;
510 if (vp->v_type != VBAD) {
511 vgonel(vp, p);
512 } else {
513 simple_unlock(&vp->v_interlock);
514 }
515 vn_finished_write(vnmp);
516
517#ifdef INVARIANTS
518 {
519 int s;
520
521 if (vp->v_data)
522 panic("cleaned vnode isn't");
523 s = splbio();
524 if (vp->v_numoutput)
525 panic("Clean vnode has pending I/O's");
526 splx(s);
527 if (vp->v_writecount != 0)
528 panic("Non-zero write count");
529 }
530#endif
531 vp->v_flag = 0;
532 vp->v_lastw = 0;
533 vp->v_lasta = 0;
534 vp->v_cstart = 0;
535 vp->v_clen = 0;
536 vp->v_socket = 0;
537 } else {
538 simple_unlock(&vnode_free_list_slock);
539 vp = (struct vnode *) zalloc(vnode_zone);
540 bzero((char *) vp, sizeof *vp);
541 simple_lock_init(&vp->v_interlock);
542 vp->v_dd = vp;
543 cache_purge(vp);
544 LIST_INIT(&vp->v_cache_src);
545 TAILQ_INIT(&vp->v_cache_dst);
546 numvnodes++;
547 }
548
549 TAILQ_INIT(&vp->v_cleanblkhd);
550 TAILQ_INIT(&vp->v_dirtyblkhd);
551 vp->v_type = VNON;
552 vp->v_tag = tag;
553 vp->v_op = vops;
554 insmntque(vp, mp);
555 *vpp = vp;
556 vp->v_usecount = 1;
557 vp->v_data = 0;
558 splx(s);
559
560 vfs_object_create(vp, p, p->p_ucred);
561 return (0);
562}
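
/*
 * An illustrative caller sketch (the vnodeop vector and "ip" below are
 * hypothetical, not defined in this file): a filesystem's vget-style
 * routine typically allocates and initializes a vnode like
 *
 *	error = getnewvnode(VT_UFS, mp, foofs_vnodeop_p, &vp);
 *	if (error)
 *		return (error);
 *	vp->v_data = ip;
 *
 * before handing it back to its caller.
 */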
563
564/*
565 * Move a vnode from one mount queue to another.
566 */
567static void
568insmntque(vp, mp)
569 register struct vnode *vp;
570 register struct mount *mp;
571{
572
573 simple_lock(&mntvnode_slock);
574 /*
575 * Delete from old mount point vnode list, if on one.
576 */
577 if (vp->v_mount != NULL)
578 LIST_REMOVE(vp, v_mntvnodes);
579 /*
580 * Insert into list of vnodes for the new mount point, if available.
581 */
582 if ((vp->v_mount = mp) == NULL) {
583 simple_unlock(&mntvnode_slock);
584 return;
585 }
586 LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
587 simple_unlock(&mntvnode_slock);
588}
589
590/*
591 * Update outstanding I/O count and do wakeup if requested.
592 */
593void
594vwakeup(bp)
595 register struct buf *bp;
596{
597 register struct vnode *vp;
598
599 bp->b_flags &= ~B_WRITEINPROG;
600 if ((vp = bp->b_vp)) {
601 vp->v_numoutput--;
602 if (vp->v_numoutput < 0)
603 panic("vwakeup: neg numoutput");
604 if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
605 vp->v_flag &= ~VBWAIT;
606 wakeup((caddr_t) &vp->v_numoutput);
607 }
608 }
609}
610
611/*
612 * Flush out and invalidate all buffers associated with a vnode.
613 * Called with the underlying object locked.
614 */
615int
616vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
617 register struct vnode *vp;
618 int flags;
619 struct ucred *cred;
620 struct proc *p;
621 int slpflag, slptimeo;
622{
623 register struct buf *bp;
624 struct buf *nbp, *blist;
625 int s, error;
626 vm_object_t object;
627
628 if (flags & V_SAVE) {
629 s = splbio();
630 while (vp->v_numoutput) {
631 vp->v_flag |= VBWAIT;
632 error = tsleep((caddr_t)&vp->v_numoutput,
633 slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
634 if (error) {
635 splx(s);
636 return (error);
637 }
638 }
639 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
640 splx(s);
641 if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
642 return (error);
643 s = splbio();
644 if (vp->v_numoutput > 0 ||
645 !TAILQ_EMPTY(&vp->v_dirtyblkhd))
646 panic("vinvalbuf: dirty bufs");
647 }
648 splx(s);
649 }
650 s = splbio();
651 for (;;) {
652 blist = TAILQ_FIRST(&vp->v_cleanblkhd);
653 if (!blist)
654 blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
655 if (!blist)
656 break;
657
658 for (bp = blist; bp; bp = nbp) {
659 nbp = TAILQ_NEXT(bp, b_vnbufs);
660 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
661 error = BUF_TIMELOCK(bp,
662 LK_EXCLUSIVE | LK_SLEEPFAIL,
663 "vinvalbuf", slpflag, slptimeo);
664 if (error == ENOLCK)
665 break;
666 splx(s);
667 return (error);
668 }
669 /*
670 * XXX Since there are no node locks for NFS, I
671 * believe there is a slight chance that a delayed
672 * write will occur while sleeping just above, so
673 * check for it. Note that vfs_bio_awrite expects
674 * buffers to reside on a queue, while VOP_BWRITE and
675 * brelse do not.
676 */
677 if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
678 (flags & V_SAVE)) {
679
680 if (bp->b_vp == vp) {
681 if (bp->b_flags & B_CLUSTEROK) {
682 BUF_UNLOCK(bp);
683 vfs_bio_awrite(bp);
684 } else {
685 bremfree(bp);
686 bp->b_flags |= B_ASYNC;
687 BUF_WRITE(bp);
688 }
689 } else {
690 bremfree(bp);
691 (void) BUF_WRITE(bp);
692 }
693 break;
694 }
695 bremfree(bp);
696 bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
697 bp->b_flags &= ~B_ASYNC;
698 brelse(bp);
699 }
700 }
701
702 while (vp->v_numoutput > 0) {
703 vp->v_flag |= VBWAIT;
704 tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
705 }
706
707 splx(s);
708
709 /*
710 * Destroy the copy in the VM cache, too.
711 */
712 simple_lock(&vp->v_interlock);
713 if (VOP_GETVOBJECT(vp, &object) == 0) {
714 vm_object_page_remove(object, 0, 0,
715 (flags & V_SAVE) ? TRUE : FALSE);
716 }
717 simple_unlock(&vp->v_interlock);
718
719 if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
720 panic("vinvalbuf: flush failed");
721 return (0);
722}
723
724/*
725 * Truncate a file's buffer and pages to a specified length. This
726 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
727 * sync activity.
728 */
729int
730vtruncbuf(vp, cred, p, length, blksize)
731 register struct vnode *vp;
732 struct ucred *cred;
733 struct proc *p;
734 off_t length;
735 int blksize;
736{
737 register struct buf *bp;
738 struct buf *nbp;
739 int s, anyfreed;
740 int trunclbn;
741
742 /*
743 * Round up to the *next* lbn.
744 */
745 trunclbn = (length + blksize - 1) / blksize;
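	/*
	 * e.g. with a blksize of 512, a length of 1 gives trunclbn 1:
	 * block 0, which still holds that byte, is kept, while blocks
	 * 1 and up are invalidated by the loops below.
	 */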
746
747 s = splbio();
748restart:
749 anyfreed = 1;
750 for (;anyfreed;) {
751 anyfreed = 0;
752 for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
753 nbp = TAILQ_NEXT(bp, b_vnbufs);
754 if (bp->b_lblkno >= trunclbn) {
755 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
756 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
757 goto restart;
758 } else {
759 bremfree(bp);
760 bp->b_flags |= (B_INVAL | B_RELBUF);
761 bp->b_flags &= ~B_ASYNC;
762 brelse(bp);
763 anyfreed = 1;
764 }
765 if (nbp &&
766 (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
767 (nbp->b_vp != vp) ||
768 (nbp->b_flags & B_DELWRI))) {
769 goto restart;
770 }
771 }
772 }
773
774 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
775 nbp = TAILQ_NEXT(bp, b_vnbufs);
776 if (bp->b_lblkno >= trunclbn) {
777 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
778 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
779 goto restart;
780 } else {
781 bremfree(bp);
782 bp->b_flags |= (B_INVAL | B_RELBUF);
783 bp->b_flags &= ~B_ASYNC;
784 brelse(bp);
785 anyfreed = 1;
786 }
787 if (nbp &&
788 (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
789 (nbp->b_vp != vp) ||
790 (nbp->b_flags & B_DELWRI) == 0)) {
791 goto restart;
792 }
793 }
794 }
795 }
796
797 if (length > 0) {
798restartsync:
799 for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
800 nbp = TAILQ_NEXT(bp, b_vnbufs);
801 if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
802 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
803 BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
804 goto restart;
805 } else {
806 bremfree(bp);
807 if (bp->b_vp == vp) {
808 bp->b_flags |= B_ASYNC;
809 } else {
810 bp->b_flags &= ~B_ASYNC;
811 }
812 BUF_WRITE(bp);
813 }
814 goto restartsync;
815 }
816
817 }
818 }
819
820 while (vp->v_numoutput > 0) {
821 vp->v_flag |= VBWAIT;
822 tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
823 }
824
825 splx(s);
826
827 vnode_pager_setsize(vp, length);
828
829 return (0);
830}
831
832/*
833 * Associate a buffer with a vnode.
834 */
835void
836bgetvp(vp, bp)
837 register struct vnode *vp;
838 register struct buf *bp;
839{
840 int s;
841
842 KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
843
844 vhold(vp);
845 bp->b_vp = vp;
846 bp->b_dev = vn_todev(vp);
847 /*
848 * Insert onto list for new vnode.
849 */
850 s = splbio();
851 bp->b_xflags |= BX_VNCLEAN;
852 bp->b_xflags &= ~BX_VNDIRTY;
853 TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
854 splx(s);
855}
856
857/*
858 * Disassociate a buffer from a vnode.
859 */
860void
861brelvp(bp)
862 register struct buf *bp;
863{
864 struct vnode *vp;
865 struct buflists *listheadp;
866 int s;
867
868 KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
869
870 /*
871 * Delete from old vnode list, if on one.
872 */
873 vp = bp->b_vp;
874 s = splbio();
875 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
876 if (bp->b_xflags & BX_VNDIRTY)
877 listheadp = &vp->v_dirtyblkhd;
878 else
879 listheadp = &vp->v_cleanblkhd;
880 TAILQ_REMOVE(listheadp, bp, b_vnbufs);
881 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
882 }
883 if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
884 vp->v_flag &= ~VONWORKLST;
885 LIST_REMOVE(vp, v_synclist);
886 }
887 splx(s);
888 bp->b_vp = (struct vnode *) 0;
889 vdrop(vp);
890}
891
892/*
893 * The workitem queue.
894 *
895 * It is useful to delay writes of file data and filesystem metadata
896 * for tens of seconds so that quickly created and deleted files need
897 * not waste disk bandwidth being created and removed. To realize this,
898 * we append vnodes to a "workitem" queue. When running with a soft
899 * updates implementation, most pending metadata dependencies should
900 * not wait for more than a few seconds. Thus, mounted-on block devices
901 * are delayed only about half the time that file data is delayed.
902 * Similarly, directory updates are more critical, so are only delayed
903 * about a third of the time that file data is delayed. Thus, there are
904 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
905 * one each second (driven off the filesystem syncer process). The
906 * syncer_delayno variable indicates the next queue that is to be processed.
907 * Items that need to be processed soon are placed in this queue:
908 *
909 * syncer_workitem_pending[syncer_delayno]
910 *
911 * A delay of fifteen seconds is done by placing the request fifteen
912 * entries later in the queue:
913 *
914 * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
915 *
916 */
917
918/*
919 * Add an item to the syncer work queue.
920 */
921static void
922vn_syncer_add_to_worklist(struct vnode *vp, int delay)
923{
924 int s, slot;
925
926 s = splbio();
927
928 if (vp->v_flag & VONWORKLST) {
929 LIST_REMOVE(vp, v_synclist);
930 }
931
932 if (delay > syncer_maxdelay - 2)
933 delay = syncer_maxdelay - 2;
934 slot = (syncer_delayno + delay) & syncer_mask;
935
936 LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
937 vp->v_flag |= VONWORKLST;
938 splx(s);
939}
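
/*
 * A worked example of the slot arithmetic above, assuming the default
 * 32-entry table (so syncer_mask is 31): with syncer_delayno at 20, a
 * requested delay of 15 seconds lands in slot (20 + 15) & 31 == 3, i.e.
 * the request wraps around the ring of queues.
 */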
940
941struct proc *updateproc;
942static void sched_sync __P((void));
943static struct kproc_desc up_kp = {
944 "syncer",
945 sched_sync,
946 &updateproc
947};
948SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
949
950/*
951 * System filesystem synchronizer daemon.
952 */
953void
954sched_sync(void)
955{
956 struct synclist *slp;
957 struct vnode *vp;
958 struct mount *mp;
959 long starttime;
960 int s;
961 struct proc *p = updateproc;
962
963 mtx_enter(&Giant, MTX_DEF);
964
965 EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, p,
966 SHUTDOWN_PRI_LAST);
967
968 for (;;) {
969 kproc_suspend_loop(p);
970
971 starttime = time_second;
972
973 /*
974 * Push files whose dirty time has expired. Be careful
975 * of interrupt race on slp queue.
976 */
977 s = splbio();
978 slp = &syncer_workitem_pending[syncer_delayno];
979 syncer_delayno += 1;
980 if (syncer_delayno == syncer_maxdelay)
981 syncer_delayno = 0;
982 splx(s);
983
984 while ((vp = LIST_FIRST(slp)) != NULL) {
985 if (VOP_ISLOCKED(vp, NULL) == 0 &&
986 vn_start_write(vp, &mp, V_NOWAIT) == 0) {
987 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
988 (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
989 VOP_UNLOCK(vp, 0, p);
990 vn_finished_write(mp);
991 }
992 s = splbio();
993 if (LIST_FIRST(slp) == vp) {
994 /*
995 * Note: v_tag VT_VFS vps can remain on the
996 * worklist too with no dirty blocks, but
997 * since sync_fsync() moves them to a different
998 * slot we are safe.
999 */
1000 if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
1001 !vn_isdisk(vp, NULL))
1002 panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
1003 /*
1004 * Put us back on the worklist. The worklist
1005 * routine will remove us from our current
1006 * position and then add us back in at a later
1007 * position.
1008 */
1009 vn_syncer_add_to_worklist(vp, syncdelay);
1010 }
1011 splx(s);
1012 }
1013
1014 /*
1015 * Do soft update processing.
1016 */
1017#ifdef SOFTUPDATES
1018 softdep_process_worklist(NULL);
1019#endif
1020
1021 /*
1022 * The variable rushjob allows the kernel to speed up the
1023 * processing of the filesystem syncer process. A rushjob
1024 * value of N tells the filesystem syncer to process the next
1025 * N seconds worth of work on its queue ASAP. Currently rushjob
1026 * is used by the soft update code to speed up the filesystem
1027 * syncer process when the incore state is getting so far
1028 * ahead of the disk that the kernel memory pool is being
1029 * threatened with exhaustion.
1030 */
1031 if (rushjob > 0) {
1032 rushjob -= 1;
1033 continue;
1034 }
1035 /*
1036 * If it has taken us less than a second to process the
1037 * current work, then wait. Otherwise start right over
1038 * again. We can still lose time if any single round
1039 * takes more than two seconds, but it does not really
1040 * matter as we are just trying to generally pace the
1041 * filesystem activity.
1042 */
1043 if (time_second == starttime)
1044 tsleep(&lbolt, PPAUSE, "syncer", 0);
1045 }
1046}
1047
1048/*
1049 * Request the syncer daemon to speed up its work.
1050 * We never push it to speed up more than half of its
1051 * normal turn time, otherwise it could take over the cpu.
1052 */
1053int
1054speedup_syncer()
1055{
1056 int s;
1057
1058 s = splhigh();
1059 if (updateproc->p_wchan == &lbolt)
1060 setrunnable(updateproc);
1061 splx(s);
1062 if (rushjob < syncdelay / 2) {
1063 rushjob += 1;
1064 stat_rush_requests += 1;
1065 return (1);
1066 }
1067 return(0);
1068}
1069
1070/*
1071 * Associate a p-buffer with a vnode.
1072 *
1073 * Also sets B_PAGING flag to indicate that vnode is not fully associated
1074 * with the buffer, i.e. the bp has not been linked into the vnode or
1075 * ref-counted.
1076 */
1077void
1078pbgetvp(vp, bp)
1079 register struct vnode *vp;
1080 register struct buf *bp;
1081{
1082
1083 KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
1084
1085 bp->b_vp = vp;
1086 bp->b_flags |= B_PAGING;
1087 bp->b_dev = vn_todev(vp);
1088}
1089
1090/*
1091 * Disassociate a p-buffer from a vnode.
1092 */
1093void
1094pbrelvp(bp)
1095 register struct buf *bp;
1096{
1097
1098 KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
1099
1100 /* XXX REMOVE ME */
1101 if (bp->b_vnbufs.tqe_next != NULL) {
1102 panic(
1103 "relpbuf(): b_vp was probably reassignbuf()d %p %x",
1104 bp,
1105 (int)bp->b_flags
1106 );
1107 }
1108 bp->b_vp = (struct vnode *) 0;
1109 bp->b_flags &= ~B_PAGING;
1110}
1111
1112void
1113pbreassignbuf(bp, newvp)
1114 struct buf *bp;
1115 struct vnode *newvp;
1116{
1117 if ((bp->b_flags & B_PAGING) == 0) {
1118 panic(
1119 "pbreassignbuf() on non phys bp %p",
1120 bp
1121 );
1122 }
1123 bp->b_vp = newvp;
1124}
1125
1126/*
1127 * Reassign a buffer from one vnode to another.
1128 * Used to assign file specific control information
1129 * (indirect blocks) to the vnode to which they belong.
1130 */
1131void
1132reassignbuf(bp, newvp)
1133 register struct buf *bp;
1134 register struct vnode *newvp;
1135{
1136 struct buflists *listheadp;
1137 int delay;
1138 int s;
1139
1140 if (newvp == NULL) {
1141 printf("reassignbuf: NULL");
1142 return;
1143 }
1144 ++reassignbufcalls;
1145
1146 /*
1147 * B_PAGING flagged buffers cannot be reassigned because their vp
1148 * is not fully linked in.
1149 */
1150 if (bp->b_flags & B_PAGING)
1151 panic("cannot reassign paging buffer");
1152
1153 s = splbio();
1154 /*
1155 * Delete from old vnode list, if on one.
1156 */
1157 if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
1158 if (bp->b_xflags & BX_VNDIRTY)
1159 listheadp = &bp->b_vp->v_dirtyblkhd;
1160 else
1161 listheadp = &bp->b_vp->v_cleanblkhd;
1162 TAILQ_REMOVE(listheadp, bp, b_vnbufs);
1163 bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1164 if (bp->b_vp != newvp) {
1165 vdrop(bp->b_vp);
1166 bp->b_vp = NULL; /* for clarification */
1167 }
1168 }
1169 /*
1170 * If dirty, put on list of dirty buffers; otherwise insert onto list
1171 * of clean buffers.
1172 */
1173 if (bp->b_flags & B_DELWRI) {
1174 struct buf *tbp;
1175
1176 listheadp = &newvp->v_dirtyblkhd;
1177 if ((newvp->v_flag & VONWORKLST) == 0) {
1178 switch (newvp->v_type) {
1179 case VDIR:
1180 delay = dirdelay;
1181 break;
1182 case VCHR:
1183 case VBLK:
1184 if (newvp->v_specmountpoint != NULL) {
1185 delay = metadelay;
1186 break;
1187 }
1188 /* fall through */
1189 default:
1190 delay = filedelay;
1191 }
1192 vn_syncer_add_to_worklist(newvp, delay);
1193 }
1194 bp->b_xflags |= BX_VNDIRTY;
1195 tbp = TAILQ_FIRST(listheadp);
1196 if (tbp == NULL ||
1197 bp->b_lblkno == 0 ||
1198 (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
1199 (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
1200 TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
1201 ++reassignbufsortgood;
1202 } else if (bp->b_lblkno < 0) {
1203 TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
1204 ++reassignbufsortgood;
1205 } else if (reassignbufmethod == 1) {
1206 /*
1207 * New sorting algorithm, only handle sequential case,
1208 * otherwise append to end (but before metadata)
1209 */
1210 if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
1211 (tbp->b_xflags & BX_VNDIRTY)) {
1212 /*
1213 * Found the best place to insert the buffer
1214 */
1215 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1216 ++reassignbufsortgood;
1217 } else {
1218 /*
1219 * Missed, append to end, but before meta-data.
1220 * We know that the head buffer in the list is
1221 * not meta-data due to prior conditionals.
1222 *
1223 * Indirect effects: NFS second stage write
1224 * tends to wind up here, giving maximum
1225 * distance between the unstable write and the
1226 * commit rpc.
1227 */
1228 tbp = TAILQ_LAST(listheadp, buflists);
1229 while (tbp && tbp->b_lblkno < 0)
1230 tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
1231 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1232 ++reassignbufsortbad;
1233 }
1234 } else {
1235 /*
1236 * Old sorting algorithm, scan queue and insert
1237 */
1238 struct buf *ttbp;
1239 while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
1240 (ttbp->b_lblkno < bp->b_lblkno)) {
1241 ++reassignbufloops;
1242 tbp = ttbp;
1243 }
1244 TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
1245 }
1246 } else {
1247 bp->b_xflags |= BX_VNCLEAN;
1248 TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
1249 if ((newvp->v_flag & VONWORKLST) &&
1250 TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
1251 newvp->v_flag &= ~VONWORKLST;
1252 LIST_REMOVE(newvp, v_synclist);
1253 }
1254 }
1255 if (bp->b_vp != newvp) {
1256 bp->b_vp = newvp;
1257 vhold(bp->b_vp);
1258 }
1259 splx(s);
1260}
1261
1262/*
1263 * Create a vnode for a block device.
1264 * Used for mounting the root file system.
1265 * XXX: This now changed to a VCHR due to the block/char merging.
1266 */
1267int
1268bdevvp(dev, vpp)
1269 dev_t dev;
1270 struct vnode **vpp;
1271{
1272 register struct vnode *vp;
1273 struct vnode *nvp;
1274 int error;
1275
1276 if (dev == NODEV) {
1277 *vpp = NULLVP;
1278 return (ENXIO);
1279 }
1280 error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
1281 if (error) {
1282 *vpp = NULLVP;
1283 return (error);
1284 }
1285 vp = nvp;
1286 vp->v_type = VCHR;
1287 addalias(vp, dev);
1288 *vpp = vp;
1289 return (0);
1290}
1291
1292/*
1293 * Add vnode to the alias list hung off the dev_t.
1294 *
1295 * The reason for this gunk is that multiple vnodes can reference
1296 * the same physical device, so checking vp->v_usecount to see
1297 * how many users there are is inadequate; the v_usecounts of
1298 * all the vnodes need to be accumulated. vcount() does that.
1299 */
1300struct vnode *
1301addaliasu(nvp, nvp_rdev)
1302 struct vnode *nvp;
1303 udev_t nvp_rdev;
1304{
1305 struct vnode *ovp;
1306 vop_t **ops;
1307 dev_t dev;
1308
1309 if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1310 panic("addaliasu on non-special vnode");
1311 dev = udev2dev(nvp_rdev, nvp->v_type == VBLK ? 1 : 0);
1312 /*
1313 * Check to see if we have a bdevvp vnode with no associated
1314 * filesystem. If so, we want to associate the filesystem of
1315 * the newly created vnode with the bdevvp vnode and
1316 * discard the newly created vnode rather than leaving the
1317 * bdevvp vnode lying around with no associated filesystem.
1318 */
1319 if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
1320 addalias(nvp, dev);
1321 return (nvp);
1322 }
1323 /*
1324 * Discard unneeded vnode, but save its node specific data.
1325 * Note that if there is a lock, it is carried over in the
1326 * node specific data to the replacement vnode.
1327 */
1328 vref(ovp);
1329 ovp->v_data = nvp->v_data;
1330 ovp->v_tag = nvp->v_tag;
1331 nvp->v_data = NULL;
1332 ops = nvp->v_op;
1333 nvp->v_op = ovp->v_op;
1334 ovp->v_op = ops;
1335 insmntque(ovp, nvp->v_mount);
1336 vrele(nvp);
1337 vgone(nvp);
1338 return (ovp);
1339}
1340
1341void
1342addalias(nvp, dev)
1343 struct vnode *nvp;
1344 dev_t dev;
1345{
1346
1347 if (nvp->v_type != VBLK && nvp->v_type != VCHR)
1348 panic("addalias on non-special vnode");
1349
1350 nvp->v_rdev = dev;
1351 simple_lock(&spechash_slock);
1352 SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
1353 simple_unlock(&spechash_slock);
1354}
1355
1356/*
1357 * Grab a particular vnode from the free list, increment its
1358 * reference count and lock it. The vnode lock bit is set if the
1359 * vnode is being eliminated in vgone. The process is awakened
1360 * when the transition is completed, and an error returned to
1361 * indicate that the vnode is no longer usable (possibly having
1362 * been changed to a new file system type).
1363 */
1364int
1365vget(vp, flags, p)
1366 register struct vnode *vp;
1367 int flags;
1368 struct proc *p;
1369{
1370 int error;
1371
1372 /*
1373 * If the vnode is in the process of being cleaned out for
1374 * another use, we wait for the cleaning to finish and then
1375 * return failure. Cleaning is determined by checking that
1376 * the VXLOCK flag is set.
1377 */
1378 if ((flags & LK_INTERLOCK) == 0) {
1379 simple_lock(&vp->v_interlock);
1380 }
1381 if (vp->v_flag & VXLOCK) {
1382 vp->v_flag |= VXWANT;
1383 simple_unlock(&vp->v_interlock);
1384 tsleep((caddr_t)vp, PINOD, "vget", 0);
1385 return (ENOENT);
1386 }
1387
1388 vp->v_usecount++;
1389
1390 if (VSHOULDBUSY(vp))
1391 vbusy(vp);
1392 if (flags & LK_TYPE_MASK) {
1393 if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
1394 /*
1395 * must expand vrele here because we do not want
1396 * to call VOP_INACTIVE if the reference count
1397 * drops back to zero since it was never really
1398 * active. We must remove it from the free list
1399 * before sleeping so that multiple processes do
1400 * not try to recycle it.
1401 */
1402 simple_lock(&vp->v_interlock);
1403 vp->v_usecount--;
1404 if (VSHOULDFREE(vp))
1405 vfree(vp);
1406 simple_unlock(&vp->v_interlock);
1407 }
1408 return (error);
1409 }
1410 simple_unlock(&vp->v_interlock);
1411 return (0);
1412}
1413
1414void
1415vref(struct vnode *vp)
1416{
1417 simple_lock(&vp->v_interlock);
1418 vp->v_usecount++;
1419 simple_unlock(&vp->v_interlock);
1420}
1421
1422/*
1423 * Vnode put/release.
1424 * If count drops to zero, call inactive routine and return to freelist.
1425 */
1426void
1427vrele(vp)
1428 struct vnode *vp;
1429{
1430 struct proc *p = curproc; /* XXX */
1431
1432 KASSERT(vp != NULL, ("vrele: null vp"));
1433 KASSERT(vp->v_writecount < vp->v_usecount, ("vrele: missed vn_close"));
1434
1435 simple_lock(&vp->v_interlock);
1436
1437 if (vp->v_usecount > 1) {
1438
1439 vp->v_usecount--;
1440 simple_unlock(&vp->v_interlock);
1441
1442 return;
1443 }
1444
1445 if (vp->v_usecount == 1) {
1446
1447 vp->v_usecount--;
1448 if (VSHOULDFREE(vp))
1449 vfree(vp);
1450 /*
1451 * If we are doing a vput, the node is already locked, and we must
1452 * call VOP_INACTIVE with the node locked. So, in the case of
1453 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1454 */
1455 if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
1456 VOP_INACTIVE(vp, p);
1457 }
1458
1459 } else {
1460#ifdef DIAGNOSTIC
1461 vprint("vrele: negative ref count", vp);
1462 simple_unlock(&vp->v_interlock);
1463#endif
1464 panic("vrele: negative ref cnt");
1465 }
1466}
1467
1468void
1469vput(vp)
1470 struct vnode *vp;
1471{
1472 struct proc *p = curproc; /* XXX */
1473
1474 KASSERT(vp != NULL, ("vput: null vp"));
1475 KASSERT(vp->v_writecount < vp->v_usecount, ("vput: missed vn_close"));
1476
1477 simple_lock(&vp->v_interlock);
1478
1479 if (vp->v_usecount > 1) {
1480
1481 vp->v_usecount--;
1482 VOP_UNLOCK(vp, LK_INTERLOCK, p);
1483 return;
1484
1485 }
1486
1487 if (vp->v_usecount == 1) {
1488
1489 vp->v_usecount--;
1490 if (VSHOULDFREE(vp))
1491 vfree(vp);
1492 /*
1493 * If we are doing a vput, the node is already locked, and we must
1494 * call VOP_INACTIVE with the node locked. So, in the case of
1495 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
1496 */
1497 simple_unlock(&vp->v_interlock);
1498 VOP_INACTIVE(vp, p);
1499
1500 } else {
1501#ifdef DIAGNOSTIC
1502 vprint("vput: negative ref count", vp);
1503#endif
1504 panic("vput: negative ref cnt");
1505 }
1506}
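
/*
 * The usual pairing, sketched for illustration: a caller needing a
 * locked, referenced vnode does something like
 *
 *	if ((error = vget(vp, LK_EXCLUSIVE, p)) != 0)
 *		return (error);
 *	...
 *	vput(vp);
 *
 * while a caller holding only a reference (from vref() or a lock-less
 * vget()) drops it with vrele() instead.
 */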
1507
1508/*
1509 * Somebody doesn't want the vnode recycled.
1510 */
1511void
1512vhold(vp)
1513 register struct vnode *vp;
1514{
1515 int s;
1516
1517 s = splbio();
1518 vp->v_holdcnt++;
1519 if (VSHOULDBUSY(vp))
1520 vbusy(vp);
1521 splx(s);
1522}
1523
1524/*
1525 * One less who cares about this vnode.
1526 */
1527void
1528vdrop(vp)
1529 register struct vnode *vp;
1530{
1531 int s;
1532
1533 s = splbio();
1534 if (vp->v_holdcnt <= 0)
1535 panic("vdrop: holdcnt");
1536 vp->v_holdcnt--;
1537 if (VSHOULDFREE(vp))
1538 vfree(vp);
1539 splx(s);
1540}
1541
1542/*
1543 * Remove any vnodes in the vnode table belonging to mount point mp.
1544 *
1545 * If MNT_NOFORCE is specified, there should not be any active ones,
1546 * return error if any are found (nb: this is a user error, not a
1547 * system error). If MNT_FORCE is specified, detach any active vnodes
1548 * that are found.
1549 */
1550#ifdef DIAGNOSTIC
1551static int busyprt = 0; /* print out busy vnodes */
1552SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
1553#endif
1554
1555int
1556vflush(mp, skipvp, flags)
1557 struct mount *mp;
1558 struct vnode *skipvp;
1559 int flags;
1560{
1561 struct proc *p = curproc; /* XXX */
1562 struct vnode *vp, *nvp;
1563 int busy = 0;
1564
1565 simple_lock(&mntvnode_slock);
1566loop:
1567 for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
1568 /*
1569 * Make sure this vnode wasn't reclaimed in getnewvnode().
1570 * Start over if it has (it won't be on the list anymore).
1571 */
1572 if (vp->v_mount != mp)
1573 goto loop;
1574 nvp = LIST_NEXT(vp, v_mntvnodes);
1575 /*
1576 * Skip over a selected vnode.
1577 */
1578 if (vp == skipvp)
1579 continue;
1580
1581 simple_lock(&vp->v_interlock);
1582 /*
1583 * Skip over vnodes marked VSYSTEM.
1584 */
1585 if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
1586 simple_unlock(&vp->v_interlock);
1587 continue;
1588 }
1589 /*
1590 * If WRITECLOSE is set, only flush out regular file vnodes
1591 * open for writing.
1592 */
1593 if ((flags & WRITECLOSE) &&
1594 (vp->v_writecount == 0 || vp->v_type != VREG)) {
1595 simple_unlock(&vp->v_interlock);
1596 continue;
1597 }
1598
1599 /*
1600 * With v_usecount == 0, all we need to do is clear out the
1601 * vnode data structures and we are done.
1602 */
1603 if (vp->v_usecount == 0) {
1604 simple_unlock(&mntvnode_slock);
1605 vgonel(vp, p);
1606 simple_lock(&mntvnode_slock);
1607 continue;
1608 }
1609
1610 /*
1611 * If FORCECLOSE is set, forcibly close the vnode. For block
1612 * or character devices, revert to an anonymous device. For
1613 * all other files, just kill them.
1614 */
1615 if (flags & FORCECLOSE) {
1616 simple_unlock(&mntvnode_slock);
1617 if (vp->v_type != VBLK && vp->v_type != VCHR) {
1618 vgonel(vp, p);
1619 } else {
1620 vclean(vp, 0, p);
1621 vp->v_op = spec_vnodeop_p;
1622 insmntque(vp, (struct mount *) 0);
1623 }
1624 simple_lock(&mntvnode_slock);
1625 continue;
1626 }
1627#ifdef DIAGNOSTIC
1628 if (busyprt)
1629 vprint("vflush: busy vnode", vp);
1630#endif
1631 simple_unlock(&vp->v_interlock);
1632 busy++;
1633 }
1634 simple_unlock(&mntvnode_slock);
1635 if (busy)
1636 return (EBUSY);
1637 return (0);
1638}
1639
1640/*
1641 * Disassociate the underlying file system from a vnode.
1642 */
1643static void
1644vclean(vp, flags, p)
1645 struct vnode *vp;
1646 int flags;
1647 struct proc *p;
1648{
1649 int active;
1650
1651 /*
1652 * Check to see if the vnode is in use. If so we have to reference it
1653 * before we clean it out so that its count cannot fall to zero and
1654 * generate a race against ourselves to recycle it.
1655 */
1656 if ((active = vp->v_usecount))
1657 vp->v_usecount++;
1658
1659 /*
1660 * Prevent the vnode from being recycled or brought into use while we
1661 * clean it out.
1662 */
1663 if (vp->v_flag & VXLOCK)
1664 panic("vclean: deadlock");
1665 vp->v_flag |= VXLOCK;
1666 /*
1667 * Even if the count is zero, the VOP_INACTIVE routine may still
1668 * have the object locked while it cleans it out. The VOP_LOCK
1669 * ensures that the VOP_INACTIVE routine is done with its work.
1670 * For active vnodes, it ensures that no other activity can
1671 * occur while the underlying object is being cleaned out.
1672 */
1673 VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
1674
1675 /*
1676 * Clean out any buffers associated with the vnode.
1677 * If the flush fails, just toss the buffers.
1678 */
1679 if (flags & DOCLOSE) {
1680 if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL)
1681 (void) vn_write_suspend_wait(vp, NULL, V_WAIT);
1682 if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0)
1683 vinvalbuf(vp, 0, NOCRED, p, 0, 0);
1684 }
1685
1686 VOP_DESTROYVOBJECT(vp);
1687
1688 /*
1689 * If purging an active vnode, it must be closed and
1690 * deactivated before being reclaimed. Note that the
1691 * VOP_INACTIVE will unlock the vnode.
1692 */
1693 if (active) {
1694 if (flags & DOCLOSE)
1695 VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
1696 VOP_INACTIVE(vp, p);
1697 } else {
1698 /*
1699 * Any other processes trying to obtain this lock must first
1700 * wait for VXLOCK to clear, then call the new lock operation.
1701 */
1702 VOP_UNLOCK(vp, 0, p);
1703 }
1704 /*
1705 * Reclaim the vnode.
1706 */
1707 if (VOP_RECLAIM(vp, p))
1708 panic("vclean: cannot reclaim");
1709
1710 if (active) {
1711 /*
1712 * Inline copy of vrele() since VOP_INACTIVE
1713 * has already been called.
1714 */
1715 simple_lock(&vp->v_interlock);
1716 if (--vp->v_usecount <= 0) {
1717#ifdef DIAGNOSTIC
1718 if (vp->v_usecount < 0 || vp->v_writecount != 0) {
1719 vprint("vclean: bad ref count", vp);
1720 panic("vclean: ref cnt");
1721 }
1722#endif
1723 vfree(vp);
1724 }
1725 simple_unlock(&vp->v_interlock);
1726 }
1727
1728 cache_purge(vp);
1729 if (vp->v_vnlock) {
1730 FREE(vp->v_vnlock, M_VNODE);
1731 vp->v_vnlock = NULL;
1732 }
1733
1734 if (VSHOULDFREE(vp))
1735 vfree(vp);
1736
1737 /*
1738 * Done with purge, notify sleepers of the grim news.
1739 */
1740 vp->v_op = dead_vnodeop_p;
1741 vn_pollgone(vp);
1742 vp->v_tag = VT_NON;
1743 vp->v_flag &= ~VXLOCK;
1744 if (vp->v_flag & VXWANT) {
1745 vp->v_flag &= ~VXWANT;
1746 wakeup((caddr_t) vp);
1747 }
1748}
1749
1750/*
1751 * Eliminate all activity associated with the requested vnode
1752 * and with all vnodes aliased to the requested vnode.
1753 */
1754int
1755vop_revoke(ap)
1756 struct vop_revoke_args /* {
1757 struct vnode *a_vp;
1758 int a_flags;
1759 } */ *ap;
1760{
1761 struct vnode *vp, *vq;
1762 dev_t dev;
1763
1764 KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
1765
1766 vp = ap->a_vp;
1767 /*
1768 * If a vgone (or vclean) is already in progress,
1769 * wait until it is done and return.
1770 */
1771 if (vp->v_flag & VXLOCK) {
1772 vp->v_flag |= VXWANT;
1773 simple_unlock(&vp->v_interlock);
1774 tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
1775 return (0);
1776 }
1777 dev = vp->v_rdev;
1778 for (;;) {
1779 simple_lock(&spechash_slock);
1780 vq = SLIST_FIRST(&dev->si_hlist);
1781 simple_unlock(&spechash_slock);
1782 if (!vq)
1783 break;
1784 vgone(vq);
1785 }
1786 return (0);
1787}
1788
1789/*
1790 * Recycle an unused vnode to the front of the free list.
1791 * Release the passed interlock if the vnode will be recycled.
1792 */
1793int
1794vrecycle(vp, inter_lkp, p)
1795 struct vnode *vp;
1796 struct simplelock *inter_lkp;
1797 struct proc *p;
1798{
1799
1800 simple_lock(&vp->v_interlock);
1801 if (vp->v_usecount == 0) {
1802 if (inter_lkp) {
1803 simple_unlock(inter_lkp);
1804 }
1805 vgonel(vp, p);
1806 return (1);
1807 }
1808 simple_unlock(&vp->v_interlock);
1809 return (0);
1810}
1811
1812/*
1813 * Eliminate all activity associated with a vnode
1814 * in preparation for reuse.
1815 */
1816void
1817vgone(vp)
1818 register struct vnode *vp;
1819{
1820 struct proc *p = curproc; /* XXX */
1821
1822 simple_lock(&vp->v_interlock);
1823 vgonel(vp, p);
1824}
1825
1826/*
1827 * vgone, with the vp interlock held.
1828 */
1829void
1830vgonel(vp, p)
1831 struct vnode *vp;
1832 struct proc *p;
1833{
1834 int s;
1835
1836 /*
1837 * If a vgone (or vclean) is already in progress,
1838 * wait until it is done and return.
1839 */
1840 if (vp->v_flag & VXLOCK) {
1841 vp->v_flag |= VXWANT;
1842 simple_unlock(&vp->v_interlock);
1843 tsleep((caddr_t)vp, PINOD, "vgone", 0);
1844 return;
1845 }
1846
1847 /*
1848 * Clean out the filesystem specific data.
1849 */
1850 vclean(vp, DOCLOSE, p);
1851 simple_lock(&vp->v_interlock);
1852
1853 /*
1854 * Delete from old mount point vnode list, if on one.
1855 */
1856 if (vp->v_mount != NULL)
1857 insmntque(vp, (struct mount *)0);
1858 /*
1859 * If special device, remove it from special device alias list
1860 * if it is on one.
1861 */
1862 if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_rdev != NULL) {
1863 simple_lock(&spechash_slock);
1864 SLIST_REMOVE(&vp->v_hashchain, vp, vnode, v_specnext);
1864 SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
1865 freedev(vp->v_rdev);
1866 simple_unlock(&spechash_slock);
1867 vp->v_rdev = NULL;
1868 }
1869
1870 /*
1871 * If it is on the freelist and not already at the head,
1872 * move it to the head of the list. The test of the
1873 * VDOOMED flag and the reference count of zero is because
1874 * it will be removed from the free list by getnewvnode,
1875 * but will not have its reference count incremented until
1876 * after calling vgone. If the reference count were
1877 * incremented first, vgone would (incorrectly) try to
1878 * close the previous instance of the underlying object.
1879 */
1880 if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
1881 s = splbio();
1882 simple_lock(&vnode_free_list_slock);
1883 if (vp->v_flag & VFREE)
1884 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
1885 else
1886 freevnodes++;
1887 vp->v_flag |= VFREE;
1888 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
1889 simple_unlock(&vnode_free_list_slock);
1890 splx(s);
1891 }
1892
1893 vp->v_type = VBAD;
1894 simple_unlock(&vp->v_interlock);
1895}
1896
1897/*
1898 * Lookup a vnode by device number.
1899 */
1900int
1901vfinddev(dev, type, vpp)
1902 dev_t dev;
1903 enum vtype type;
1904 struct vnode **vpp;
1905{
1906 struct vnode *vp;
1907
1908 simple_lock(&spechash_slock);
1909 SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
1910 if (type == vp->v_type) {
1911 *vpp = vp;
1912 simple_unlock(&spechash_slock);
1913 return (1);
1914 }
1915 }
1916 simple_unlock(&spechash_slock);
1917 return (0);
1918}
1919
1920/*
1921 * Calculate the total number of references to a special device.
1922 */
1923int
1924vcount(vp)
1925 struct vnode *vp;
1926{
1927 struct vnode *vq;
1928 int count;
1929
1930 count = 0;
1931 simple_lock(&spechash_slock);
1932 SLIST_FOREACH(vq, &vp->v_hashchain, v_specnext)
1932 SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext)
1933 count += vq->v_usecount;
1934 simple_unlock(&spechash_slock);
1935 return (count);
1936}
1937
1938/*
1939 * Same as above, but using the dev_t as argument
1940 */
1941
1942int
1943count_dev(dev)
1944 dev_t dev;
1945{
1946 struct vnode *vp;
1947
1948 vp = SLIST_FIRST(&dev->si_hlist);
1949 if (vp == NULL)
1950 return (0);
1951 return(vcount(vp));
1952}
1953
1954/*
1955 * Print out a description of a vnode.
1956 */
1957static char *typename[] =
1958{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
1959
1960void
1961vprint(label, vp)
1962 char *label;
1963 struct vnode *vp;
1964{
1965 char buf[96];
1966
1967 if (label != NULL)
1968 printf("%s: %p: ", label, (void *)vp);
1969 else
1970 printf("%p: ", (void *)vp);
1971 printf("type %s, usecount %d, writecount %d, refcount %d,",
1972 typename[vp->v_type], vp->v_usecount, vp->v_writecount,
1973 vp->v_holdcnt);
1974 buf[0] = '\0';
1975 if (vp->v_flag & VROOT)
1976 strcat(buf, "|VROOT");
1977 if (vp->v_flag & VTEXT)
1978 strcat(buf, "|VTEXT");
1979 if (vp->v_flag & VSYSTEM)
1980 strcat(buf, "|VSYSTEM");
1981 if (vp->v_flag & VXLOCK)
1982 strcat(buf, "|VXLOCK");
1983 if (vp->v_flag & VXWANT)
1984 strcat(buf, "|VXWANT");
1985 if (vp->v_flag & VBWAIT)
1986 strcat(buf, "|VBWAIT");
1987 if (vp->v_flag & VDOOMED)
1988 strcat(buf, "|VDOOMED");
1989 if (vp->v_flag & VFREE)
1990 strcat(buf, "|VFREE");
1991 if (vp->v_flag & VOBJBUF)
1992 strcat(buf, "|VOBJBUF");
1993 if (buf[0] != '\0')
1994 printf(" flags (%s)", &buf[1]);
1995 if (vp->v_data == NULL) {
1996 printf("\n");
1997 } else {
1998 printf("\n\t");
1999 VOP_PRINT(vp);
2000 }
2001}
2002
2003#ifdef DDB
2004#include <ddb/ddb.h>
2005/*
2006 * List all of the locked vnodes in the system.
2007 * Called when debugging the kernel.
2008 */
2009DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
2010{
2011 struct proc *p = curproc; /* XXX */
2012 struct mount *mp, *nmp;
2013 struct vnode *vp;
2014
2015 printf("Locked vnodes\n");
2016 simple_lock(&mountlist_slock);
2017 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2018 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
2019 nmp = TAILQ_NEXT(mp, mnt_list);
2020 continue;
2021 }
2022 LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
2023 if (VOP_ISLOCKED(vp, NULL))
2024 vprint((char *)0, vp);
2025 }
2026 simple_lock(&mountlist_slock);
2027 nmp = TAILQ_NEXT(mp, mnt_list);
2028 vfs_unbusy(mp, p);
2029 }
2030 simple_unlock(&mountlist_slock);
2031}
2032#endif
2033
2034/*
2035 * Top level filesystem related information gathering.
2036 */
2037static int sysctl_ovfs_conf __P((SYSCTL_HANDLER_ARGS));
2038
2039static int
2040vfs_sysctl(SYSCTL_HANDLER_ARGS)
2041{
2042 int *name = (int *)arg1 - 1; /* XXX */
2043 u_int namelen = arg2 + 1; /* XXX */
2044 struct vfsconf *vfsp;
2045
2046#if 1 || defined(COMPAT_PRELITE2)
2047 /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
2048 if (namelen == 1)
2049 return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
2050#endif
2051
2052#ifdef notyet
2053 /* all sysctl names at this level are at least name and field */
2054 if (namelen < 2)
2055 return (ENOTDIR); /* overloaded */
2056 if (name[0] != VFS_GENERIC) {
2057 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2058 if (vfsp->vfc_typenum == name[0])
2059 break;
2060 if (vfsp == NULL)
2061 return (EOPNOTSUPP);
2062 return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
2063 oldp, oldlenp, newp, newlen, p));
2064 }
2065#endif
2066 switch (name[1]) {
2067 case VFS_MAXTYPENUM:
2068 if (namelen != 2)
2069 return (ENOTDIR);
2070 return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
2071 case VFS_CONF:
2072 if (namelen != 3)
2073 return (ENOTDIR); /* overloaded */
2074 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
2075 if (vfsp->vfc_typenum == name[2])
2076 break;
2077 if (vfsp == NULL)
2078 return (EOPNOTSUPP);
2079 return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
2080 }
2081 return (EOPNOTSUPP);
2082}
2083
2084SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
2085 "Generic filesystem");
2086
2087#if 1 || defined(COMPAT_PRELITE2)
2088
2089static int
2090sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
2091{
2092 int error;
2093 struct vfsconf *vfsp;
2094 struct ovfsconf ovfs;
2095
2096 for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
2097 ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
2098 strcpy(ovfs.vfc_name, vfsp->vfc_name);
2099 ovfs.vfc_index = vfsp->vfc_typenum;
2100 ovfs.vfc_refcount = vfsp->vfc_refcount;
2101 ovfs.vfc_flags = vfsp->vfc_flags;
2102 error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
2103 if (error)
2104 return error;
2105 }
2106 return 0;
2107}
2108
2109#endif /* 1 || COMPAT_PRELITE2 */
2110
2111#if 0
2112#define KINFO_VNODESLOP 10
2113/*
2114 * Dump vnode list (via sysctl).
2115 * Copyout address of vnode followed by vnode.
2116 */
2117/* ARGSUSED */
2118static int
2119sysctl_vnode(SYSCTL_HANDLER_ARGS)
2120{
2121 struct proc *p = curproc; /* XXX */
2122 struct mount *mp, *nmp;
2123 struct vnode *nvp, *vp;
2124 int error;
2125
2126#define VPTRSZ sizeof (struct vnode *)
2127#define VNODESZ sizeof (struct vnode)
2128
2129 req->lock = 0;
2130 if (!req->oldptr) /* Make an estimate */
2131 return (SYSCTL_OUT(req, 0,
2132 (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
2133
2134 simple_lock(&mountlist_slock);
2135 for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
2136 if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
2137 nmp = TAILQ_NEXT(mp, mnt_list);
2138 continue;
2139 }
2140again:
2141 simple_lock(&mntvnode_slock);
2142 for (vp = LIST_FIRST(&mp->mnt_vnodelist);
2143 vp != NULL;
2144 vp = nvp) {
2145 /*
2146 * Check that the vp is still associated with
2147 * this filesystem. RACE: could have been
2148 * recycled onto the same filesystem.
2149 */
2150 if (vp->v_mount != mp) {
2151 simple_unlock(&mntvnode_slock);
2152 goto again;
2153 }
2154 nvp = LIST_NEXT(vp, v_mntvnodes);
2155 simple_unlock(&mntvnode_slock);
2156 if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
2157 (error = SYSCTL_OUT(req, vp, VNODESZ)))
2158 return (error);
2159 simple_lock(&mntvnode_slock);
2160 }
2161 simple_unlock(&mntvnode_slock);
2162 simple_lock(&mountlist_slock);
2163 nmp = TAILQ_NEXT(mp, mnt_list);
2164 vfs_unbusy(mp, p);
2165 }
2166 simple_unlock(&mountlist_slock);
2167
2168 return (0);
2169}
2170#endif
2171
2172/*
2173 * XXX
2174 * Exporting the vnode list on large systems causes them to crash.
2175 * Exporting the vnode list on medium systems causes sysctl to coredump.
2176 */
2177#if 0
2178SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
2179 0, 0, sysctl_vnode, "S,vnode", "");
2180#endif
2181
2182/*
2183 * Check to see if a filesystem is mounted on a block device.
2184 */
2185int
2186vfs_mountedon(vp)
2187 struct vnode *vp;
2188{
2189
2190 if (vp->v_specmountpoint != NULL)
2191 return (EBUSY);
2192 return (0);
2193}
2194
2195/*
2196 * Unmount all filesystems. The list is traversed in reverse order
2197 * of mounting to avoid dependencies.
2198 */
2199void
2200vfs_unmountall()
2201{
2202 struct mount *mp;
2203 struct proc *p;
2204 int error;
2205
2206 if (curproc != NULL)
2207 p = curproc;
2208 else
2209 p = initproc; /* XXX XXX should this be proc0? */
2210 /*
2211 * Since this only runs when rebooting, it is not interlocked.
2212 */
2213 while(!TAILQ_EMPTY(&mountlist)) {
2214 mp = TAILQ_LAST(&mountlist, mntlist);
2215 error = dounmount(mp, MNT_FORCE, p);
2216 if (error) {
2217 TAILQ_REMOVE(&mountlist, mp, mnt_list);
2218 printf("unmount of %s failed (",
2219 mp->mnt_stat.f_mntonname);
2220 if (error == EBUSY)
2221 printf("BUSY)\n");
2222 else
2223 printf("%d)\n", error);
2224 } else {
2225 /* The unmount has removed mp from the mountlist */
2226 }
2227 }
2228}
2229
2230/*
2231 * Build hash lists of net addresses and hang them off the mount point.
2232 * Called by ufs_mount() to set up the lists of export addresses.
2233 */
2234static int
2235vfs_hang_addrlist(mp, nep, argp)
2236 struct mount *mp;
2237 struct netexport *nep;
2238 struct export_args *argp;
2239{
2240 register struct netcred *np;
2241 register struct radix_node_head *rnh;
2242 register int i;
2243 struct radix_node *rn;
2244 struct sockaddr *saddr, *smask = 0;
2245 struct domain *dom;
2246 int error;
2247
2248 if (argp->ex_addrlen == 0) {
2249 if (mp->mnt_flag & MNT_DEFEXPORTED)
2250 return (EPERM);
2251 np = &nep->ne_defexported;
2252 np->netc_exflags = argp->ex_flags;
2253 np->netc_anon = argp->ex_anon;
2254 np->netc_anon.cr_ref = 1;
2255 mp->mnt_flag |= MNT_DEFEXPORTED;
2256 return (0);
2257 }
2258 i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
2259 np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
2260 bzero((caddr_t) np, i);
2261 saddr = (struct sockaddr *) (np + 1);
2262 if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
2263 goto out;
2264 if (saddr->sa_len > argp->ex_addrlen)
2265 saddr->sa_len = argp->ex_addrlen;
2266 if (argp->ex_masklen) {
2267 smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
2268 error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
2269 if (error)
2270 goto out;
2271 if (smask->sa_len > argp->ex_masklen)
2272 smask->sa_len = argp->ex_masklen;
2273 }
2274 i = saddr->sa_family;
2275 if ((rnh = nep->ne_rtable[i]) == 0) {
2276 /*
2277 * Seems silly to initialize every AF when most are not used,
2278 * do so on demand here
2279 */
2280 for (dom = domains; dom; dom = dom->dom_next)
2281 if (dom->dom_family == i && dom->dom_rtattach) {
2282 dom->dom_rtattach((void **) &nep->ne_rtable[i],
2283 dom->dom_rtoffset);
2284 break;
2285 }
2286 if ((rnh = nep->ne_rtable[i]) == 0) {
2287 error = ENOBUFS;
2288 goto out;
2289 }
2290 }
2291 rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
2292 np->netc_rnodes);
2293 if (rn == 0 || np != (struct netcred *) rn) { /* already exists */
2294 error = EPERM;
2295 goto out;
2296 }
2297 np->netc_exflags = argp->ex_flags;
2298 np->netc_anon = argp->ex_anon;
2299 np->netc_anon.cr_ref = 1;
2300 return (0);
2301out:
2302 free(np, M_NETADDR);
2303 return (error);
2304}
2305
2306/* ARGSUSED */
2307static int
2308vfs_free_netcred(rn, w)
2309 struct radix_node *rn;
2310 void *w;
2311{
2312 register struct radix_node_head *rnh = (struct radix_node_head *) w;
2313
2314 (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
2315 free((caddr_t) rn, M_NETADDR);
2316 return (0);
2317}
2318
2319/*
2320 * Free the net address hash lists that are hanging off the mount points.
2321 */
2322static void
2323vfs_free_addrlist(nep)
2324 struct netexport *nep;
2325{
2326 register int i;
2327 register struct radix_node_head *rnh;
2328
2329 for (i = 0; i <= AF_MAX; i++)
2330 if ((rnh = nep->ne_rtable[i])) {
2331 (*rnh->rnh_walktree) (rnh, vfs_free_netcred,
2332 (caddr_t) rnh);
2333 free((caddr_t) rnh, M_RTABLE);
2334 nep->ne_rtable[i] = 0;
2335 }
2336}
2337
2338int
2339vfs_export(mp, nep, argp)
2340 struct mount *mp;
2341 struct netexport *nep;
2342 struct export_args *argp;
2343{
2344 int error;
2345
2346 if (argp->ex_flags & MNT_DELEXPORT) {
2347 if (mp->mnt_flag & MNT_EXPUBLIC) {
2348 vfs_setpublicfs(NULL, NULL, NULL);
2349 mp->mnt_flag &= ~MNT_EXPUBLIC;
2350 }
2351 vfs_free_addrlist(nep);
2352 mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
2353 }
2354 if (argp->ex_flags & MNT_EXPORTED) {
2355 if (argp->ex_flags & MNT_EXPUBLIC) {
2356 if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
2357 return (error);
2358 mp->mnt_flag |= MNT_EXPUBLIC;
2359 }
2360 if ((error = vfs_hang_addrlist(mp, nep, argp)))
2361 return (error);
2362 mp->mnt_flag |= MNT_EXPORTED;
2363 }
2364 return (0);
2365}
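/*
 * Sketch of how a filesystem's MNT_UPDATE path might hand user-supplied
 * export arguments to vfs_export().  "struct mymount" and mm_export stand
 * in for the filesystem's private mount data; they are hypothetical, not
 * part of this file.
 */
struct mymount {				/* hypothetical per-fs mount data */
	struct netexport mm_export;		/* export lists hang off this */
	/* ... filesystem specific fields ... */
};

static int
myfs_do_export(mp, argp)
	struct mount *mp;
	struct export_args *argp;
{
	struct mymount *mmp = (struct mymount *)mp->mnt_data;

	/*
	 * vfs_export() interprets MNT_DELEXPORT, MNT_EXPORTED and
	 * MNT_EXPUBLIC in argp->ex_flags and updates mp->mnt_flag.
	 */
	return (vfs_export(mp, &mmp->mm_export, argp));
}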
2366
2367
2368/*
2369 * Set the publicly exported filesystem (WebNFS). Currently, only
2370 * one public filesystem is possible in the spec (RFC 2054 and RFC 2055).
2371 */
2372int
2373vfs_setpublicfs(mp, nep, argp)
2374 struct mount *mp;
2375 struct netexport *nep;
2376 struct export_args *argp;
2377{
2378 int error;
2379 struct vnode *rvp;
2380 char *cp;
2381
2382 /*
2383 * mp == NULL -> invalidate the current info, the FS is
2384 * no longer exported. May be called from either vfs_export
2385 * or unmount, so check if it hasn't already been done.
2386 */
2387 if (mp == NULL) {
2388 if (nfs_pub.np_valid) {
2389 nfs_pub.np_valid = 0;
2390 if (nfs_pub.np_index != NULL) {
2391 FREE(nfs_pub.np_index, M_TEMP);
2392 nfs_pub.np_index = NULL;
2393 }
2394 }
2395 return (0);
2396 }
2397
2398 /*
2399 * Only one allowed at a time.
2400 */
2401 if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
2402 return (EBUSY);
2403
2404 /*
2405 * Get real filehandle for root of exported FS.
2406 */
2407 bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
2408 nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
2409
2410 if ((error = VFS_ROOT(mp, &rvp)))
2411 return (error);
2412
2413 if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
2414 return (error);
2415
2416 vput(rvp);
2417
2418 /*
2419 * If an indexfile was specified, pull it in.
2420 */
2421 if (argp->ex_indexfile != NULL) {
2422 MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
2423 M_WAITOK);
2424 error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
2425 MAXNAMLEN, (size_t *)0);
2426 if (!error) {
2427 /*
2428 * Check for illegal filenames.
2429 */
2430 for (cp = nfs_pub.np_index; *cp; cp++) {
2431 if (*cp == '/') {
2432 error = EINVAL;
2433 break;
2434 }
2435 }
2436 }
2437 if (error) {
2438 FREE(nfs_pub.np_index, M_TEMP);
2439 return (error);
2440 }
2441 }
2442
2443 nfs_pub.np_mount = mp;
2444 nfs_pub.np_valid = 1;
2445 return (0);
2446}
2447
2448struct netcred *
2449vfs_export_lookup(mp, nep, nam)
2450 register struct mount *mp;
2451 struct netexport *nep;
2452 struct sockaddr *nam;
2453{
2454 register struct netcred *np;
2455 register struct radix_node_head *rnh;
2456 struct sockaddr *saddr;
2457
2458 np = NULL;
2459 if (mp->mnt_flag & MNT_EXPORTED) {
2460 /*
2461 * Lookup in the export list first.
2462 */
2463 if (nam != NULL) {
2464 saddr = nam;
2465 rnh = nep->ne_rtable[saddr->sa_family];
2466 if (rnh != NULL) {
2467 np = (struct netcred *)
2468 (*rnh->rnh_matchaddr)((caddr_t)saddr,
2469 rnh);
2470 if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
2471 np = NULL;
2472 }
2473 }
2474 /*
2475 * If no address match, use the default if it exists.
2476 */
2477 if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
2478 np = &nep->ne_defexported;
2479 }
2480 return (np);
2481}
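/*
 * Sketch of a per-request export check as an NFS-style server might do
 * it; the helper and its argument layout are hypothetical.  nep normally
 * lives in the filesystem's private mount data (see the sketch above).
 */
static int
example_check_export(mp, nep, nam, credp, exflagsp)
	struct mount *mp;
	struct netexport *nep;
	struct sockaddr *nam;		/* client address */
	struct ucred **credp;
	int *exflagsp;
{
	struct netcred *np;

	np = vfs_export_lookup(mp, nep, nam);
	if (np == NULL)
		return (EACCES);	/* host not in any export list */
	*credp = &np->netc_anon;	/* anonymous-mapping credentials */
	*exflagsp = np->netc_exflags;	/* MNT_EXRDONLY and friends */
	return (0);
}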
2482
2483/*
2484 * Perform msync on all vnodes under a mount point.
2485 * The mount point must be locked.
2486 */
2487void
2488vfs_msync(struct mount *mp, int flags) {
2489 struct vnode *vp, *nvp;
2490 struct vm_object *obj;
2491 int anyio, tries;
2492
2493 tries = 5;
2494loop:
2495 anyio = 0;
2496 for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) {
2497
2498 nvp = LIST_NEXT(vp, v_mntvnodes);
2499
2500 if (vp->v_mount != mp) {
2501 goto loop;
2502 }
2503
2504 if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */
2505 continue;
2506
2507 if (flags != MNT_WAIT) {
2508 if (VOP_GETVOBJECT(vp, &obj) != 0 ||
2509 (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
2510 continue;
2511 if (VOP_ISLOCKED(vp, NULL))
2512 continue;
2513 }
2514
2515 simple_lock(&vp->v_interlock);
2516 if (VOP_GETVOBJECT(vp, &obj) == 0 &&
2517 (obj->flags & OBJ_MIGHTBEDIRTY)) {
2518 if (!vget(vp,
2519 LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
2520 if (VOP_GETVOBJECT(vp, &obj) == 0) {
2521 vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
2522 anyio = 1;
2523 }
2524 vput(vp);
2525 }
2526 } else {
2527 simple_unlock(&vp->v_interlock);
2528 }
2529 }
2530 if (anyio && (--tries > 0))
2531 goto loop;
2532}
2533
2534/*
2535 * Create the VM object needed for VMIO and mmap support. This
2536 * is done for all VREG files in the system. Some filesystems may
2537 * also take advantage of the additional metadata buffering capability
2538 * of the VMIO code by making the device node VMIO mode as well.
2539 *
2540 * vp must be locked when vfs_object_create is called.
2541 */
2542int
2543vfs_object_create(vp, p, cred)
2544 struct vnode *vp;
2545 struct proc *p;
2546 struct ucred *cred;
2547{
2548 return (VOP_CREATEVOBJECT(vp, cred, p));
2549}
2550
2551void
2552vfree(vp)
2553 struct vnode *vp;
2554{
2555 int s;
2556
2557 s = splbio();
2558 simple_lock(&vnode_free_list_slock);
2559 KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free"));
2560 if (vp->v_flag & VAGE) {
2561 TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
2562 } else {
2563 TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
2564 }
2565 freevnodes++;
2566 simple_unlock(&vnode_free_list_slock);
2567 vp->v_flag &= ~VAGE;
2568 vp->v_flag |= VFREE;
2569 splx(s);
2570}
2571
2572void
2573vbusy(vp)
2574 struct vnode *vp;
2575{
2576 int s;
2577
2578 s = splbio();
2579 simple_lock(&vnode_free_list_slock);
2580 KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free"));
2581 TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
2582 freevnodes--;
2583 simple_unlock(&vnode_free_list_slock);
2584 vp->v_flag &= ~(VFREE|VAGE);
2585 splx(s);
2586}
2587
2588/*
2589 * Record a process's interest in events which might happen to
2590 * a vnode. Because poll uses the historic select-style interface
2591 * internally, this routine serves as both the ``check for any
2592 * pending events'' and the ``record my interest in future events''
2593 * functions. (These are done together, while the lock is held,
2594 * to avoid race conditions.)
2595 */
2596int
2597vn_pollrecord(vp, p, events)
2598 struct vnode *vp;
2599 struct proc *p;
2600 short events;
2601{
2602 simple_lock(&vp->v_pollinfo.vpi_lock);
2603 if (vp->v_pollinfo.vpi_revents & events) {
2604 /*
2605 * This leaves events we are not interested
2606 * in available for the other process which
2607 * presumably had requested them
2608 * (otherwise they would never have been
2609 * recorded).
2610 */
2611 events &= vp->v_pollinfo.vpi_revents;
2612 vp->v_pollinfo.vpi_revents &= ~events;
2613
2614 simple_unlock(&vp->v_pollinfo.vpi_lock);
2615 return events;
2616 }
2617 vp->v_pollinfo.vpi_events |= events;
2618 selrecord(p, &vp->v_pollinfo.vpi_selinfo);
2619 simple_unlock(&vp->v_pollinfo.vpi_lock);
2620 return 0;
2621}
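/*
 * Sketch of a filesystem VOP_POLL that simply defers to vn_pollrecord(),
 * much as a generic poll routine would.  Hypothetical example, not part
 * of this file.
 */
static int
example_poll(ap)
	struct vop_poll_args /* {
		struct vnode *a_vp;
		int a_events;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{

	/* Report any pending events and/or record interest in future ones. */
	return (vn_pollrecord(ap->a_vp, ap->a_p, ap->a_events));
}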
2622
2623/*
2624 * Note the occurrence of an event. If the VN_POLLEVENT macro is used,
2625 * it is possible for us to miss an event due to race conditions, but
2626 * that condition is expected to be rare, so for the moment it is the
2627 * preferred interface.
2628 */
2629void
2630vn_pollevent(vp, events)
2631 struct vnode *vp;
2632 short events;
2633{
2634 simple_lock(&vp->v_pollinfo.vpi_lock);
2635 if (vp->v_pollinfo.vpi_events & events) {
2636 /*
2637 * We clear vpi_events so that we don't
2638 * call selwakeup() twice if two events are
2639 * posted before the polling process(es) is
2640 * awakened. This also ensures that we take at
2641 * most one selwakeup() if the polling process
2642 * is no longer interested. However, it does
2643 * mean that only one event can be noticed at
2644 * a time. (Perhaps we should only clear those
2645 * event bits which we note?) XXX
2646 */
2647 vp->v_pollinfo.vpi_events = 0; /* &= ~events ??? */
2648 vp->v_pollinfo.vpi_revents |= events;
2649 selwakeup(&vp->v_pollinfo.vpi_selinfo);
2650 }
2651 simple_unlock(&vp->v_pollinfo.vpi_lock);
2652}
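/*
 * Sketch of the producer side: once new data has been made readable on a
 * vnode, note the event so any poll()ing process is woken.  The
 * VN_POLLEVENT() macro mentioned above typically wraps this call in an
 * inline vpi_events test so the function call is skipped when nobody is
 * polling.  Hypothetical helper, not part of this file.
 */
static void
example_note_readable(vp)
	struct vnode *vp;
{

	vn_pollevent(vp, POLLIN | POLLRDNORM);
}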
2653
2654/*
2655 * Wake up anyone polling on vp because it is being revoked.
2656 * This depends on dead_poll() returning POLLHUP for correct
2657 * behavior.
2658 */
2659void
2660vn_pollgone(vp)
2661 struct vnode *vp;
2662{
2663 simple_lock(&vp->v_pollinfo.vpi_lock);
2664 if (vp->v_pollinfo.vpi_events) {
2665 vp->v_pollinfo.vpi_events = 0;
2666 selwakeup(&vp->v_pollinfo.vpi_selinfo);
2667 }
2668 simple_unlock(&vp->v_pollinfo.vpi_lock);
2669}
2670
2671
2672
2673/*
2674 * Routine to create and manage a filesystem syncer vnode.
2675 */
2676#define sync_close ((int (*) __P((struct vop_close_args *)))nullop)
2677static int sync_fsync __P((struct vop_fsync_args *));
2678static int sync_inactive __P((struct vop_inactive_args *));
2679static int sync_reclaim __P((struct vop_reclaim_args *));
2680#define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock)
2681#define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock)
2682static int sync_print __P((struct vop_print_args *));
2683#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)
2684
2685static vop_t **sync_vnodeop_p;
2686static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
2687 { &vop_default_desc, (vop_t *) vop_eopnotsupp },
2688 { &vop_close_desc, (vop_t *) sync_close }, /* close */
2689 { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */
2690 { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */
2691 { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */
2692 { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */
2693 { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */
2694 { &vop_print_desc, (vop_t *) sync_print }, /* print */
2695 { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */
2696 { NULL, NULL }
2697};
2698static struct vnodeopv_desc sync_vnodeop_opv_desc =
2699 { &sync_vnodeop_p, sync_vnodeop_entries };
2700
2701VNODEOP_SET(sync_vnodeop_opv_desc);
2702
2703/*
2704 * Create a new filesystem syncer vnode for the specified mount point.
2705 */
2706int
2707vfs_allocate_syncvnode(mp)
2708 struct mount *mp;
2709{
2710 struct vnode *vp;
2711 static long start, incr, next;
2712 int error;
2713
2714 /* Allocate a new vnode */
2715 if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
2716 mp->mnt_syncer = NULL;
2717 return (error);
2718 }
2719 vp->v_type = VNON;
2720 /*
2721 * Place the vnode onto the syncer worklist. We attempt to
2722 * scatter them about on the list so that they will go off
2723 * at evenly distributed times even if all the filesystems
2724 * are mounted at once.
2725 */
2726 next += incr;
2727 if (next == 0 || next > syncer_maxdelay) {
2728 start /= 2;
2729 incr /= 2;
2730 if (start == 0) {
2731 start = syncer_maxdelay / 2;
2732 incr = syncer_maxdelay;
2733 }
2734 next = start;
2735 }
2736 vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
2737 mp->mnt_syncer = vp;
2738 return (0);
2739}
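/*
 * Worked example of the scattering above, assuming syncer_maxdelay is 32
 * and start, incr and next all begin at zero: successive mounts are given
 * next = 16, 8, 24, 4, 12, 20, 28, 2, 6, 10, ...  Whenever next would pass
 * syncer_maxdelay, start and incr are halved and the walk restarts from
 * the new start, so the slots fill in ever finer steps.  The worklist slot
 * actually used is next % syncdelay.
 */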
2740
2741/*
2742 * Do a lazy sync of the filesystem.
2743 */
2744static int
2745sync_fsync(ap)
2746 struct vop_fsync_args /* {
2747 struct vnode *a_vp;
2748 struct ucred *a_cred;
2749 int a_waitfor;
2750 struct proc *a_p;
2751 } */ *ap;
2752{
2753 struct vnode *syncvp = ap->a_vp;
2754 struct mount *mp = syncvp->v_mount;
2755 struct proc *p = ap->a_p;
2756 int asyncflag;
2757
2758 /*
2759 * We only need to do something if this is a lazy evaluation.
2760 */
2761 if (ap->a_waitfor != MNT_LAZY)
2762 return (0);
2763
2764 /*
2765 * Move ourselves to the back of the sync list.
2766 */
2767 vn_syncer_add_to_worklist(syncvp, syncdelay);
2768
2769 /*
2770 * Walk the list of vnodes pushing all that are dirty and
2771 * not already on the sync list.
2772 */
2773 simple_lock(&mountlist_slock);
2774 if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
2775 simple_unlock(&mountlist_slock);
2776 return (0);
2777 }
2778 if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
2779 vfs_unbusy(mp, p);
2780 simple_unlock(&mountlist_slock);
2781 return (0);
2782 }
2783 asyncflag = mp->mnt_flag & MNT_ASYNC;
2784 mp->mnt_flag &= ~MNT_ASYNC;
2785 vfs_msync(mp, MNT_NOWAIT);
2786 VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
2787 if (asyncflag)
2788 mp->mnt_flag |= MNT_ASYNC;
2789 vn_finished_write(mp);
2790 vfs_unbusy(mp, p);
2791 return (0);
2792}
2793
2794/*
2795 * The syncer vnode is no longer referenced.
2796 */
2797static int
2798sync_inactive(ap)
2799 struct vop_inactive_args /* {
2800 struct vnode *a_vp;
2801 struct proc *a_p;
2802 } */ *ap;
2803{
2804
2805 vgone(ap->a_vp);
2806 return (0);
2807}
2808
2809/*
2810 * The syncer vnode is no longer needed and is being decommissioned.
2811 *
2812 * Modifications to the worklist must be protected at splbio().
2813 */
2814static int
2815sync_reclaim(ap)
2816 struct vop_reclaim_args /* {
2817 struct vnode *a_vp;
2818 } */ *ap;
2819{
2820 struct vnode *vp = ap->a_vp;
2821 int s;
2822
2823 s = splbio();
2824 vp->v_mount->mnt_syncer = NULL;
2825 if (vp->v_flag & VONWORKLST) {
2826 LIST_REMOVE(vp, v_synclist);
2827 vp->v_flag &= ~VONWORKLST;
2828 }
2829 splx(s);
2830
2831 return (0);
2832}
2833
2834/*
2835 * Print out a syncer vnode.
2836 */
2837static int
2838sync_print(ap)
2839 struct vop_print_args /* {
2840 struct vnode *a_vp;
2841 } */ *ap;
2842{
2843 struct vnode *vp = ap->a_vp;
2844
2845 printf("syncer vnode");
2846 if (vp->v_vnlock != NULL)
2847 lockmgr_printinfo(vp->v_vnlock);
2848 printf("\n");
2849 return (0);
2850}
2851
2852/*
2853 * Extract the dev_t from a VBLK or VCHR vnode.
2854 */
2855dev_t
2856vn_todev(vp)
2857 struct vnode *vp;
2858{
2859 if (vp->v_type != VBLK && vp->v_type != VCHR)
2860 return (NODEV);
2861 return (vp->v_rdev);
2862}
2863
2864/*
2865 * Check if a vnode represents a disk device.
2866 */
2867int
2868vn_isdisk(vp, errp)
2869 struct vnode *vp;
2870 int *errp;
2871{
2872 struct cdevsw *cdevsw;
2873
2874 if (vp->v_type != VBLK && vp->v_type != VCHR) {
2875 if (errp != NULL)
2876 *errp = ENOTBLK;
2877 return (0);
2878 }
2879 if (vp->v_rdev == NULL) {
2880 if (errp != NULL)
2881 *errp = ENXIO;
2882 return (0);
2883 }
2884 cdevsw = devsw(vp->v_rdev);
2885 if (cdevsw == NULL) {
2886 if (errp != NULL)
2887 *errp = ENXIO;
2888 return (0);
2889 }
2890 if (!(cdevsw->d_flags & D_DISK)) {
2891 if (errp != NULL)
2892 *errp = ENOTBLK;
2893 return (0);
2894 }
2895 if (errp != NULL)
2896 *errp = 0;
2897 return (1);
2898}
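/*
 * Sketch pairing vn_isdisk() with vn_todev(): reject anything that is not
 * a disk device, otherwise hand back its dev_t.  Hypothetical helper, not
 * part of this file.
 */
static dev_t
example_disk_dev(vp, errp)
	struct vnode *vp;
	int *errp;
{

	if (!vn_isdisk(vp, errp))
		return (NODEV);		/* *errp is ENOTBLK or ENXIO */
	return (vn_todev(vp));		/* VBLK/VCHR, so never NODEV here */
}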
2899
2900void
2901NDFREE(ndp, flags)
2902 struct nameidata *ndp;
2903 const uint flags;
2904{
2905 if (!(flags & NDF_NO_FREE_PNBUF) &&
2906 (ndp->ni_cnd.cn_flags & HASBUF)) {
2907 zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
2908 ndp->ni_cnd.cn_flags &= ~HASBUF;
2909 }
2910 if (!(flags & NDF_NO_DVP_UNLOCK) &&
2911 (ndp->ni_cnd.cn_flags & LOCKPARENT) &&
2912 ndp->ni_dvp != ndp->ni_vp)
2913 VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_proc);
2914 if (!(flags & NDF_NO_DVP_RELE) &&
2915 (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
2916 vrele(ndp->ni_dvp);
2917 ndp->ni_dvp = NULL;
2918 }
2919 if (!(flags & NDF_NO_VP_UNLOCK) &&
2920 (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
2921 VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_proc);
2922 if (!(flags & NDF_NO_VP_RELE) &&
2923 ndp->ni_vp) {
2924 vrele(ndp->ni_vp);
2925 ndp->ni_vp = NULL;
2926 }
2927 if (!(flags & NDF_NO_STARTDIR_RELE) &&
2928 (ndp->ni_cnd.cn_flags & SAVESTART)) {
2929 vrele(ndp->ni_startdir);
2930 ndp->ni_startdir = NULL;
2931 }
2932}
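/*
 * Sketch of the usual namei()/NDFREE() pairing: keep ni_vp (locked and
 * referenced) but let NDFREE() release the pathname buffer and any parent
 * references.  The wrapper and its arguments are hypothetical, not part
 * of this file.
 */
static int
example_lookup(path, p, vpp)
	char *path;
	struct proc *p;
	struct vnode **vpp;
{
	struct nameidata nd;
	int error;

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, p);
	if ((error = namei(&nd)) != 0)
		return (error);
	/* Free everything except the vnode itself. */
	NDFREE(&nd, NDF_NO_VP_UNLOCK | NDF_NO_VP_RELE);
	*vpp = nd.ni_vp;
	return (0);
}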
2933
2934int
2935vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
2936 enum vtype type;
2937 mode_t file_mode;
2938 uid_t file_uid;
2939 gid_t file_gid;
2940 mode_t acc_mode;
2941 struct ucred *cred;
2942 int *privused;
2943{
2944 mode_t dac_granted;
2945#ifdef CAPABILITIES
2946 mode_t cap_granted;
2947#endif
2948
2949 /*
2950 * Look for a normal, non-privileged way to access the file/directory
2951 * as requested. If it exists, go with that.
2952 */
2953
2954 if (privused != NULL)
2955 *privused = 0;
2956
2957 dac_granted = 0;
2958
2959 /* Check the owner. */
2960 if (cred->cr_uid == file_uid) {
2961 if (file_mode & S_IXUSR)
2962 dac_granted |= VEXEC;
2963 if (file_mode & S_IRUSR)
2964 dac_granted |= VREAD;
2965 if (file_mode & S_IWUSR)
2966 dac_granted |= VWRITE;
2967
2968 if ((acc_mode & dac_granted) == acc_mode)
2969 return (0);
2970
2971 goto privcheck;
2972 }
2973
2974 /* Otherwise, check the groups (first match) */
2975 if (groupmember(file_gid, cred)) {
2976 if (file_mode & S_IXGRP)
2977 dac_granted |= VEXEC;
2978 if (file_mode & S_IRGRP)
2979 dac_granted |= VREAD;
2980 if (file_mode & S_IWGRP)
2981 dac_granted |= VWRITE;
2982
2983 if ((acc_mode & dac_granted) == acc_mode)
2984 return (0);
2985
2986 goto privcheck;
2987 }
2988
2989 /* Otherwise, check everyone else. */
2990 if (file_mode & S_IXOTH)
2991 dac_granted |= VEXEC;
2992 if (file_mode & S_IROTH)
2993 dac_granted |= VREAD;
2994 if (file_mode & S_IWOTH)
2995 dac_granted |= VWRITE;
2996 if ((acc_mode & dac_granted) == acc_mode)
2997 return (0);
2998
2999privcheck:
3000 if (!suser_xxx(cred, NULL, PRISON_ROOT)) {
3001 /* XXX audit: privilege used */
3002 if (privused != NULL)
3003 *privused = 1;
3004 return (0);
3005 }
3006
3007#ifdef CAPABILITIES
3008 /*
3009 * Build a capability mask to determine if the set of capabilities
3010 * satisfies the requirements when combined with the granted mask
3011 * from above.
3012 * For each capability, if the capability is required, bitwise
3013 * or the request type onto the cap_granted mask.
3014 */
3015 cap_granted = 0;
3016 if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3017 !cap_check_xxx(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT))
3018 cap_granted |= VEXEC;
3019
3020 if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
3021 !cap_check_xxx(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
3022 cap_granted |= VREAD;
3023
3024 if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3025 !cap_check_xxx(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT))
3026 cap_granted |= VWRITE;
3027
3028 if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
3029 /* XXX audit: privilege used */
3030 if (privused != NULL)
3031 *privused = 1;
3032 return (0);
3033 }
3034#endif
3035
3036 return (EACCES);
3037}
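/*
 * Sketch of a filesystem VOP_ACCESS built on vaccess().  VTOI(), struct
 * inode and the i_mode/i_uid/i_gid fields stand in for whatever the
 * filesystem really stores; hypothetical, not part of this file.
 */
static int
example_access(ap)
	struct vop_access_args /* {
		struct vnode *a_vp;
		int a_mode;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct inode *ip = VTOI(vp);	/* hypothetical per-fs node */

	return (vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
	    ap->a_mode, ap->a_cred, NULL));
}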