1/* $NetBSD: vfs_subr.c,v 1.500 2023/04/30 08:46:11 riastradh Exp $ */ 2 3/*- 4 * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008, 2019, 2020 5 * The NetBSD Foundation, Inc. 6 * All rights reserved. 7 * 8 * This code is derived from software contributed to The NetBSD Foundation 9 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 10 * NASA Ames Research Center, by Charles M. Hannum, by Andrew Doran, 11 * by Marshall Kirk McKusick and Greg Ganger at the University of Michigan. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 24 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 26 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 29 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 30 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32 * POSSIBILITY OF SUCH DAMAGE. 33 */ 34 35/* 36 * Copyright (c) 1989, 1993 37 * The Regents of the University of California. All rights reserved. 38 * (c) UNIX System Laboratories, Inc. 
39 * All or some portions of this file are derived from material licensed 40 * to the University of California by American Telephone and Telegraph 41 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 42 * the permission of UNIX System Laboratories, Inc. 43 * 44 * Redistribution and use in source and binary forms, with or without 45 * modification, are permitted provided that the following conditions 46 * are met: 47 * 1. Redistributions of source code must retain the above copyright 48 * notice, this list of conditions and the following disclaimer. 49 * 2. Redistributions in binary form must reproduce the above copyright 50 * notice, this list of conditions and the following disclaimer in the 51 * documentation and/or other materials provided with the distribution. 52 * 3. Neither the name of the University nor the names of its contributors 53 * may be used to endorse or promote products derived from this software 54 * without specific prior written permission. 55 * 56 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 57 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 58 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 59 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 60 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 61 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 62 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 63 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 64 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 65 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 66 * SUCH DAMAGE. 
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.500 2023/04/30 08:46:11 riastradh Exp $");

#ifdef _KERNEL_OPT
#include "opt_compat_43.h"
#include "opt_compat_netbsd.h"
#include "opt_ddb.h"
#endif

#include <sys/param.h>
#include <sys/types.h>

#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/errno.h>
#include <sys/filedesc.h>
#include <sys/fstrans.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/stat.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode_impl.h>

#include <miscfs/deadfs/deadfs.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>

#include <uvm/uvm_ddb.h>

/*
 * Static DTrace probes for syncer worklist manipulation (vnode and
 * mount add/update/remove) and for each pass of the syncer daemon.
 */
SDT_PROBE_DEFINE3(vfs, syncer, worklist, vnode__add,
    "struct vnode *"/*vp*/,
    "int"/*delayx*/,
    "int"/*slot*/);
SDT_PROBE_DEFINE4(vfs, syncer, worklist, vnode__update,
    "struct vnode *"/*vp*/,
    "int"/*delayx*/,
    "int"/*oslot*/,
    "int"/*nslot*/);
SDT_PROBE_DEFINE1(vfs, syncer, worklist, vnode__remove,
    "struct vnode *"/*vp*/);

SDT_PROBE_DEFINE3(vfs, syncer, worklist, mount__add,
    "struct mount *"/*mp*/,
    "int"/*vdelay*/,
    "int"/*slot*/);
SDT_PROBE_DEFINE4(vfs, syncer, worklist, mount__update,
    "struct mount *"/*mp*/,
    "int"/*vdelay*/,
    "int"/*oslot*/,
    "int"/*nslot*/);
SDT_PROBE_DEFINE1(vfs, syncer, worklist, mount__remove,
    "struct mount *"/*mp*/);

SDT_PROBE_DEFINE1(vfs, syncer, sync, start,
    "int"/*starttime*/);
SDT_PROBE_DEFINE1(vfs, syncer, sync, mount__start,
    "struct mount *"/*mp*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, mount__done,
    "struct mount *"/*mp*/,
    "int"/*error*/);
SDT_PROBE_DEFINE1(vfs, syncer, sync, mount__skip,
    "struct mount *"/*mp*/);
SDT_PROBE_DEFINE1(vfs, syncer, sync, vnode__start,
    "struct vnode *"/*vp*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, vnode__done,
    "struct vnode *"/*vp*/,
    "int"/*error*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, vnode__fail__lock,
    "struct vnode *"/*vp*/,
    "int"/*error*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, vnode__fail__vget,
    "struct vnode *"/*vp*/,
    "int"/*error*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, done,
    "int"/*starttime*/,
    "int"/*endtime*/);

/* Translation table: i-node format (IFMT bits, indexed by mode >> 12)
 * to vnode type. */
const enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
/* Translation table: vnode type (enum vtype) back to IFMT format bits. */
const int vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};

/*
 * Insq/Remq for the vnode usage lists.
 */
#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define	bufremvn(bp) {							\
	LIST_REMOVE(bp, b_vnbufs);					\
	(bp)->b_vnbufs.le_next = NOLIST;				\
}

int doforce = 1;		/* 1 => permit forcible unmounting */

/*
 * Local declarations.
 */

static void vn_initialize_syncerd(void);

/*
 * Initialize the vnode management data structures.
 * Sets up the syncer worklists, then the mount and vnode subsystems.
 */
void
vntblinit(void)
{

	vn_initialize_syncerd();
	vfs_mount_sysinit();
	vfs_vnode_sysinit();
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying vnode locked, which should prevent new dirty
 * buffers from being queued.
 *
 * If V_SAVE is set, dirty data is first written out via VOP_FSYNC before
 * the buffers are invalidated.  catch_p/slptimeo control how bbusy()
 * sleeps when a buffer is busy.
 *
 * NOTE(review): the 'l' parameter is never referenced in this function.
 * NOTE(review): there is no matching rw_exit for the rw_enter below;
 * presumably VOP_PUTPAGES consumes/releases vmobjlock — confirm against
 * genfs_putpages.
 */
int
vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred, struct lwp *l,
    bool catch_p, int slptimeo)
{
	struct buf *bp, *nbp;
	int error;
	int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO |
	    (flags & V_SAVE ? PGO_CLEANIT | PGO_RECLAIM : 0);

	/* XXXUBC this doesn't look at flags or slp* */
	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
	error = VOP_PUTPAGES(vp, 0, 0, flushflags);
	if (error) {
		return error;
	}

	if (flags & V_SAVE) {
		/* Write dirty data back before invalidating. */
		error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0);
		if (error)
			return (error);
		KASSERT(LIST_EMPTY(&vp->v_dirtyblkhd));
	}

	mutex_enter(&bufcache_lock);
restart:
	/* Invalidate everything on the dirty list. */
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		KASSERT(bp->b_vp == vp);
		nbp = LIST_NEXT(bp, b_vnbufs);
		error = bbusy(bp, catch_p, slptimeo, NULL);
		if (error != 0) {
			/* EPASSTHROUGH: the list changed under us; rescan. */
			if (error == EPASSTHROUGH)
				goto restart;
			mutex_exit(&bufcache_lock);
			return (error);
		}
		brelsel(bp, BC_INVAL | BC_VFLUSH);
	}

	/* Then invalidate everything on the clean list. */
	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		KASSERT(bp->b_vp == vp);
		nbp = LIST_NEXT(bp, b_vnbufs);
		error = bbusy(bp, catch_p, slptimeo, NULL);
		if (error != 0) {
			if (error == EPASSTHROUGH)
				goto restart;
			mutex_exit(&bufcache_lock);
			return (error);
		}
		/*
		 * XXX Since there are no node locks for NFS, I believe
		 * there is a slight chance that a delayed write will
		 * occur while sleeping just above, so check for it.
		 */
		if ((bp->b_oflags & BO_DELWRI) && (flags & V_SAVE)) {
#ifdef DEBUG
			printf("buffer still DELWRI\n");
#endif
			/* Claim the buffer and write it out synchronously. */
			bp->b_cflags |= BC_BUSY | BC_VFLUSH;
			mutex_exit(&bufcache_lock);
			VOP_BWRITE(bp->b_vp, bp);
			mutex_enter(&bufcache_lock);
			goto restart;
		}
		brelsel(bp, BC_INVAL | BC_VFLUSH);
	}

#ifdef DIAGNOSTIC
	if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))
		panic("vinvalbuf: flush failed, vp %p", vp);
#endif

	mutex_exit(&bufcache_lock);

	return (0);
}

/*
 * Destroy any in core blocks past the truncation length.
 * Called with the underlying vnode locked, which should prevent new dirty
 * buffers from being queued.
 *
 * Buffers with a logical block number below 'lbn' are left alone; pages
 * from the rounded-up byte offset onward are freed first via VOP_PUTPAGES.
 *
 * NOTE(review): as in vinvalbuf, there is no visible rw_exit pairing the
 * rw_enter below; presumably VOP_PUTPAGES releases vmobjlock — confirm.
 */
int
vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch_p, int slptimeo)
{
	struct buf *bp, *nbp;
	int error;
	voff_t off;

	off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
	error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
	if (error) {
		return error;
	}

	mutex_enter(&bufcache_lock);
restart:
	/* Invalidate dirty buffers at or beyond the truncation point. */
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		KASSERT(bp->b_vp == vp);
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (bp->b_lblkno < lbn)
			continue;
		error = bbusy(bp, catch_p, slptimeo, NULL);
		if (error != 0) {
			/* EPASSTHROUGH: list changed while sleeping. */
			if (error == EPASSTHROUGH)
				goto restart;
			mutex_exit(&bufcache_lock);
			return (error);
		}
		brelsel(bp, BC_INVAL | BC_VFLUSH);
	}

	/* Same for clean buffers. */
	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		KASSERT(bp->b_vp == vp);
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (bp->b_lblkno < lbn)
			continue;
		error = bbusy(bp, catch_p, slptimeo, NULL);
		if (error != 0) {
			if (error == EPASSTHROUGH)
				goto restart;
			mutex_exit(&bufcache_lock);
			return (error);
		}
		brelsel(bp, BC_INVAL | BC_VFLUSH);
	}
	mutex_exit(&bufcache_lock);

	return (0);
}

/*
 * Flush all dirty buffers from a vnode.
 * Called with the underlying vnode locked, which should prevent new dirty
 * buffers from being queued.
 *
 * FSYNC_WAIT in 'flags' requests a synchronous flush (wait for all
 * writes, then re-check the dirty list); FSYNC_LAZY is passed down to
 * the pager as PGO_LAZY.
 */
int
vflushbuf(struct vnode *vp, int flags)
{
	struct buf *bp, *nbp;
	int error, pflags;
	bool dirty, sync;

	sync = (flags & FSYNC_WAIT) != 0;
	pflags = PGO_CLEANIT | PGO_ALLPAGES |
	    (sync ? PGO_SYNCIO : 0) |
	    ((flags & FSYNC_LAZY) ? PGO_LAZY : 0);
	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
	(void) VOP_PUTPAGES(vp, 0, 0, pflags);

loop:
	mutex_enter(&bufcache_lock);
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		KASSERT(bp->b_vp == vp);
		nbp = LIST_NEXT(bp, b_vnbufs);
		/* Skip buffers some other thread is working on. */
		if ((bp->b_cflags & BC_BUSY))
			continue;
		if ((bp->b_oflags & BO_DELWRI) == 0)
			panic("vflushbuf: not dirty, bp %p", bp);
		bp->b_cflags |= BC_BUSY | BC_VFLUSH;
		mutex_exit(&bufcache_lock);
		/*
		 * Wait for I/O associated with indirect blocks to complete,
		 * since there is no way to quickly wait for them below.
		 */
		if (bp->b_vp == vp || !sync)
			(void) bawrite(bp);
		else {
			error = bwrite(bp);
			if (error)
				return error;
		}
		/* The list may have changed while unlocked; rescan. */
		goto loop;
	}
	mutex_exit(&bufcache_lock);

	if (!sync)
		return 0;

	/* Wait for all outstanding writes on this vnode to drain. */
	mutex_enter(vp->v_interlock);
	while (vp->v_numoutput != 0)
		cv_wait(&vp->v_cv, vp->v_interlock);
	dirty = !LIST_EMPTY(&vp->v_dirtyblkhd);
	mutex_exit(vp->v_interlock);

	if (dirty) {
		vprint("vflushbuf: dirty", vp);
		goto loop;
	}

	return 0;
}

/*
 * Create a vnode for a block device.
 * Used for root filesystem and swap areas.
 * Also used for memory file system special devices.
 */
int
bdevvp(dev_t dev, vnode_t **vpp)
{
	struct vattr va;

	vattr_null(&va);
	va.va_type = VBLK;
	va.va_rdev = dev;

	return vcache_new(dead_rootmount, NULL, &va, NOCRED, NULL, vpp);
}

/*
 * Create a vnode for a character device.
 * Used for kernfs and some console handling.
 */
int
cdevvp(dev_t dev, vnode_t **vpp)
{
	struct vattr va;

	vattr_null(&va);
	va.va_type = VCHR;
	va.va_rdev = dev;

	return vcache_new(dead_rootmount, NULL, &va, NOCRED, NULL, vpp);
}

/*
 * Associate a buffer with a vnode.  There must already be a hold on
 * the vnode.
 *
 * Caller must hold both the vnode interlock and bufcache_lock, and the
 * buffer must be marked busy (asserted below).  The buffer starts life
 * on the vnode's clean list and its object lock is switched from the
 * global buffer_lock to the vnode's interlock.
 */
void
bgetvp(struct vnode *vp, struct buf *bp)
{

	KASSERT(bp->b_vp == NULL);
	KASSERT(bp->b_objlock == &buffer_lock);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(mutex_owned(&bufcache_lock));
	KASSERT((bp->b_cflags & BC_BUSY) != 0);
	KASSERT(!cv_has_waiters(&bp->b_done));

	/* Take a hold so the vnode outlives the association. */
	vholdl(vp);
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;

	/*
	 * Insert onto list for new vnode.
	 */
	bufinsvn(bp, &vp->v_cleanblkhd);
	bp->b_objlock = vp->v_interlock;
}

/*
 * Disassociate a buffer from a vnode.
 *
 * Caller must hold the vnode interlock and bufcache_lock, with the
 * buffer busy.  Reverses bgetvp(): the buffer goes back under the
 * global buffer_lock and the vnode hold is released.  If this was the
 * last dirty buffer and the vnode has no dirty pages, the vnode is
 * taken off the syncer worklist.
 */
void
brelvp(struct buf *bp)
{
	struct vnode *vp = bp->b_vp;

	KASSERT(vp != NULL);
	KASSERT(bp->b_objlock == vp->v_interlock);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(mutex_owned(&bufcache_lock));
	KASSERT((bp->b_cflags & BC_BUSY) != 0);
	KASSERT(!cv_has_waiters(&bp->b_done));

	/*
	 * Delete from old vnode list, if on one.
	 */
	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
		bufremvn(bp);

	/* On the worklist only for buffers (not pages) and now clean? */
	if ((vp->v_iflag & (VI_ONWORKLST | VI_PAGES)) == VI_ONWORKLST &&
	    LIST_FIRST(&vp->v_dirtyblkhd) == NULL)
		vn_syncer_remove_from_worklist(vp);

	bp->b_objlock = &buffer_lock;
	bp->b_vp = NULL;
	holdrelel(vp);
}

/*
 * Reassign a buffer from one vnode list to another.
 * The list reassignment must be within the same vnode.
 * Used to assign file specific control information
 * (indirect blocks) to the list to which they belong.
 *
 * Caller must hold bufcache_lock and the vnode interlock, with the
 * buffer busy.  A buffer becoming dirty may put the vnode on the
 * syncer worklist with a delay that depends on the vnode type.
 */
void
reassignbuf(struct buf *bp, struct vnode *vp)
{
	struct buflists *listheadp;
	int delayx;

	KASSERT(mutex_owned(&bufcache_lock));
	KASSERT(bp->b_objlock == vp->v_interlock);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((bp->b_cflags & BC_BUSY) != 0);

	/*
	 * Delete from old vnode list, if on one.
	 */
	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
		bufremvn(bp);

	/*
	 * If dirty, put on list of dirty buffers;
	 * otherwise insert onto list of clean buffers.
	 */
	if ((bp->b_oflags & BO_DELWRI) == 0) {
		listheadp = &vp->v_cleanblkhd;
		/* Last dirty buffer gone and no dirty pages: leave
		 * the syncer worklist. */
		if ((vp->v_iflag & (VI_ONWORKLST | VI_PAGES)) ==
		    VI_ONWORKLST &&
		    LIST_FIRST(&vp->v_dirtyblkhd) == NULL)
			vn_syncer_remove_from_worklist(vp);
	} else {
		listheadp = &vp->v_dirtyblkhd;
		if ((vp->v_iflag & VI_ONWORKLST) == 0) {
			/* Choose the write-back delay by vnode type:
			 * directories and mounted block devices are
			 * flushed sooner than regular file data. */
			switch (vp->v_type) {
			case VDIR:
				delayx = dirdelay;
				break;
			case VBLK:
				if (spec_node_getmountedfs(vp) != NULL) {
					delayx = metadelay;
					break;
				}
				/* fall through */
			default:
				delayx = filedelay;
				break;
			}
			if (!vp->v_mount ||
			    (vp->v_mount->mnt_flag & MNT_ASYNC) == 0)
				vn_syncer_add_to_worklist(vp, delayx);
		}
	}
	bufinsvn(bp, listheadp);
}

/*
 * Lookup a vnode by device number and return it referenced.
 * Returns non-zero on success (vnode found), zero otherwise.
 */
int
vfinddev(dev_t dev, enum vtype type, vnode_t **vpp)
{

	return (spec_node_lookup_by_dev(type, dev, VDEAD_NOWAIT, vpp) == 0);
}

/*
 * Revoke all the vnodes corresponding to the specified minor number
 * range (endpoints inclusive) of the specified major.
 */
void
vdevgone(int maj, int minl, int minh, enum vtype type)
{
	vnode_t *vp;
	dev_t dev;
	int mn;

	for (mn = minl; mn <= minh; mn++) {
		dev = makedev(maj, mn);
		/*
		 * Notify anyone trying to get at this device that it
		 * has been detached, and then revoke it.
		 */
		switch (type) {
		case VBLK:
			bdev_detached(dev);
			break;
		case VCHR:
			cdev_detached(dev);
			break;
		default:
			panic("invalid specnode type: %d", type);
		}
		/*
		 * Passing 0 as flags, instead of VDEAD_NOWAIT, means
		 * spec_node_lookup_by_dev will wait for vnodes it
		 * finds concurrently being revoked before returning.
		 */
		while (spec_node_lookup_by_dev(type, dev, 0, &vp) == 0) {
			VOP_REVOKE(vp, REVOKEALL);
			vrele(vp);
		}
	}
}

/*
 * The filesystem synchronizer mechanism - syncer.
 *
 * It is useful to delay writes of file data and filesystem metadata for
 * a certain amount of time so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To implement this,
 * vnodes are appended to a "workitem" queue.
 *
 * Most pending metadata should not wait for more than ten seconds.  Thus,
 * metadata for filesystems mounted on block devices is delayed only about
 * half the time that file data is delayed.  Similarly, directory updates
 * are more critical, so are only delayed about a third the time that file
 * data is delayed.
 *
 * There are SYNCER_MAXDELAY queues that are processed in a round-robin
 * manner at a rate of one each second (driven off the filesystem syncer
 * thread).  The syncer_delayno variable indicates the next queue that is
 * to be processed.  Items that need to be processed soon are placed in
 * this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of e.g. fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 * Flag VI_ONWORKLST indicates that vnode is added into the queue.
 */

#define SYNCER_MAXDELAY		32

typedef TAILQ_HEAD(synclist, vnode_impl) synclist_t;

static void	vn_syncer_add1(struct vnode *, int);
static void	sysctl_vfs_syncfs_setup(struct sysctllog **);

/*
 * Defines and variables for the syncer process.
 */
int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
time_t syncdelay = 30;			/* max time to delay syncing data */
time_t filedelay = 30;			/* time to delay syncing files */
time_t dirdelay = 15;			/* time to delay syncing directories */
time_t metadelay = 10;			/* time to delay syncing metadata */
time_t lockdelay = 1;			/* time to delay if locking fails */

static kmutex_t syncer_data_lock;	/* short term lock on data structs */

static int syncer_delayno = 0;		/* current bucket being drained */
static long syncer_last;		/* number of buckets in the wheel */
static synclist_t *syncer_workitem_pending;

/*
 * Allocate and initialize the syncer wheel and its lock, and register
 * the syncer sysctl subtree.
 */
static void
vn_initialize_syncerd(void)
{
	int i;

	syncer_last = SYNCER_MAXDELAY + 2;

	sysctl_vfs_syncfs_setup(NULL);

	syncer_workitem_pending =
	    kmem_alloc(syncer_last * sizeof (struct synclist), KM_SLEEP);

	for (i = 0; i < syncer_last; i++)
		TAILQ_INIT(&syncer_workitem_pending[i]);

	mutex_init(&syncer_data_lock, MUTEX_DEFAULT, IPL_NONE);
}

/*
 * Return delay factor appropriate for the given file system.  For
 * WAPBL we use the sync vnode to burst out metadata updates: sync
 * those file systems more frequently.
 */
static inline int
sync_delay(struct mount *mp)
{

	return mp->mnt_wapbl != NULL ? metadelay : syncdelay;
}

/*
 * Compute the next slot index from delay.
 * Clamps the delay so the target slot never wraps past the bucket
 * currently being drained.
 */
static inline int
sync_delay_slot(int delayx)
{

	if (delayx > syncer_maxdelay - 2)
		delayx = syncer_maxdelay - 2;
	return (syncer_delayno + delayx) % syncer_last;
}

/*
 * Add an item to the syncer work queue.
 *
 * Caller holds syncer_data_lock.  If the vnode is already on the
 * worklist it is moved to its new slot; otherwise the caller must also
 * hold the vnode interlock so VI_ONWORKLST can be set.
 */
static void
vn_syncer_add1(struct vnode *vp, int delayx)
{
	synclist_t *slp;
	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);

	KASSERT(mutex_owned(&syncer_data_lock));

	if (vp->v_iflag & VI_ONWORKLST) {
		/*
		 * Remove in order to adjust the position of the vnode.
		 * Note: called from sched_sync(), which will not hold
		 * interlock, therefore we cannot modify v_iflag here.
		 */
		slp = &syncer_workitem_pending[vip->vi_synclist_slot];
		TAILQ_REMOVE(slp, vip, vi_synclist);
	} else {
		KASSERT(mutex_owned(vp->v_interlock));
		vp->v_iflag |= VI_ONWORKLST;
	}

	vip->vi_synclist_slot = sync_delay_slot(delayx);

	slp = &syncer_workitem_pending[vip->vi_synclist_slot];
	TAILQ_INSERT_TAIL(slp, vip, vi_synclist);
}

/*
 * Public entry point to queue a vnode for deferred write-back in
 * 'delayx' seconds.  Caller holds the vnode interlock.
 */
void
vn_syncer_add_to_worklist(struct vnode *vp, int delayx)
{
	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);

	KASSERT(mutex_owned(vp->v_interlock));

	mutex_enter(&syncer_data_lock);
	vn_syncer_add1(vp, delayx);
	SDT_PROBE3(vfs, syncer, worklist, vnode__add,
	    vp, delayx, vip->vi_synclist_slot);
	mutex_exit(&syncer_data_lock);
}

/*
 * Remove an item from the syncer work queue.
 * Caller holds the vnode interlock; no-op if the vnode is not on
 * the worklist.
 */
void
vn_syncer_remove_from_worklist(struct vnode *vp)
{
	synclist_t *slp;
	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);

	KASSERT(mutex_owned(vp->v_interlock));

	if (vp->v_iflag & VI_ONWORKLST) {
		mutex_enter(&syncer_data_lock);
		SDT_PROBE1(vfs, syncer, worklist, vnode__remove, vp);
		vp->v_iflag &= ~VI_ONWORKLST;
		slp = &syncer_workitem_pending[vip->vi_synclist_slot];
		TAILQ_REMOVE(slp, vip, vi_synclist);
		mutex_exit(&syncer_data_lock);
	}
}

/*
 * Add this mount point to the syncer.
 */
void
vfs_syncer_add_to_worklist(struct mount *mp)
{
	static int start, incr, next;
	int vdelay;

	KASSERT(mutex_owned(mp->mnt_updating));
	KASSERT((mp->mnt_iflag & IMNT_ONWORKLIST) == 0);

	/*
	 * We attempt to scatter the mount points on the list
	 * so that they will go off at evenly distributed times
	 * even if all the filesystems are mounted at once.
	 */

	next += incr;
	if (next == 0 || next > syncer_maxdelay) {
		start /= 2;
		incr /= 2;
		if (start == 0) {
			start = syncer_maxdelay / 2;
			incr = syncer_maxdelay;
		}
		next = start;
	}
	mp->mnt_iflag |= IMNT_ONWORKLIST;
	vdelay = sync_delay(mp);
	mp->mnt_synclist_slot = vdelay > 0 ? next % vdelay : 0;
	SDT_PROBE3(vfs, syncer, worklist, mount__add,
	    mp, vdelay, mp->mnt_synclist_slot);
}

/*
 * Remove the mount point from the syncer.
 */
void
vfs_syncer_remove_from_worklist(struct mount *mp)
{

	KASSERT(mutex_owned(mp->mnt_updating));
	KASSERT((mp->mnt_iflag & IMNT_ONWORKLIST) != 0);

	SDT_PROBE1(vfs, syncer, worklist, mount__remove, mp);
	mp->mnt_iflag &= ~IMNT_ONWORKLIST;
}

/*
 * Try lazy sync, return true on success.
 *
 * Called and returns with syncer_data_lock held, but drops it across
 * the actual VOP_FSYNC.  Both the reference grab and the vnode lock are
 * try-operations so the syncer never blocks here.
 */
static bool
lazy_sync_vnode(struct vnode *vp)
{
	bool synced;
	int error;

	KASSERT(mutex_owned(&syncer_data_lock));

	synced = false;
	if ((error = vcache_tryvget(vp)) == 0) {
		mutex_exit(&syncer_data_lock);
		if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT)) == 0) {
			synced = true;
			SDT_PROBE1(vfs, syncer, sync, vnode__start, vp);
			error = VOP_FSYNC(vp, curlwp->l_cred,
			    FSYNC_LAZY, 0, 0);
			SDT_PROBE2(vfs, syncer, sync, vnode__done, vp, error);
			vput(vp);
		} else {
			SDT_PROBE2(vfs, syncer, sync, vnode__fail__lock,
			    vp, error);
			vrele(vp);
		}
		mutex_enter(&syncer_data_lock);
	} else {
		SDT_PROBE2(vfs, syncer, sync, vnode__fail__vget, vp, error);
	}
	return synced;
}

/*
 * System filesystem synchronizer daemon.
 *
 * Main loop of the syncer kernel thread.  Once per second it lazily
 * syncs every mount whose slot has come up, then drains the current
 * bucket of the vnode work wheel, re-queueing each vnode for a later
 * pass.
 */
void
sched_sync(void *arg)
{
	mount_iterator_t *iter;
	synclist_t *slp;
	struct vnode_impl *vi;
	struct vnode *vp;
	struct mount *mp;
	time_t starttime, endtime;
	int vdelay, oslot, nslot, delayx;
	bool synced;
	int error;

	for (;;) {
		starttime = time_second;
		SDT_PROBE1(vfs, syncer, sync, start, starttime);

		/*
		 * Sync mounts whose dirty time has expired.
		 */
		mountlist_iterator_init(&iter);
		while ((mp = mountlist_iterator_trynext(iter)) != NULL) {
			if ((mp->mnt_iflag & IMNT_ONWORKLIST) == 0 ||
			    mp->mnt_synclist_slot != syncer_delayno) {
				SDT_PROBE1(vfs, syncer, sync, mount__skip,
				    mp);
				continue;
			}

			/* Schedule the mount's next pass before syncing. */
			vdelay = sync_delay(mp);
			oslot = mp->mnt_synclist_slot;
			nslot = sync_delay_slot(vdelay);
			mp->mnt_synclist_slot = nslot;
			SDT_PROBE4(vfs, syncer, worklist, mount__update,
			    mp, vdelay, oslot, nslot);

			SDT_PROBE1(vfs, syncer, sync, mount__start, mp);
			error = VFS_SYNC(mp, MNT_LAZY, curlwp->l_cred);
			SDT_PROBE2(vfs, syncer, sync, mount__done,
			    mp, error);
		}
		mountlist_iterator_destroy(iter);

		mutex_enter(&syncer_data_lock);

		/*
		 * Push files whose dirty time has expired.
		 */
		slp = &syncer_workitem_pending[syncer_delayno];
		syncer_delayno += 1;
		if (syncer_delayno >= syncer_last)
			syncer_delayno = 0;

		while ((vi = TAILQ_FIRST(slp)) != NULL) {
			vp = VIMPL_TO_VNODE(vi);
			synced = lazy_sync_vnode(vp);

			/*
			 * XXX The vnode may have been recycled, in which
			 * case it may have a new identity.
			 */
			vi = TAILQ_FIRST(slp);
			if (vi != NULL && VIMPL_TO_VNODE(vi) == vp) {
				/*
				 * Put us back on the worklist.  The worklist
				 * routine will remove us from our current
				 * position and then add us back in at a later
				 * position.
				 *
				 * Try again sooner rather than later if
				 * we were unable to lock the vnode.  Lock
				 * failure should not prevent us from doing
				 * the sync "soon".
				 *
				 * If we locked it yet arrive here, it's
				 * likely that lazy sync is in progress and
				 * so the vnode still has dirty metadata.
				 * syncdelay is mainly to get this vnode out
				 * of the way so we do not consider it again
				 * "soon" in this loop, so the delay time is
				 * not critical as long as it is not "soon".
				 * While write-back strategy is the file
				 * system's domain, we expect write-back to
				 * occur no later than syncdelay seconds
				 * into the future.
				 */
				delayx = synced ? syncdelay : lockdelay;
				oslot = vi->vi_synclist_slot;
				vn_syncer_add1(vp, delayx);
				nslot = vi->vi_synclist_slot;
				SDT_PROBE4(vfs, syncer, worklist,
				    vnode__update,
				    vp, delayx, oslot, nslot);
			}
		}

		endtime = time_second;

		SDT_PROBE2(vfs, syncer, sync, done, starttime, endtime);

		/*
		 * If it has taken us less than a second to process the
		 * current work, then wait.  Otherwise start right over
		 * again.  We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (endtime == starttime) {
			kpause("syncer", false, hz, &syncer_data_lock);
		}
		mutex_exit(&syncer_data_lock);
	}
}

/*
 * Register the vfs.sync sysctl subtree: tunable delays for the syncer.
 */
static void
sysctl_vfs_syncfs_setup(struct sysctllog **clog)
{
	const struct sysctlnode *rnode, *cnode;

	sysctl_createv(clog, 0, NULL, &rnode,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "sync",
	    SYSCTL_DESCR("syncer options"),
	    NULL, 0, NULL, 0,
	    CTL_VFS, CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_QUAD, "delay",
	    SYSCTL_DESCR("max time to delay syncing data"),
	    NULL, 0, &syncdelay, 0,
	    CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_QUAD, "filedelay",
	    SYSCTL_DESCR("time to delay syncing files"),
	    NULL, 0, &filedelay, 0,
	    CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_QUAD, "dirdelay",
	    SYSCTL_DESCR("time to delay syncing directories"),
	    NULL, 0, &dirdelay, 0,
	    CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_QUAD, "metadelay",
	    SYSCTL_DESCR("time to delay syncing metadata"),
	    NULL, 0, &metadelay, 0,
	    CTL_CREATE, CTL_EOL);
}

/*
 * sysctl helper routine to return list of supported fstypes
 * (space-separated names copied out to the caller's buffer).
 */
int
sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
{
	char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)];
	char *where = oldp;
	struct vfsops *v;
	size_t needed, left, slen;
	int error, first;

	if (newp != NULL)
		return (EPERM);
	if (namelen != 0)
		return (EINVAL);

	first = 1;
	error = 0;
	needed = 0;
	left = *oldlenp;

	sysctl_unlock();
	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		/* Size-probe only: just total up the space required. */
		if (where == NULL)
			needed += strlen(v->vfs_name) + 1;
else { 1018 memset(bf, 0, sizeof(bf)); 1019 if (first) { 1020 strncpy(bf, v->vfs_name, sizeof(bf)); 1021 first = 0; 1022 } else { 1023 bf[0] = ' '; 1024 strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1); 1025 } 1026 bf[sizeof(bf)-1] = '\0'; 1027 slen = strlen(bf); 1028 if (left < slen + 1) 1029 break; 1030 v->vfs_refcount++; 1031 mutex_exit(&vfs_list_lock); 1032 /* +1 to copy out the trailing NUL byte */ 1033 error = copyout(bf, where, slen + 1); 1034 mutex_enter(&vfs_list_lock); 1035 v->vfs_refcount--; 1036 if (error) 1037 break; 1038 where += slen; 1039 needed += slen; 1040 left -= slen; 1041 } 1042 } 1043 mutex_exit(&vfs_list_lock); 1044 sysctl_relock(); 1045 *oldlenp = needed; 1046 return (error); 1047} 1048 1049int kinfo_vdebug = 1; 1050int kinfo_vgetfailed; 1051 1052#define KINFO_VNODESLOP 10 1053 1054/* 1055 * Dump vnode list (via sysctl). 1056 * Copyout address of vnode followed by vnode. 1057 */ 1058int 1059sysctl_kern_vnode(SYSCTLFN_ARGS) 1060{ 1061 char *where = oldp; 1062 size_t *sizep = oldlenp; 1063 struct mount *mp; 1064 vnode_t *vp, vbuf; 1065 mount_iterator_t *iter; 1066 struct vnode_iterator *marker; 1067 char *bp = where; 1068 char *ewhere; 1069 int error; 1070 1071 if (namelen != 0) 1072 return (EOPNOTSUPP); 1073 if (newp != NULL) 1074 return (EPERM); 1075 1076#define VPTRSZ sizeof(vnode_t *) 1077#define VNODESZ sizeof(vnode_t) 1078 if (where == NULL) { 1079 *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ); 1080 return (0); 1081 } 1082 ewhere = where + *sizep; 1083 1084 sysctl_unlock(); 1085 mountlist_iterator_init(&iter); 1086 while ((mp = mountlist_iterator_next(iter)) != NULL) { 1087 vfs_vnode_iterator_init(mp, &marker); 1088 while ((vp = vfs_vnode_iterator_next(marker, NULL, NULL))) { 1089 if (bp + VPTRSZ + VNODESZ > ewhere) { 1090 vrele(vp); 1091 vfs_vnode_iterator_destroy(marker); 1092 mountlist_iterator_destroy(iter); 1093 sysctl_relock(); 1094 *sizep = bp - where; 1095 return (ENOMEM); 1096 } 1097 memcpy(&vbuf, vp, VNODESZ); 1098 
if ((error = copyout(&vp, bp, VPTRSZ)) || 1099 (error = copyout(&vbuf, bp + VPTRSZ, VNODESZ))) { 1100 vrele(vp); 1101 vfs_vnode_iterator_destroy(marker); 1102 mountlist_iterator_destroy(iter); 1103 sysctl_relock(); 1104 return (error); 1105 } 1106 vrele(vp); 1107 bp += VPTRSZ + VNODESZ; 1108 } 1109 vfs_vnode_iterator_destroy(marker); 1110 } 1111 mountlist_iterator_destroy(iter); 1112 sysctl_relock(); 1113 1114 *sizep = bp - where; 1115 return (0); 1116} 1117 1118/* 1119 * Set vnode attributes to VNOVAL 1120 */ 1121void 1122vattr_null(struct vattr *vap) 1123{ 1124 1125 memset(vap, 0, sizeof(*vap)); 1126 1127 vap->va_type = VNON; 1128 1129 /* 1130 * Assign individually so that it is safe even if size and 1131 * sign of each member are varied. 1132 */ 1133 vap->va_mode = VNOVAL; 1134 vap->va_nlink = VNOVAL; 1135 vap->va_uid = VNOVAL; 1136 vap->va_gid = VNOVAL; 1137 vap->va_fsid = VNOVAL; 1138 vap->va_fileid = VNOVAL; 1139 vap->va_size = VNOVAL; 1140 vap->va_blocksize = VNOVAL; 1141 vap->va_atime.tv_sec = 1142 vap->va_mtime.tv_sec = 1143 vap->va_ctime.tv_sec = 1144 vap->va_birthtime.tv_sec = VNOVAL; 1145 vap->va_atime.tv_nsec = 1146 vap->va_mtime.tv_nsec = 1147 vap->va_ctime.tv_nsec = 1148 vap->va_birthtime.tv_nsec = VNOVAL; 1149 vap->va_gen = VNOVAL; 1150 vap->va_flags = VNOVAL; 1151 vap->va_rdev = VNOVAL; 1152 vap->va_bytes = VNOVAL; 1153} 1154 1155/* 1156 * Vnode state to string. 1157 */ 1158const char * 1159vstate_name(enum vnode_state state) 1160{ 1161 1162 switch (state) { 1163 case VS_ACTIVE: 1164 return "ACTIVE"; 1165 case VS_MARKER: 1166 return "MARKER"; 1167 case VS_LOADING: 1168 return "LOADING"; 1169 case VS_LOADED: 1170 return "LOADED"; 1171 case VS_BLOCKED: 1172 return "BLOCKED"; 1173 case VS_RECLAIMING: 1174 return "RECLAIMING"; 1175 case VS_RECLAIMED: 1176 return "RECLAIMED"; 1177 default: 1178 return "ILLEGAL"; 1179 } 1180} 1181 1182/* 1183 * Print a description of a vnode (common part). 
 */
static void
vprint_common(struct vnode *vp, const char *prefix,
    void (*pr)(const char *, ...) __printflike(1, 2))
{
	int n;
	char bf[96];
	const uint8_t *cp;
	vnode_impl_t *vip;
	const char * const vnode_tags[] = { VNODE_TAGS };
	const char * const vnode_types[] = { VNODE_TYPES };
	const char vnode_flagbits[] = VNODE_FLAGBITS;

/* Index an array, falling back to "UNKNOWN" for out-of-range values. */
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
#define ARRAY_PRINT(idx, arr) \
    ((unsigned int)(idx) < ARRAY_SIZE(arr) ? (arr)[(idx)] : "UNKNOWN")

	vip = VNODE_TO_VIMPL(vp);

	/* Decode the combined flag words symbolically. */
	snprintb(bf, sizeof(bf),
	    vnode_flagbits, vp->v_iflag | vp->v_vflag | vp->v_uflag);

	(*pr)("vnode %p flags %s\n", vp, bf);
	(*pr)("%stag %s(%d) type %s(%d) mount %p typedata %p\n", prefix,
	    ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag,
	    ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type,
	    vp->v_mount, vp->v_mountedhere);
	(*pr)("%susecount %d writecount %d holdcount %d\n", prefix,
	    vrefcnt(vp), vp->v_writecount, vp->v_holdcnt);
	(*pr)("%ssize %" PRIx64 " writesize %" PRIx64 " numoutput %d\n",
	    prefix, vp->v_size, vp->v_writesize, vp->v_numoutput);
	(*pr)("%sdata %p lock %p\n", prefix, vp->v_data, &vip->vi_lock);

	/* Dump the vnode cache key as hex bytes. */
	(*pr)("%sstate %s key(%p %zd)", prefix, vstate_name(vip->vi_state),
	    vip->vi_key.vk_mount, vip->vi_key.vk_key_len);
	n = vip->vi_key.vk_key_len;
	cp = vip->vi_key.vk_key;
	while (n-- > 0)
		(*pr)(" %02x", *cp++);
	(*pr)("\n");
	(*pr)("%slrulisthd %p\n", prefix, vip->vi_lrulisthd);

#undef ARRAY_PRINT
#undef ARRAY_SIZE
}

/*
 * Print out a description of a vnode.
 */
void
vprint(const char *label, struct vnode *vp)
{

	if (label != NULL)
		printf("%s: ", label);
	vprint_common(vp, "\t", printf);
	/* Let the file system print its private part, if any. */
	if (vp->v_data != NULL) {
		printf("\t");
		VOP_PRINT(vp);
	}
}

/*
 * Given a file system name, look up the vfsops for that
 * file system, or return NULL if file system isn't present
 * in the kernel.
 *
 * On success a reference is taken on the vfsops entry.
 */
struct vfsops *
vfs_getopsbyname(const char *name)
{
	struct vfsops *v;

	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (strcmp(v->vfs_name, name) == 0)
			break;
	}
	if (v != NULL)
		v->vfs_refcount++;
	mutex_exit(&vfs_list_lock);

	return (v);
}

/*
 * Copy the cached statvfs information from mp->mnt_stat into *sbp,
 * preserving any fields the file system itself filled in *sbp.
 */
void
copy_statvfs_info(struct statvfs *sbp, const struct mount *mp)
{
	const struct statvfs *mbp;

	/* Copying onto itself would be a no-op. */
	if (sbp == (mbp = &mp->mnt_stat))
		return;

	(void)memcpy(&sbp->f_fsidx, &mbp->f_fsidx, sizeof(sbp->f_fsidx));
	sbp->f_fsid = mbp->f_fsid;
	sbp->f_owner = mbp->f_owner;
	sbp->f_flag = mbp->f_flag;
	sbp->f_syncwrites = mbp->f_syncwrites;
	sbp->f_asyncwrites = mbp->f_asyncwrites;
	sbp->f_syncreads = mbp->f_syncreads;
	sbp->f_asyncreads = mbp->f_asyncreads;
	(void)memcpy(sbp->f_spare, mbp->f_spare, sizeof(mbp->f_spare));
	(void)memcpy(sbp->f_fstypename, mbp->f_fstypename,
	    sizeof(sbp->f_fstypename));
	(void)memcpy(sbp->f_mntonname, mbp->f_mntonname,
	    sizeof(sbp->f_mntonname));
	(void)memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname,
	    sizeof(sbp->f_mntfromname));
	(void)memcpy(sbp->f_mntfromlabel, mp->mnt_stat.f_mntfromlabel,
	    sizeof(sbp->f_mntfromlabel));
	sbp->f_namemax = mbp->f_namemax;
}

/*
 * Fill in the f_fstypename, f_mntonname and f_mntfromname fields of
 * mp->mnt_stat.  onp/fromp may live in kernel or user space as told
 * by ukon/ukfrom; if the process is chrooted the mount point name is
 * prefixed with the path of the root directory.
 */
int
set_statvfs_info(const char *onp, int ukon, const char *fromp, int ukfrom,
    const char *vfsname, struct mount *mp, struct lwp *l)
{
	int error;
	size_t size;
	struct statvfs *sfs = &mp->mnt_stat;
	/* copystr for kernel-space sources, copyinstr for user space. */
	int (*fun)(const void *, void *, size_t, size_t *);

	(void)strlcpy(mp->mnt_stat.f_fstypename, vfsname,
	    sizeof(mp->mnt_stat.f_fstypename));

	if (onp) {
		struct cwdinfo *cwdi = l->l_proc->p_cwdi;
		fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr;
		if (cwdi->cwdi_rdir != NULL) {
			size_t len;
			char *bp;
			char *path = PNBUF_GET();

			/*
			 * Build the path from the emulation root back to
			 * the real root; getcwd_common() fills the buffer
			 * backwards from the end.
			 */
			bp = path + MAXPATHLEN;
			*--bp = '\0';
			rw_enter(&cwdi->cwdi_lock, RW_READER);
			error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp,
			    path, MAXPATHLEN / 2, 0, l);
			rw_exit(&cwdi->cwdi_lock);
			if (error) {
				PNBUF_PUT(path);
				return error;
			}

			len = strlen(bp);
			if (len > sizeof(sfs->f_mntonname) - 1)
				len = sizeof(sfs->f_mntonname) - 1;
			/* NUL termination is handled by the memset below. */
			(void)strncpy(sfs->f_mntonname, bp, len);
			PNBUF_PUT(path);

			if (len < sizeof(sfs->f_mntonname) - 1) {
				/* Append onp after the root prefix. */
				error = (*fun)(onp, &sfs->f_mntonname[len],
				    sizeof(sfs->f_mntonname) - len - 1, &size);
				if (error)
					return error;
				size += len;
			} else {
				size = len;
			}
		} else {
			error = (*fun)(onp, &sfs->f_mntonname,
			    sizeof(sfs->f_mntonname) - 1, &size);
			if (error)
				return error;
		}
		/* Zero the unused tail (also NUL-terminates). */
		(void)memset(sfs->f_mntonname + size, 0,
		    sizeof(sfs->f_mntonname) - size);
	}

	if (fromp) {
		fun = (ukfrom == UIO_SYSSPACE) ? copystr : copyinstr;
		error = (*fun)(fromp, sfs->f_mntfromname,
		    sizeof(sfs->f_mntfromname) - 1, &size);
		if (error)
			return error;
		(void)memset(sfs->f_mntfromname + size, 0,
		    sizeof(sfs->f_mntfromname) - size);
	}
	return 0;
}

/*
 * Knob to control the precision of file timestamps:
 *
 *   0 = seconds only; nanoseconds zeroed.
 *   1 = seconds and nanoseconds, accurate within 1/HZ.
 *   2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

int vfs_timestamp_precision __read_mostly = TSP_NSEC;

/*
 * Produce a file timestamp at the precision selected by
 * vfs_timestamp_precision above.
 */
void
vfs_timestamp(struct timespec *tsp)
{
	struct timeval tv;

	switch (vfs_timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}

/*
 * The purpose of this routine is to remove granularity from accmode_t,
 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
 * VADMIN and VAPPEND.
 *
 * If it returns 0, the caller is supposed to continue with the usual
 * access checks using 'accmode' as modified by this routine. If it
 * returns nonzero value, the caller is supposed to return that value
 * as errno.
 *
 * Note that after this routine runs, accmode may be zero.
 */
int
vfs_unixify_accmode(accmode_t *accmode)
{
	/*
	 * There is no way to specify explicit "deny" rule using
	 * file mode or POSIX.1e ACLs.
	 */
	if (*accmode & VEXPLICIT_DENY) {
		*accmode = 0;
		return (0);
	}

	/*
	 * None of these can be translated into usual access bits.
	 * Also, the common case for NFSv4 ACLs is to not contain
	 * either of these bits. Caller should check for VWRITE
	 * on the containing directory instead.
	 */
	if (*accmode & (VDELETE_CHILD | VDELETE))
		return (EPERM);

	/* Collapse the fine-grained admin bits into VADMIN. */
	if (*accmode & VADMIN_PERMS) {
		*accmode &= ~VADMIN_PERMS;
		*accmode |= VADMIN;
	}

	/*
	 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
	 * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
	 */
	*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);

	return (0);
}

time_t rootfstime;			/* recorded root fs time, if known */

/* Record the root file system's timestamp (used at mountroot time). */
void
setrootfstime(time_t t)
{
	rootfstime = t;
}

/* Map enum vtype values to dirent DT_* type codes. */
static const uint8_t vttodt_tab[ ] = {
	[VNON]	=	DT_UNKNOWN,
	[VREG]	=	DT_REG,
	[VDIR]	=	DT_DIR,
	[VBLK]	=	DT_BLK,
	[VCHR]	=	DT_CHR,
	[VLNK]	=	DT_LNK,
	[VSOCK]	=	DT_SOCK,
	[VFIFO]	=	DT_FIFO,
	[VBAD]	=	DT_UNKNOWN
};

/*
 * Convert a vnode type to a dirent DT_* type.
 */
uint8_t
vtype2dt(enum vtype vt)
{

	/* The table must cover every enum vtype value. */
	CTASSERT(VBAD == __arraycount(vttodt_tab) - 1);
	return vttodt_tab[vt];
}

int
VFS_MOUNT(struct mount *mp, const char *a, void *b, size_t *c)
{
	int mpsafe = mp->mnt_iflag & IMNT_MPSAFE;
	int error;

	/*
	 * Note: The first time through, the vfs_mount function may set
	 * IMNT_MPSAFE, so we have to cache it on entry in order to
	 * avoid leaking a kernel lock.
	 *
	 * XXX Maybe the MPSAFE bit should be set in struct vfsops and
	 * not in struct mount.
1487 */ 1488 if (mpsafe) { 1489 KERNEL_LOCK(1, NULL); 1490 } 1491 error = (*(mp->mnt_op->vfs_mount))(mp, a, b, c); 1492 if (mpsafe) { 1493 KERNEL_UNLOCK_ONE(NULL); 1494 } 1495 1496 return error; 1497} 1498 1499int 1500VFS_START(struct mount *mp, int a) 1501{ 1502 int error; 1503 1504 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 1505 KERNEL_LOCK(1, NULL); 1506 } 1507 error = (*(mp->mnt_op->vfs_start))(mp, a); 1508 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 1509 KERNEL_UNLOCK_ONE(NULL); 1510 } 1511 1512 return error; 1513} 1514 1515int 1516VFS_UNMOUNT(struct mount *mp, int a) 1517{ 1518 int error; 1519 1520 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 1521 KERNEL_LOCK(1, NULL); 1522 } 1523 error = (*(mp->mnt_op->vfs_unmount))(mp, a); 1524 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 1525 KERNEL_UNLOCK_ONE(NULL); 1526 } 1527 1528 return error; 1529} 1530 1531int 1532VFS_ROOT(struct mount *mp, int lktype, struct vnode **a) 1533{ 1534 int error; 1535 1536 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 1537 KERNEL_LOCK(1, NULL); 1538 } 1539 error = (*(mp->mnt_op->vfs_root))(mp, lktype, a); 1540 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 1541 KERNEL_UNLOCK_ONE(NULL); 1542 } 1543 1544 return error; 1545} 1546 1547int 1548VFS_QUOTACTL(struct mount *mp, struct quotactl_args *args) 1549{ 1550 int error; 1551 1552 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 1553 KERNEL_LOCK(1, NULL); 1554 } 1555 error = (*(mp->mnt_op->vfs_quotactl))(mp, args); 1556 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 1557 KERNEL_UNLOCK_ONE(NULL); 1558 } 1559 1560 return error; 1561} 1562 1563int 1564VFS_STATVFS(struct mount *mp, struct statvfs *a) 1565{ 1566 int error; 1567 1568 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 1569 KERNEL_LOCK(1, NULL); 1570 } 1571 error = (*(mp->mnt_op->vfs_statvfs))(mp, a); 1572 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 1573 KERNEL_UNLOCK_ONE(NULL); 1574 } 1575 1576 return error; 1577} 1578 1579int 1580VFS_SYNC(struct mount *mp, int a, struct kauth_cred *b) 1581{ 1582 int error; 1583 1584 if 
((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 1585 KERNEL_LOCK(1, NULL); 1586 } 1587 error = (*(mp->mnt_op->vfs_sync))(mp, a, b); 1588 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 1589 KERNEL_UNLOCK_ONE(NULL); 1590 } 1591 1592 return error; 1593} 1594 1595int 1596VFS_FHTOVP(struct mount *mp, struct fid *a, int b, struct vnode **c) 1597{ 1598 int error; 1599 1600 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 1601 KERNEL_LOCK(1, NULL); 1602 } 1603 error = (*(mp->mnt_op->vfs_fhtovp))(mp, a, b, c); 1604 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 1605 KERNEL_UNLOCK_ONE(NULL); 1606 } 1607 1608 return error; 1609} 1610 1611int 1612VFS_VPTOFH(struct vnode *vp, struct fid *a, size_t *b) 1613{ 1614 int error; 1615 1616 if ((vp->v_vflag & VV_MPSAFE) == 0) { 1617 KERNEL_LOCK(1, NULL); 1618 } 1619 error = (*(vp->v_mount->mnt_op->vfs_vptofh))(vp, a, b); 1620 if ((vp->v_vflag & VV_MPSAFE) == 0) { 1621 KERNEL_UNLOCK_ONE(NULL); 1622 } 1623 1624 return error; 1625} 1626 1627int 1628VFS_SNAPSHOT(struct mount *mp, struct vnode *a, struct timespec *b) 1629{ 1630 int error; 1631 1632 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 1633 KERNEL_LOCK(1, NULL); 1634 } 1635 error = (*(mp->mnt_op->vfs_snapshot))(mp, a, b); 1636 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 1637 KERNEL_UNLOCK_ONE(NULL); 1638 } 1639 1640 return error; 1641} 1642 1643int 1644VFS_EXTATTRCTL(struct mount *mp, int a, struct vnode *b, int c, const char *d) 1645{ 1646 int error; 1647 1648 KERNEL_LOCK(1, NULL); /* XXXSMP check ffs */ 1649 error = (*(mp->mnt_op->vfs_extattrctl))(mp, a, b, c, d); 1650 KERNEL_UNLOCK_ONE(NULL); /* XXX */ 1651 1652 return error; 1653} 1654 1655int 1656VFS_SUSPENDCTL(struct mount *mp, int a) 1657{ 1658 int error; 1659 1660 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 1661 KERNEL_LOCK(1, NULL); 1662 } 1663 error = (*(mp->mnt_op->vfs_suspendctl))(mp, a); 1664 if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) { 1665 KERNEL_UNLOCK_ONE(NULL); 1666 } 1667 1668 return error; 1669} 1670 1671#if defined(DDB) || defined(DEBUGPRINT) 
static const char buf_flagbits[] = BUF_FLAGBITS;

/*
 * Print a description of a buffer (for DDB / debug builds).
 */
void
vfs_buf_print(struct buf *bp, int full, void (*pr)(const char *, ...))
{
	char bf[1024];

	(*pr)(" vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" rawblkno 0x%"
	    PRIx64 " dev 0x%x\n",
	    bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_rawblkno, bp->b_dev);

	/* Decode the combined flag words symbolically. */
	snprintb(bf, sizeof(bf),
	    buf_flagbits, bp->b_flags | bp->b_oflags | bp->b_cflags);
	(*pr)(" error %d flags %s\n", bp->b_error, bf);

	(*pr)(" bufsize 0x%lx bcount 0x%lx resid 0x%lx\n",
	    bp->b_bufsize, bp->b_bcount, bp->b_resid);
	(*pr)(" data %p saveaddr %p\n",
	    bp->b_data, bp->b_saveaddr);
	(*pr)(" iodone %p objlock %p\n", bp->b_iodone, bp->b_objlock);
}

/*
 * Print a description of a vnode; with "full" also dump its clean
 * and dirty buffer lists (for DDB / debug builds).
 */
void
vfs_vnode_print(struct vnode *vp, int full, void (*pr)(const char *, ...))
{

	uvm_object_printit(&vp->v_uobj, full, pr);
	(*pr)("\n");
	vprint_common(vp, "", pr);
	if (full) {
		struct buf *bp;

		(*pr)("clean bufs:\n");
		LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) {
			(*pr)(" bp %p\n", bp);
			vfs_buf_print(bp, full, pr);
		}

		(*pr)("dirty bufs:\n");
		LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
			(*pr)(" bp %p\n", bp);
			vfs_buf_print(bp, full, pr);
		}
	}
}

/*
 * Given a lock address, scan all mounted vnodes and print any vnode
 * whose vnode lock or interlock matches (for DDB / debug builds).
 */
void
vfs_vnode_lock_print(void *vlock, int full, void (*pr)(const char *, ...))
{
	struct mount *mp;
	vnode_impl_t *vip;

	for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp)) {
		TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) {
			if (&vip->vi_lock == vlock ||
			    VIMPL_TO_VNODE(vip)->v_interlock == vlock)
				vfs_vnode_print(VIMPL_TO_VNODE(vip), full, pr);
		}
	}
}

/*
 * Print every mount in the system (for DDB / debug builds).
 */
void
vfs_mount_print_all(int full, void (*pr)(const char *, ...))
{
	struct mount *mp;
	for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp))
		vfs_mount_print(mp, full, pr);
}

/*
 * Print a description of a mount point, including its cached statvfs
 * data and (with "full") every vnode on it (for DDB / debug builds).
 */
void
vfs_mount_print(struct mount *mp, int full, void (*pr)(const char *, ...))
{
	char sbuf[256];

	(*pr)("vnodecovered = %p data = %p\n",
	    mp->mnt_vnodecovered, mp->mnt_data);

	(*pr)("fs_bshift %d dev_bshift = %d\n",
	    mp->mnt_fs_bshift, mp->mnt_dev_bshift);

	snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_flag);
	(*pr)("flag = %s\n", sbuf);

	snprintb(sbuf, sizeof(sbuf), __IMNT_FLAG_BITS, mp->mnt_iflag);
	(*pr)("iflag = %s\n", sbuf);

	(*pr)("refcnt = %d updating @ %p\n", mp->mnt_refcnt, mp->mnt_updating);

	(*pr)("statvfs cache:\n");
	(*pr)("\tbsize = %lu\n", mp->mnt_stat.f_bsize);
	(*pr)("\tfrsize = %lu\n", mp->mnt_stat.f_frsize);
	(*pr)("\tiosize = %lu\n", mp->mnt_stat.f_iosize);

	(*pr)("\tblocks = %"PRIu64"\n", mp->mnt_stat.f_blocks);
	(*pr)("\tbfree = %"PRIu64"\n", mp->mnt_stat.f_bfree);
	(*pr)("\tbavail = %"PRIu64"\n", mp->mnt_stat.f_bavail);
	(*pr)("\tbresvd = %"PRIu64"\n", mp->mnt_stat.f_bresvd);

	(*pr)("\tfiles = %"PRIu64"\n", mp->mnt_stat.f_files);
	(*pr)("\tffree = %"PRIu64"\n", mp->mnt_stat.f_ffree);
	(*pr)("\tfavail = %"PRIu64"\n", mp->mnt_stat.f_favail);
	(*pr)("\tfresvd = %"PRIu64"\n", mp->mnt_stat.f_fresvd);

	(*pr)("\tf_fsidx = { 0x%"PRIx32", 0x%"PRIx32" }\n",
	    mp->mnt_stat.f_fsidx.__fsid_val[0],
	    mp->mnt_stat.f_fsidx.__fsid_val[1]);

	(*pr)("\towner = %"PRIu32"\n", mp->mnt_stat.f_owner);
	(*pr)("\tnamemax = %lu\n", mp->mnt_stat.f_namemax);

	snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_stat.f_flag);

	(*pr)("\tflag = %s\n", sbuf);
	(*pr)("\tsyncwrites = %" PRIu64 "\n", mp->mnt_stat.f_syncwrites);
	(*pr)("\tasyncwrites = %" PRIu64 "\n", mp->mnt_stat.f_asyncwrites);
	(*pr)("\tsyncreads = %" PRIu64 "\n", mp->mnt_stat.f_syncreads);
	(*pr)("\tasyncreads = %" PRIu64 "\n", mp->mnt_stat.f_asyncreads);
	(*pr)("\tfstypename = %s\n", mp->mnt_stat.f_fstypename);
	(*pr)("\tmntonname = %s\n", mp->mnt_stat.f_mntonname);
	(*pr)("\tmntfromname = %s\n", mp->mnt_stat.f_mntfromname);

	{
		/* List locked vnodes, six addresses per output line. */
		int cnt = 0;
		vnode_t *vp;
		vnode_impl_t *vip;
		(*pr)("locked vnodes =");
		TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) {
			vp = VIMPL_TO_VNODE(vip);
			if (VOP_ISLOCKED(vp)) {
				if ((++cnt % 6) == 0) {
					(*pr)(" %p,\n\t", vp);
				} else {
					(*pr)(" %p,", vp);
				}
			}
		}
		(*pr)("\n");
	}

	if (full) {
		/* List all vnodes; the last entry gets no trailing comma. */
		int cnt = 0;
		vnode_t *vp;
		vnode_impl_t *vip;
		(*pr)("all vnodes =");
		TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) {
			vp = VIMPL_TO_VNODE(vip);
			if (!TAILQ_NEXT(vip, vi_mntvnodes)) {
				(*pr)(" %p", vp);
			} else if ((++cnt % 6) == 0) {
				(*pr)(" %p,\n\t", vp);
			} else {
				(*pr)(" %p,", vp);
			}
		}
		(*pr)("\n");
	}
}

/*
 * List all of the locked vnodes in the system.
 */
void printlockedvnodes(void);

void
printlockedvnodes(void)
{
	struct mount *mp;
	vnode_t *vp;
	vnode_impl_t *vip;

	printf("Locked vnodes\n");
	for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp)) {
		TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) {
			vp = VIMPL_TO_VNODE(vip);
			if (VOP_ISLOCKED(vp))
				vprint(NULL, vp);
		}
	}
}

#endif /* DDB || DEBUGPRINT */