/*	$NetBSD: vfs_subr.c,v 1.500 2023/04/30 08:46:11 riastradh Exp $	*/

/*-
 * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008, 2019, 2020
 *     The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, by Andrew Doran,
 * by Marshall Kirk McKusick and Greg Ganger at the University of Michigan.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.500 2023/04/30 08:46:11 riastradh Exp $");

#ifdef _KERNEL_OPT
#include "opt_compat_43.h"
#include "opt_compat_netbsd.h"
#include "opt_ddb.h"
#endif

#include <sys/param.h>
#include <sys/types.h>

#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/errno.h>
#include <sys/filedesc.h>
#include <sys/fstrans.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/stat.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode_impl.h>

#include <miscfs/deadfs/deadfs.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>

#include <uvm/uvm_ddb.h>

SDT_PROBE_DEFINE3(vfs, syncer, worklist, vnode__add,
    "struct vnode *"/*vp*/,
    "int"/*delayx*/,
    "int"/*slot*/);
SDT_PROBE_DEFINE4(vfs, syncer, worklist, vnode__update,
    "struct vnode *"/*vp*/,
    "int"/*delayx*/,
    "int"/*oslot*/,
    "int"/*nslot*/);
SDT_PROBE_DEFINE1(vfs, syncer, worklist, vnode__remove,
    "struct vnode *"/*vp*/);

SDT_PROBE_DEFINE3(vfs, syncer, worklist, mount__add,
    "struct mount *"/*mp*/,
    "int"/*vdelay*/,
    "int"/*slot*/);
SDT_PROBE_DEFINE4(vfs, syncer, worklist, mount__update,
    "struct mount *"/*mp*/,
    "int"/*vdelay*/,
    "int"/*oslot*/,
    "int"/*nslot*/);
SDT_PROBE_DEFINE1(vfs, syncer, worklist, mount__remove,
    "struct mount *"/*mp*/);

SDT_PROBE_DEFINE1(vfs, syncer, sync, start,
    "int"/*starttime*/);
SDT_PROBE_DEFINE1(vfs, syncer, sync, mount__start,
    "struct mount *"/*mp*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, mount__done,
    "struct mount *"/*mp*/,
    "int"/*error*/);
SDT_PROBE_DEFINE1(vfs, syncer, sync, mount__skip,
    "struct mount *"/*mp*/);
SDT_PROBE_DEFINE1(vfs, syncer, sync, vnode__start,
    "struct vnode *"/*vp*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, vnode__done,
    "struct vnode *"/*vp*/,
    "int"/*error*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, vnode__fail__lock,
    "struct vnode *"/*vp*/,
    "int"/*error*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, vnode__fail__vget,
    "struct vnode *"/*vp*/,
    "int"/*error*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, done,
    "int"/*starttime*/,
    "int"/*endtime*/);

const enum vtype iftovt_tab[16] = {
	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
const int	vttoif_tab[9] = {
	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
	S_IFSOCK, S_IFIFO, S_IFMT,
};
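
/*
 * Illustration: these tables back the IFTOVT() and VTTOIF() macros in
 * <sys/vnode.h>, indexing by the high four bits of a file mode and by
 * vnode type respectively.  For example:
 *
 *	iftovt_tab[(S_IFDIR & S_IFMT) >> 12] == VDIR
 *	vttoif_tab[VDIR] == S_IFDIR
 */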

/*
 * Insq/Remq for the vnode usage lists.
 */
#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define	bufremvn(bp) {							\
	LIST_REMOVE(bp, b_vnbufs);					\
	(bp)->b_vnbufs.le_next = NOLIST;				\
}

int doforce = 1;		/* 1 => permit forcible unmounting */

/*
 * Local declarations.
 */

static void vn_initialize_syncerd(void);

/*
 * Initialize the vnode management data structures.
 */
void
vntblinit(void)
{

	vn_initialize_syncerd();
	vfs_mount_sysinit();
	vfs_vnode_sysinit();
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying vnode locked, which should prevent new dirty
 * buffers from being queued.
 */
int
vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred, struct lwp *l,
	  bool catch_p, int slptimeo)
{
	struct buf *bp, *nbp;
	int error;
	int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO |
	    (flags & V_SAVE ? PGO_CLEANIT | PGO_RECLAIM : 0);

	/* XXXUBC this doesn't look at flags or slp* */
	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
	error = VOP_PUTPAGES(vp, 0, 0, flushflags);
	if (error) {
		return error;
	}

	if (flags & V_SAVE) {
		error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0);
		if (error)
			return (error);
		KASSERT(LIST_EMPTY(&vp->v_dirtyblkhd));
	}

	mutex_enter(&bufcache_lock);
restart:
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		KASSERT(bp->b_vp == vp);
		nbp = LIST_NEXT(bp, b_vnbufs);
		error = bbusy(bp, catch_p, slptimeo, NULL);
		if (error != 0) {
			if (error == EPASSTHROUGH)
				goto restart;
			mutex_exit(&bufcache_lock);
			return (error);
		}
		brelsel(bp, BC_INVAL | BC_VFLUSH);
	}

	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		KASSERT(bp->b_vp == vp);
		nbp = LIST_NEXT(bp, b_vnbufs);
		error = bbusy(bp, catch_p, slptimeo, NULL);
		if (error != 0) {
			if (error == EPASSTHROUGH)
				goto restart;
			mutex_exit(&bufcache_lock);
			return (error);
		}
		/*
		 * XXX Since there are no node locks for NFS, I believe
		 * there is a slight chance that a delayed write will
		 * occur while sleeping just above, so check for it.
		 */
		if ((bp->b_oflags & BO_DELWRI) && (flags & V_SAVE)) {
#ifdef DEBUG
			printf("buffer still DELWRI\n");
#endif
			bp->b_cflags |= BC_BUSY | BC_VFLUSH;
			mutex_exit(&bufcache_lock);
			VOP_BWRITE(bp->b_vp, bp);
			mutex_enter(&bufcache_lock);
			goto restart;
		}
		brelsel(bp, BC_INVAL | BC_VFLUSH);
	}

#ifdef DIAGNOSTIC
	if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))
		panic("vinvalbuf: flush failed, vp %p", vp);
#endif

	mutex_exit(&bufcache_lock);

	return (0);
}
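
/*
 * Usage sketch: flush a vnode's buffers while first writing out any
 * dirty data, without catching signals and with no sleep timeout:
 *
 *	error = vinvalbuf(vp, V_SAVE, cred, l, false, 0);
 *
 * Passing 0 instead of V_SAVE discards dirty buffers instead.
 */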

/*
 * Destroy any in core blocks past the truncation length.
 * Called with the underlying vnode locked, which should prevent new dirty
 * buffers from being queued.
 */
int
vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch_p, int slptimeo)
{
	struct buf *bp, *nbp;
	int error;
	voff_t off;

	off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
	error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
	if (error) {
		return error;
	}

	mutex_enter(&bufcache_lock);
restart:
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		KASSERT(bp->b_vp == vp);
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (bp->b_lblkno < lbn)
			continue;
		error = bbusy(bp, catch_p, slptimeo, NULL);
		if (error != 0) {
			if (error == EPASSTHROUGH)
				goto restart;
			mutex_exit(&bufcache_lock);
			return (error);
		}
		brelsel(bp, BC_INVAL | BC_VFLUSH);
	}

	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
		KASSERT(bp->b_vp == vp);
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (bp->b_lblkno < lbn)
			continue;
		error = bbusy(bp, catch_p, slptimeo, NULL);
		if (error != 0) {
			if (error == EPASSTHROUGH)
				goto restart;
			mutex_exit(&bufcache_lock);
			return (error);
		}
		brelsel(bp, BC_INVAL | BC_VFLUSH);
	}
	mutex_exit(&bufcache_lock);

	return (0);
}

/*
 * Flush all dirty buffers from a vnode.
 * Called with the underlying vnode locked, which should prevent new dirty
 * buffers from being queued.
 */
int
vflushbuf(struct vnode *vp, int flags)
{
	struct buf *bp, *nbp;
	int error, pflags;
	bool dirty, sync;

	sync = (flags & FSYNC_WAIT) != 0;
	pflags = PGO_CLEANIT | PGO_ALLPAGES |
		(sync ? PGO_SYNCIO : 0) |
		((flags & FSYNC_LAZY) ? PGO_LAZY : 0);
	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
	(void) VOP_PUTPAGES(vp, 0, 0, pflags);

loop:
	mutex_enter(&bufcache_lock);
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		KASSERT(bp->b_vp == vp);
		nbp = LIST_NEXT(bp, b_vnbufs);
		if ((bp->b_cflags & BC_BUSY))
			continue;
		if ((bp->b_oflags & BO_DELWRI) == 0)
			panic("vflushbuf: not dirty, bp %p", bp);
		bp->b_cflags |= BC_BUSY | BC_VFLUSH;
		mutex_exit(&bufcache_lock);
		/*
		 * Wait for I/O associated with indirect blocks to complete,
		 * since there is no way to quickly wait for them below.
		 */
		if (bp->b_vp == vp || !sync)
			(void) bawrite(bp);
		else {
			error = bwrite(bp);
			if (error)
				return error;
		}
		goto loop;
	}
	mutex_exit(&bufcache_lock);

	if (!sync)
		return 0;

	mutex_enter(vp->v_interlock);
	while (vp->v_numoutput != 0)
		cv_wait(&vp->v_cv, vp->v_interlock);
	dirty = !LIST_EMPTY(&vp->v_dirtyblkhd);
	mutex_exit(vp->v_interlock);

	if (dirty) {
		vprint("vflushbuf: dirty", vp);
		goto loop;
	}

	return 0;
}

/*
 * Create a vnode for a block device.
 * Used for root filesystem and swap areas.
 * Also used for memory file system special devices.
 */
int
bdevvp(dev_t dev, vnode_t **vpp)
{
	struct vattr va;

	vattr_null(&va);
	va.va_type = VBLK;
	va.va_rdev = dev;

	return vcache_new(dead_rootmount, NULL, &va, NOCRED, NULL, vpp);
}

/*
 * Create a vnode for a character device.
 * Used for kernfs and some console handling.
 */
int
cdevvp(dev_t dev, vnode_t **vpp)
{
	struct vattr va;

	vattr_null(&va);
	va.va_type = VCHR;
	va.va_rdev = dev;

	return vcache_new(dead_rootmount, NULL, &va, NOCRED, NULL, vpp);
}
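
/*
 * Usage sketch: early root setup creates the root device vnode roughly
 * like this (rootdev and rootvp are the kernel's usual globals):
 *
 *	if (bdevvp(rootdev, &rootvp))
 *		panic("cannot set up root device vnode");
 */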

/*
 * Associate a buffer with a vnode.  There must already be a hold on
 * the vnode.
 */
void
bgetvp(struct vnode *vp, struct buf *bp)
{

	KASSERT(bp->b_vp == NULL);
	KASSERT(bp->b_objlock == &buffer_lock);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(mutex_owned(&bufcache_lock));
	KASSERT((bp->b_cflags & BC_BUSY) != 0);
	KASSERT(!cv_has_waiters(&bp->b_done));

	vholdl(vp);
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;

	/*
	 * Insert onto list for new vnode.
	 */
	bufinsvn(bp, &vp->v_cleanblkhd);
	bp->b_objlock = vp->v_interlock;
}

/*
 * Disassociate a buffer from a vnode.
 */
void
brelvp(struct buf *bp)
{
	struct vnode *vp = bp->b_vp;

	KASSERT(vp != NULL);
	KASSERT(bp->b_objlock == vp->v_interlock);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT(mutex_owned(&bufcache_lock));
	KASSERT((bp->b_cflags & BC_BUSY) != 0);
	KASSERT(!cv_has_waiters(&bp->b_done));

	/*
	 * Delete from old vnode list, if on one.
	 */
	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
		bufremvn(bp);

	if ((vp->v_iflag & (VI_ONWORKLST | VI_PAGES)) == VI_ONWORKLST &&
	    LIST_FIRST(&vp->v_dirtyblkhd) == NULL)
		vn_syncer_remove_from_worklist(vp);

	bp->b_objlock = &buffer_lock;
	bp->b_vp = NULL;
	holdrelel(vp);
}

/*
 * Reassign a buffer from one vnode list to another.
 * The list reassignment must be within the same vnode.
 * Used to assign file specific control information
 * (indirect blocks) to the list to which they belong.
 */
void
reassignbuf(struct buf *bp, struct vnode *vp)
{
	struct buflists *listheadp;
	int delayx;

	KASSERT(mutex_owned(&bufcache_lock));
	KASSERT(bp->b_objlock == vp->v_interlock);
	KASSERT(mutex_owned(vp->v_interlock));
	KASSERT((bp->b_cflags & BC_BUSY) != 0);

	/*
	 * Delete from old vnode list, if on one.
	 */
	if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
		bufremvn(bp);

	/*
	 * If dirty, put on list of dirty buffers;
	 * otherwise insert onto list of clean buffers.
	 */
	if ((bp->b_oflags & BO_DELWRI) == 0) {
		listheadp = &vp->v_cleanblkhd;
		if ((vp->v_iflag & (VI_ONWORKLST | VI_PAGES)) ==
		    VI_ONWORKLST &&
		    LIST_FIRST(&vp->v_dirtyblkhd) == NULL)
			vn_syncer_remove_from_worklist(vp);
	} else {
		listheadp = &vp->v_dirtyblkhd;
		if ((vp->v_iflag & VI_ONWORKLST) == 0) {
			switch (vp->v_type) {
			case VDIR:
				delayx = dirdelay;
				break;
			case VBLK:
				if (spec_node_getmountedfs(vp) != NULL) {
					delayx = metadelay;
					break;
				}
				/* fall through */
			default:
				delayx = filedelay;
				break;
			}
			if (!vp->v_mount ||
			    (vp->v_mount->mnt_flag & MNT_ASYNC) == 0)
				vn_syncer_add_to_worklist(vp, delayx);
		}
	}
	bufinsvn(bp, listheadp);
}

/*
 * Lookup a vnode by device number and return it referenced.
 */
int
vfinddev(dev_t dev, enum vtype type, vnode_t **vpp)
{

	return (spec_node_lookup_by_dev(type, dev, VDEAD_NOWAIT, vpp) == 0);
}

/*
 * Revoke all the vnodes corresponding to the specified minor number
 * range (endpoints inclusive) of the specified major.
 */
void
vdevgone(int maj, int minl, int minh, enum vtype type)
{
	vnode_t *vp;
	dev_t dev;
	int mn;

	for (mn = minl; mn <= minh; mn++) {
		dev = makedev(maj, mn);
		/*
		 * Notify anyone trying to get at this device that it
		 * has been detached, and then revoke it.
		 */
		switch (type) {
		case VBLK:
			bdev_detached(dev);
			break;
		case VCHR:
			cdev_detached(dev);
			break;
		default:
			panic("invalid specnode type: %d", type);
		}
		/*
		 * Passing 0 as flags, instead of VDEAD_NOWAIT, means
		 * spec_node_lookup_by_dev will wait for vnodes it
		 * finds concurrently being revoked before returning.
		 */
		while (spec_node_lookup_by_dev(type, dev, 0, &vp) == 0) {
			VOP_REVOKE(vp, REVOKEALL);
			vrele(vp);
		}
	}
}
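
/*
 * Usage sketch: a driver's detach routine revokes any open instances
 * of its device nodes like this (bmaj/cmaj/mn_max are illustrative
 * names for the driver's majors and highest minor):
 *
 *	vdevgone(bmaj, 0, mn_max, VBLK);
 *	vdevgone(cmaj, 0, mn_max, VCHR);
 */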

/*
 * The filesystem synchronizer mechanism - syncer.
 *
 * It is useful to delay writes of file data and filesystem metadata for
 * a certain amount of time so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed.  To implement this,
 * vnodes are appended to a "workitem" queue.
 *
 * Most pending metadata should not wait for more than ten seconds.  Thus,
 * directory updates are delayed only about half the time that file data
 * is delayed.  Metadata on filesystems mounted on block devices is more
 * critical still, so it is delayed only about a third of the time that
 * file data is delayed.
 *
 * There are SYNCER_MAXDELAY queues that are processed in a round-robin
 * manner at a rate of one each second (driven off the filesystem syncer
 * thread).  The syncer_delayno variable indicates the next queue that is
 * to be processed.  Items that need to be processed soon are placed in
 * this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of e.g. fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) % syncer_last]
 *
 * The flag VI_ONWORKLST indicates that the vnode is on the work queue.
 */

#define SYNCER_MAXDELAY		32

typedef TAILQ_HEAD(synclist, vnode_impl) synclist_t;

static void	vn_syncer_add1(struct vnode *, int);
static void	sysctl_vfs_syncfs_setup(struct sysctllog **);

/*
 * Defines and variables for the syncer process.
 */
int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
time_t syncdelay = 30;			/* max time to delay syncing data */
time_t filedelay = 30;			/* time to delay syncing files */
time_t dirdelay  = 15;			/* time to delay syncing directories */
time_t metadelay = 10;			/* time to delay syncing metadata */
time_t lockdelay = 1;			/* time to delay if locking fails */

static kmutex_t		syncer_data_lock; /* short term lock on data structs */

static int		syncer_delayno = 0;
static long		syncer_last;
static synclist_t *	syncer_workitem_pending;

static void
vn_initialize_syncerd(void)
{
	int i;

	syncer_last = SYNCER_MAXDELAY + 2;

	sysctl_vfs_syncfs_setup(NULL);

	syncer_workitem_pending =
	    kmem_alloc(syncer_last * sizeof (struct synclist), KM_SLEEP);

	for (i = 0; i < syncer_last; i++)
		TAILQ_INIT(&syncer_workitem_pending[i]);

	mutex_init(&syncer_data_lock, MUTEX_DEFAULT, IPL_NONE);
}

/*
 * Return delay factor appropriate for the given file system.   For
 * WAPBL we use the sync vnode to burst out metadata updates: sync
 * those file systems more frequently.
 */
static inline int
sync_delay(struct mount *mp)
{

	return mp->mnt_wapbl != NULL ? metadelay : syncdelay;
}

/*
 * Compute the next slot index from delay.
 */
static inline int
sync_delay_slot(int delayx)
{

	if (delayx > syncer_maxdelay - 2)
		delayx = syncer_maxdelay - 2;
	return (syncer_delayno + delayx) % syncer_last;
}
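
/*
 * Worked example: with syncer_last == 34 (SYNCER_MAXDELAY + 2) and
 * syncer_delayno == 30, a delay of metadelay (10) seconds selects slot
 * (30 + 10) % 34 == 6, which the once-per-second syncer loop reaches
 * ten iterations from now.  The clamp to syncer_maxdelay - 2 keeps the
 * offset below the ring size, so a request never wraps around onto the
 * slot currently being drained.
 */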

/*
 * Add an item to the syncer work queue.
 */
static void
vn_syncer_add1(struct vnode *vp, int delayx)
{
	synclist_t *slp;
	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);

	KASSERT(mutex_owned(&syncer_data_lock));

	if (vp->v_iflag & VI_ONWORKLST) {
		/*
		 * Remove in order to adjust the position of the vnode.
		 * Note: called from sched_sync(), which will not hold
		 * interlock, therefore we cannot modify v_iflag here.
		 */
		slp = &syncer_workitem_pending[vip->vi_synclist_slot];
		TAILQ_REMOVE(slp, vip, vi_synclist);
	} else {
		KASSERT(mutex_owned(vp->v_interlock));
		vp->v_iflag |= VI_ONWORKLST;
	}

	vip->vi_synclist_slot = sync_delay_slot(delayx);

	slp = &syncer_workitem_pending[vip->vi_synclist_slot];
	TAILQ_INSERT_TAIL(slp, vip, vi_synclist);
}

void
vn_syncer_add_to_worklist(struct vnode *vp, int delayx)
{
	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);

	KASSERT(mutex_owned(vp->v_interlock));

	mutex_enter(&syncer_data_lock);
	vn_syncer_add1(vp, delayx);
	SDT_PROBE3(vfs, syncer, worklist, vnode__add,
	    vp, delayx, vip->vi_synclist_slot);
	mutex_exit(&syncer_data_lock);
}

/*
 * Remove an item from the syncer work queue.
 */
void
vn_syncer_remove_from_worklist(struct vnode *vp)
{
	synclist_t *slp;
	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);

	KASSERT(mutex_owned(vp->v_interlock));

	if (vp->v_iflag & VI_ONWORKLST) {
		mutex_enter(&syncer_data_lock);
		SDT_PROBE1(vfs, syncer, worklist, vnode__remove,  vp);
		vp->v_iflag &= ~VI_ONWORKLST;
		slp = &syncer_workitem_pending[vip->vi_synclist_slot];
		TAILQ_REMOVE(slp, vip, vi_synclist);
		mutex_exit(&syncer_data_lock);
	}
}

/*
 * Add this mount point to the syncer.
 */
void
vfs_syncer_add_to_worklist(struct mount *mp)
{
	static int start, incr, next;
	int vdelay;

	KASSERT(mutex_owned(mp->mnt_updating));
	KASSERT((mp->mnt_iflag & IMNT_ONWORKLIST) == 0);

	/*
	 * We attempt to scatter the mount points on the list
	 * so that they will go off at evenly distributed times
	 * even if all the filesystems are mounted at once.
	 */

	next += incr;
	if (next == 0 || next > syncer_maxdelay) {
		start /= 2;
		incr /= 2;
		if (start == 0) {
			start = syncer_maxdelay / 2;
			incr = syncer_maxdelay;
		}
		next = start;
	}
	mp->mnt_iflag |= IMNT_ONWORKLIST;
	vdelay = sync_delay(mp);
	mp->mnt_synclist_slot = vdelay > 0 ? next % vdelay : 0;
	SDT_PROBE3(vfs, syncer, worklist, mount__add,
	    mp, vdelay, mp->mnt_synclist_slot);
}

/*
 * Remove the mount point from the syncer.
 */
void
vfs_syncer_remove_from_worklist(struct mount *mp)
{

	KASSERT(mutex_owned(mp->mnt_updating));
	KASSERT((mp->mnt_iflag & IMNT_ONWORKLIST) != 0);

	SDT_PROBE1(vfs, syncer, worklist, mount__remove,  mp);
	mp->mnt_iflag &= ~IMNT_ONWORKLIST;
}

/*
 * Try lazy sync, return true on success.
 */
static bool
lazy_sync_vnode(struct vnode *vp)
{
	bool synced;
	int error;

	KASSERT(mutex_owned(&syncer_data_lock));

	synced = false;
	if ((error = vcache_tryvget(vp)) == 0) {
		mutex_exit(&syncer_data_lock);
		if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT)) == 0) {
			synced = true;
			SDT_PROBE1(vfs, syncer, sync, vnode__start,  vp);
			error = VOP_FSYNC(vp, curlwp->l_cred,
			    FSYNC_LAZY, 0, 0);
			SDT_PROBE2(vfs, syncer, sync, vnode__done,  vp, error);
			vput(vp);
		} else {
			SDT_PROBE2(vfs, syncer, sync, vnode__fail__lock,
			    vp, error);
			vrele(vp);
		}
		mutex_enter(&syncer_data_lock);
	} else {
		SDT_PROBE2(vfs, syncer, sync, vnode__fail__vget,  vp, error);
	}
	return synced;
}

/*
 * System filesystem synchronizer daemon.
 */
void
sched_sync(void *arg)
{
	mount_iterator_t *iter;
	synclist_t *slp;
	struct vnode_impl *vi;
	struct vnode *vp;
	struct mount *mp;
	time_t starttime, endtime;
	int vdelay, oslot, nslot, delayx;
	bool synced;
	int error;

	for (;;) {
		starttime = time_second;
		SDT_PROBE1(vfs, syncer, sync, start,  starttime);

		/*
		 * Sync mounts whose dirty time has expired.
		 */
		mountlist_iterator_init(&iter);
		while ((mp = mountlist_iterator_trynext(iter)) != NULL) {
			if ((mp->mnt_iflag & IMNT_ONWORKLIST) == 0 ||
			    mp->mnt_synclist_slot != syncer_delayno) {
				SDT_PROBE1(vfs, syncer, sync, mount__skip,
				    mp);
				continue;
			}

			vdelay = sync_delay(mp);
			oslot = mp->mnt_synclist_slot;
			nslot = sync_delay_slot(vdelay);
			mp->mnt_synclist_slot = nslot;
			SDT_PROBE4(vfs, syncer, worklist, mount__update,
			    mp, vdelay, oslot, nslot);

			SDT_PROBE1(vfs, syncer, sync, mount__start,  mp);
			error = VFS_SYNC(mp, MNT_LAZY, curlwp->l_cred);
			SDT_PROBE2(vfs, syncer, sync, mount__done,
			    mp, error);
		}
		mountlist_iterator_destroy(iter);

		mutex_enter(&syncer_data_lock);

		/*
		 * Push files whose dirty time has expired.
		 */
		slp = &syncer_workitem_pending[syncer_delayno];
		syncer_delayno += 1;
		if (syncer_delayno >= syncer_last)
			syncer_delayno = 0;

		while ((vi = TAILQ_FIRST(slp)) != NULL) {
			vp = VIMPL_TO_VNODE(vi);
			synced = lazy_sync_vnode(vp);

			/*
			 * XXX The vnode may have been recycled, in which
			 * case it may have a new identity.
			 */
			vi = TAILQ_FIRST(slp);
			if (vi != NULL && VIMPL_TO_VNODE(vi) == vp) {
				/*
				 * Put us back on the worklist.  The worklist
				 * routine will remove us from our current
				 * position and then add us back in at a later
				 * position.
				 *
				 * Try again sooner rather than later if
				 * we were unable to lock the vnode.  Lock
				 * failure should not prevent us from doing
				 * the sync "soon".
				 *
				 * If we locked it yet arrive here, it's
				 * likely that lazy sync is in progress and
				 * so the vnode still has dirty metadata.
				 * syncdelay is mainly to get this vnode out
				 * of the way so we do not consider it again
				 * "soon" in this loop, so the delay time is
				 * not critical as long as it is not "soon".
				 * While write-back strategy is the file
				 * system's domain, we expect write-back to
				 * occur no later than syncdelay seconds
				 * into the future.
				 */
				delayx = synced ? syncdelay : lockdelay;
				oslot = vi->vi_synclist_slot;
				vn_syncer_add1(vp, delayx);
				nslot = vi->vi_synclist_slot;
				SDT_PROBE4(vfs, syncer, worklist,
				    vnode__update,
				    vp, delayx, oslot, nslot);
			}
		}

		endtime = time_second;

		SDT_PROBE2(vfs, syncer, sync, done,  starttime, endtime);

		/*
		 * If it has taken us less than a second to process the
		 * current work, then wait.  Otherwise start right over
		 * again.  We can still lose time if any single round
		 * takes more than two seconds, but it does not really
		 * matter as we are just trying to generally pace the
		 * filesystem activity.
		 */
		if (endtime == starttime) {
			kpause("syncer", false, hz, &syncer_data_lock);
		}
		mutex_exit(&syncer_data_lock);
	}
}

static void
sysctl_vfs_syncfs_setup(struct sysctllog **clog)
{
	const struct sysctlnode *rnode, *cnode;

	sysctl_createv(clog, 0, NULL, &rnode,
			CTLFLAG_PERMANENT,
			CTLTYPE_NODE, "sync",
			SYSCTL_DESCR("syncer options"),
			NULL, 0, NULL, 0,
			CTL_VFS, CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_QUAD, "delay",
			SYSCTL_DESCR("max time to delay syncing data"),
			NULL, 0, &syncdelay, 0,
			CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_QUAD, "filedelay",
			SYSCTL_DESCR("time to delay syncing files"),
			NULL, 0, &filedelay, 0,
			CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_QUAD, "dirdelay",
			SYSCTL_DESCR("time to delay syncing directories"),
			NULL, 0, &dirdelay, 0,
			CTL_CREATE, CTL_EOL);

	sysctl_createv(clog, 0, &rnode, &cnode,
			CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
			CTLTYPE_QUAD, "metadelay",
			SYSCTL_DESCR("time to delay syncing metadata"),
			NULL, 0, &metadelay, 0,
			CTL_CREATE, CTL_EOL);
}

/*
 * sysctl helper routine to return list of supported fstypes
 */
int
sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
{
	char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)];
	char *where = oldp;
	struct vfsops *v;
	size_t needed, left, slen;
	int error, first;

	if (newp != NULL)
		return (EPERM);
	if (namelen != 0)
		return (EINVAL);

	first = 1;
	error = 0;
	needed = 0;
	left = *oldlenp;

	sysctl_unlock();
	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (where == NULL)
			needed += strlen(v->vfs_name) + 1;
		else {
			memset(bf, 0, sizeof(bf));
			if (first) {
				strncpy(bf, v->vfs_name, sizeof(bf));
				first = 0;
			} else {
				bf[0] = ' ';
				strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1);
			}
			bf[sizeof(bf)-1] = '\0';
			slen = strlen(bf);
			if (left < slen + 1)
				break;
			v->vfs_refcount++;
			mutex_exit(&vfs_list_lock);
			/* +1 to copy out the trailing NUL byte */
			error = copyout(bf, where, slen + 1);
			mutex_enter(&vfs_list_lock);
			v->vfs_refcount--;
			if (error)
				break;
			where += slen;
			needed += slen;
			left -= slen;
		}
	}
	mutex_exit(&vfs_list_lock);
	sysctl_relock();
	*oldlenp = needed;
	return (error);
}
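
/*
 * From userland this surfaces as vfs.generic.fstypes, e.g. (output
 * illustrative; the list depends on the kernel configuration):
 *
 *	$ sysctl vfs.generic.fstypes
 *	vfs.generic.fstypes = ffs mfs nfs ...
 */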

int kinfo_vdebug = 1;
int kinfo_vgetfailed;

#define KINFO_VNODESLOP	10

/*
 * Dump vnode list (via sysctl).
 * Copyout address of vnode followed by vnode.
 */
int
sysctl_kern_vnode(SYSCTLFN_ARGS)
{
	char *where = oldp;
	size_t *sizep = oldlenp;
	struct mount *mp;
	vnode_t *vp, vbuf;
	mount_iterator_t *iter;
	struct vnode_iterator *marker;
	char *bp = where;
	char *ewhere;
	int error;

	if (namelen != 0)
		return (EOPNOTSUPP);
	if (newp != NULL)
		return (EPERM);

#define VPTRSZ	sizeof(vnode_t *)
#define VNODESZ	sizeof(vnode_t)
	if (where == NULL) {
		*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
		return (0);
	}
	ewhere = where + *sizep;

	sysctl_unlock();
	mountlist_iterator_init(&iter);
	while ((mp = mountlist_iterator_next(iter)) != NULL) {
		vfs_vnode_iterator_init(mp, &marker);
		while ((vp = vfs_vnode_iterator_next(marker, NULL, NULL))) {
			if (bp + VPTRSZ + VNODESZ > ewhere) {
				vrele(vp);
				vfs_vnode_iterator_destroy(marker);
				mountlist_iterator_destroy(iter);
				sysctl_relock();
				*sizep = bp - where;
				return (ENOMEM);
			}
			memcpy(&vbuf, vp, VNODESZ);
			if ((error = copyout(&vp, bp, VPTRSZ)) ||
			    (error = copyout(&vbuf, bp + VPTRSZ, VNODESZ))) {
				vrele(vp);
				vfs_vnode_iterator_destroy(marker);
				mountlist_iterator_destroy(iter);
				sysctl_relock();
				return (error);
			}
			vrele(vp);
			bp += VPTRSZ + VNODESZ;
		}
		vfs_vnode_iterator_destroy(marker);
	}
	mountlist_iterator_destroy(iter);
	sysctl_relock();

	*sizep = bp - where;
	return (0);
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(struct vattr *vap)
{

	memset(vap, 0, sizeof(*vap));

	vap->va_type = VNON;

	/*
	 * Assign individually so that it is safe even if size and
	 * sign of each member are varied.
	 */
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_size = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_atime.tv_sec =
	    vap->va_mtime.tv_sec =
	    vap->va_ctime.tv_sec =
	    vap->va_birthtime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec =
	    vap->va_mtime.tv_nsec =
	    vap->va_ctime.tv_nsec =
	    vap->va_birthtime.tv_nsec = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_bytes = VNOVAL;
}

/*
 * Vnode state to string.
 */
const char *
vstate_name(enum vnode_state state)
{

	switch (state) {
	case VS_ACTIVE:
		return "ACTIVE";
	case VS_MARKER:
		return "MARKER";
	case VS_LOADING:
		return "LOADING";
	case VS_LOADED:
		return "LOADED";
	case VS_BLOCKED:
		return "BLOCKED";
	case VS_RECLAIMING:
		return "RECLAIMING";
	case VS_RECLAIMED:
		return "RECLAIMED";
	default:
		return "ILLEGAL";
	}
}

/*
 * Print a description of a vnode (common part).
 */
static void
vprint_common(struct vnode *vp, const char *prefix,
    void (*pr)(const char *, ...) __printflike(1, 2))
{
	int n;
	char bf[96];
	const uint8_t *cp;
	vnode_impl_t *vip;
	const char * const vnode_tags[] = { VNODE_TAGS };
	const char * const vnode_types[] = { VNODE_TYPES };
	const char vnode_flagbits[] = VNODE_FLAGBITS;

#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
#define ARRAY_PRINT(idx, arr) \
    ((unsigned int)(idx) < ARRAY_SIZE(arr) ? (arr)[(idx)] : "UNKNOWN")

	vip = VNODE_TO_VIMPL(vp);

	snprintb(bf, sizeof(bf),
	    vnode_flagbits, vp->v_iflag | vp->v_vflag | vp->v_uflag);

	(*pr)("vnode %p flags %s\n", vp, bf);
	(*pr)("%stag %s(%d) type %s(%d) mount %p typedata %p\n", prefix,
	    ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag,
	    ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type,
	    vp->v_mount, vp->v_mountedhere);
	(*pr)("%susecount %d writecount %d holdcount %d\n", prefix,
	    vrefcnt(vp), vp->v_writecount, vp->v_holdcnt);
	(*pr)("%ssize %" PRIx64 " writesize %" PRIx64 " numoutput %d\n",
	    prefix, vp->v_size, vp->v_writesize, vp->v_numoutput);
	(*pr)("%sdata %p lock %p\n", prefix, vp->v_data, &vip->vi_lock);

	(*pr)("%sstate %s key(%p %zd)", prefix, vstate_name(vip->vi_state),
	    vip->vi_key.vk_mount, vip->vi_key.vk_key_len);
	n = vip->vi_key.vk_key_len;
	cp = vip->vi_key.vk_key;
	while (n-- > 0)
		(*pr)(" %02x", *cp++);
	(*pr)("\n");
	(*pr)("%slrulisthd %p\n", prefix, vip->vi_lrulisthd);

#undef ARRAY_PRINT
#undef ARRAY_SIZE
}

/*
 * Print out a description of a vnode.
 */
void
vprint(const char *label, struct vnode *vp)
{

	if (label != NULL)
		printf("%s: ", label);
	vprint_common(vp, "\t", printf);
	if (vp->v_data != NULL) {
		printf("\t");
		VOP_PRINT(vp);
	}
}

/*
 * Given a file system name, look up the vfsops for that
 * file system, or return NULL if file system isn't present
 * in the kernel.
 */
struct vfsops *
vfs_getopsbyname(const char *name)
{
	struct vfsops *v;

	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (strcmp(v->vfs_name, name) == 0)
			break;
	}
	if (v != NULL)
		v->vfs_refcount++;
	mutex_exit(&vfs_list_lock);

	return (v);
}

void
copy_statvfs_info(struct statvfs *sbp, const struct mount *mp)
{
	const struct statvfs *mbp;

	if (sbp == (mbp = &mp->mnt_stat))
		return;

	(void)memcpy(&sbp->f_fsidx, &mbp->f_fsidx, sizeof(sbp->f_fsidx));
	sbp->f_fsid = mbp->f_fsid;
	sbp->f_owner = mbp->f_owner;
	sbp->f_flag = mbp->f_flag;
	sbp->f_syncwrites = mbp->f_syncwrites;
	sbp->f_asyncwrites = mbp->f_asyncwrites;
	sbp->f_syncreads = mbp->f_syncreads;
	sbp->f_asyncreads = mbp->f_asyncreads;
	(void)memcpy(sbp->f_spare, mbp->f_spare, sizeof(mbp->f_spare));
	(void)memcpy(sbp->f_fstypename, mbp->f_fstypename,
	    sizeof(sbp->f_fstypename));
	(void)memcpy(sbp->f_mntonname, mbp->f_mntonname,
	    sizeof(sbp->f_mntonname));
	(void)memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname,
	    sizeof(sbp->f_mntfromname));
	(void)memcpy(sbp->f_mntfromlabel, mp->mnt_stat.f_mntfromlabel,
	    sizeof(sbp->f_mntfromlabel));
	sbp->f_namemax = mbp->f_namemax;
}

int
set_statvfs_info(const char *onp, int ukon, const char *fromp, int ukfrom,
    const char *vfsname, struct mount *mp, struct lwp *l)
{
	int error;
	size_t size;
	struct statvfs *sfs = &mp->mnt_stat;
	int (*fun)(const void *, void *, size_t, size_t *);

	(void)strlcpy(mp->mnt_stat.f_fstypename, vfsname,
	    sizeof(mp->mnt_stat.f_fstypename));

	if (onp) {
		struct cwdinfo *cwdi = l->l_proc->p_cwdi;
		fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr;
		if (cwdi->cwdi_rdir != NULL) {
			size_t len;
			char *bp;
			char *path = PNBUF_GET();

			bp = path + MAXPATHLEN;
			*--bp = '\0';
			rw_enter(&cwdi->cwdi_lock, RW_READER);
			error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp,
			    path, MAXPATHLEN / 2, 0, l);
			rw_exit(&cwdi->cwdi_lock);
			if (error) {
				PNBUF_PUT(path);
				return error;
			}

			len = strlen(bp);
			if (len > sizeof(sfs->f_mntonname) - 1)
				len = sizeof(sfs->f_mntonname) - 1;
			(void)strncpy(sfs->f_mntonname, bp, len);
			PNBUF_PUT(path);

			if (len < sizeof(sfs->f_mntonname) - 1) {
				error = (*fun)(onp, &sfs->f_mntonname[len],
				    sizeof(sfs->f_mntonname) - len - 1, &size);
				if (error)
					return error;
				size += len;
			} else {
				size = len;
			}
		} else {
			error = (*fun)(onp, &sfs->f_mntonname,
			    sizeof(sfs->f_mntonname) - 1, &size);
			if (error)
				return error;
		}
		(void)memset(sfs->f_mntonname + size, 0,
		    sizeof(sfs->f_mntonname) - size);
	}

	if (fromp) {
		fun = (ukfrom == UIO_SYSSPACE) ? copystr : copyinstr;
		error = (*fun)(fromp, sfs->f_mntfromname,
		    sizeof(sfs->f_mntfromname) - 1, &size);
		if (error)
			return error;
		(void)memset(sfs->f_mntfromname + size, 0,
		    sizeof(sfs->f_mntfromname) - size);
	}
	return 0;
}
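
/*
 * Usage sketch: a file system's mount entry point records its names
 * roughly like this (args->fspec stands in for the fs-specific mount
 * argument holding the "from" name):
 *
 *	error = set_statvfs_info(path, UIO_USERSPACE, args->fspec,
 *	    UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l);
 */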

/*
 * Knob to control the precision of file timestamps:
 *
 *   0 = seconds only; nanoseconds zeroed.
 *   1 = seconds and nanoseconds, accurate within 1/HZ.
 *   2 = seconds and nanoseconds, truncated to microseconds.
 * >=3 = seconds and nanoseconds, maximum precision.
 */
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };

int vfs_timestamp_precision __read_mostly = TSP_NSEC;

void
vfs_timestamp(struct timespec *tsp)
{
	struct timeval tv;

	switch (vfs_timestamp_precision) {
	case TSP_SEC:
		tsp->tv_sec = time_second;
		tsp->tv_nsec = 0;
		break;
	case TSP_HZ:
		getnanotime(tsp);
		break;
	case TSP_USEC:
		microtime(&tv);
		TIMEVAL_TO_TIMESPEC(&tv, tsp);
		break;
	case TSP_NSEC:
	default:
		nanotime(tsp);
		break;
	}
}
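
/*
 * Usage sketch: a file system stamping a modification time fetches the
 * current time at the configured precision (ip->i_mtime stands in for
 * the file system's own inode field):
 *
 *	struct timespec ts;
 *
 *	vfs_timestamp(&ts);
 *	ip->i_mtime = ts;
 */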

/*
 * The purpose of this routine is to remove granularity from accmode_t,
 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
 * VADMIN and VAPPEND.
 *
 * If it returns 0, the caller is supposed to continue with the usual
 * access checks using 'accmode' as modified by this routine.  If it
 * returns a nonzero value, the caller is supposed to return that value
 * as errno.
 *
 * Note that after this routine runs, accmode may be zero.
 */
int
vfs_unixify_accmode(accmode_t *accmode)
{
	/*
	 * There is no way to specify an explicit "deny" rule using
	 * file mode or POSIX.1e ACLs.
	 */
	if (*accmode & VEXPLICIT_DENY) {
		*accmode = 0;
		return (0);
	}

	/*
	 * None of these can be translated into usual access bits.
	 * Also, the common case for NFSv4 ACLs is to not contain
	 * either of these bits.  The caller should check for VWRITE
	 * on the containing directory instead.
	 */
	if (*accmode & (VDELETE_CHILD | VDELETE))
		return (EPERM);

	if (*accmode & VADMIN_PERMS) {
		*accmode &= ~VADMIN_PERMS;
		*accmode |= VADMIN;
	}

	/*
	 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
	 * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
	 */
	*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);

	return (0);
}
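
/*
 * Caller pattern, sketched: reduce an NFSv4-style accmode first, then
 * fall through to the normal permission check with whatever bits
 * remain:
 *
 *	error = vfs_unixify_accmode(&accmode);
 *	if (error != 0)
 *		return error;
 *	if (accmode == 0)
 *		return 0;
 *	return VOP_ACCESS(vp, accmode, cred);
 */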

time_t	rootfstime;			/* recorded root fs time, if known */
void
setrootfstime(time_t t)
{
	rootfstime = t;
}

static const uint8_t vttodt_tab[] = {
	[VNON]	=	DT_UNKNOWN,
	[VREG]	=	DT_REG,
	[VDIR]	=	DT_DIR,
	[VBLK]	=	DT_BLK,
	[VCHR]	=	DT_CHR,
	[VLNK]	=	DT_LNK,
	[VSOCK]	=	DT_SOCK,
	[VFIFO]	=	DT_FIFO,
	[VBAD]	=	DT_UNKNOWN
};

uint8_t
vtype2dt(enum vtype vt)
{

	CTASSERT(VBAD == __arraycount(vttodt_tab) - 1);
	return vttodt_tab[vt];
}
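
/*
 * Usage sketch: readdir implementations fill in struct dirent's d_type
 * from a vnode type they already know, e.g.:
 *
 *	dp->d_type = vtype2dt(vp->v_type);
 */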

int
VFS_MOUNT(struct mount *mp, const char *a, void *b, size_t *c)
{
	int mpsafe = mp->mnt_iflag & IMNT_MPSAFE;
	int error;

	/*
	 * Note: The first time through, the vfs_mount function may set
	 * IMNT_MPSAFE, so we have to cache it on entry in order to
	 * avoid leaking a kernel lock.
	 *
	 * XXX Maybe the MPSAFE bit should be set in struct vfsops and
	 * not in struct mount.
	 */
	if (!mpsafe) {
		KERNEL_LOCK(1, NULL);
	}
	error = (*(mp->mnt_op->vfs_mount))(mp, a, b, c);
	if (!mpsafe) {
		KERNEL_UNLOCK_ONE(NULL);
	}

	return error;
}

int
VFS_START(struct mount *mp, int a)
{
	int error;

	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_LOCK(1, NULL);
	}
	error = (*(mp->mnt_op->vfs_start))(mp, a);
	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_UNLOCK_ONE(NULL);
	}

	return error;
}

int
VFS_UNMOUNT(struct mount *mp, int a)
{
	int error;

	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_LOCK(1, NULL);
	}
	error = (*(mp->mnt_op->vfs_unmount))(mp, a);
	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_UNLOCK_ONE(NULL);
	}

	return error;
}

int
VFS_ROOT(struct mount *mp, int lktype, struct vnode **a)
{
	int error;

	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_LOCK(1, NULL);
	}
	error = (*(mp->mnt_op->vfs_root))(mp, lktype, a);
	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_UNLOCK_ONE(NULL);
	}

	return error;
}

int
VFS_QUOTACTL(struct mount *mp, struct quotactl_args *args)
{
	int error;

	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_LOCK(1, NULL);
	}
	error = (*(mp->mnt_op->vfs_quotactl))(mp, args);
	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_UNLOCK_ONE(NULL);
	}

	return error;
}

int
VFS_STATVFS(struct mount *mp, struct statvfs *a)
{
	int error;

	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_LOCK(1, NULL);
	}
	error = (*(mp->mnt_op->vfs_statvfs))(mp, a);
	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_UNLOCK_ONE(NULL);
	}

	return error;
}

int
VFS_SYNC(struct mount *mp, int a, struct kauth_cred *b)
{
	int error;

	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_LOCK(1, NULL);
	}
	error = (*(mp->mnt_op->vfs_sync))(mp, a, b);
	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_UNLOCK_ONE(NULL);
	}

	return error;
}

int
VFS_FHTOVP(struct mount *mp, struct fid *a, int b, struct vnode **c)
{
	int error;

	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_LOCK(1, NULL);
	}
	error = (*(mp->mnt_op->vfs_fhtovp))(mp, a, b, c);
	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_UNLOCK_ONE(NULL);
	}

	return error;
}

int
VFS_VPTOFH(struct vnode *vp, struct fid *a, size_t *b)
{
	int error;

	if ((vp->v_vflag & VV_MPSAFE) == 0) {
		KERNEL_LOCK(1, NULL);
	}
	error = (*(vp->v_mount->mnt_op->vfs_vptofh))(vp, a, b);
	if ((vp->v_vflag & VV_MPSAFE) == 0) {
		KERNEL_UNLOCK_ONE(NULL);
	}

	return error;
}

int
VFS_SNAPSHOT(struct mount *mp, struct vnode *a, struct timespec *b)
{
	int error;

	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_LOCK(1, NULL);
	}
	error = (*(mp->mnt_op->vfs_snapshot))(mp, a, b);
	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_UNLOCK_ONE(NULL);
	}

	return error;
}

int
VFS_EXTATTRCTL(struct mount *mp, int a, struct vnode *b, int c, const char *d)
{
	int error;

	KERNEL_LOCK(1, NULL);		/* XXXSMP check ffs */
	error = (*(mp->mnt_op->vfs_extattrctl))(mp, a, b, c, d);
	KERNEL_UNLOCK_ONE(NULL);	/* XXX */

	return error;
}

int
VFS_SUSPENDCTL(struct mount *mp, int a)
{
	int error;

	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_LOCK(1, NULL);
	}
	error = (*(mp->mnt_op->vfs_suspendctl))(mp, a);
	if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
		KERNEL_UNLOCK_ONE(NULL);
	}

	return error;
}

#if defined(DDB) || defined(DEBUGPRINT)
static const char buf_flagbits[] = BUF_FLAGBITS;

void
vfs_buf_print(struct buf *bp, int full, void (*pr)(const char *, ...))
{
	char bf[1024];

	(*pr)("  vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" rawblkno 0x%"
	    PRIx64 " dev 0x%x\n",
	    bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_rawblkno, bp->b_dev);

	snprintb(bf, sizeof(bf),
	    buf_flagbits, bp->b_flags | bp->b_oflags | bp->b_cflags);
	(*pr)("  error %d flags %s\n", bp->b_error, bf);

	(*pr)("  bufsize 0x%lx bcount 0x%lx resid 0x%lx\n",
		  bp->b_bufsize, bp->b_bcount, bp->b_resid);
	(*pr)("  data %p saveaddr %p\n",
		  bp->b_data, bp->b_saveaddr);
	(*pr)("  iodone %p objlock %p\n", bp->b_iodone, bp->b_objlock);
}

void
vfs_vnode_print(struct vnode *vp, int full, void (*pr)(const char *, ...))
{

	uvm_object_printit(&vp->v_uobj, full, pr);
	(*pr)("\n");
	vprint_common(vp, "", pr);
	if (full) {
		struct buf *bp;

		(*pr)("clean bufs:\n");
		LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) {
			(*pr)(" bp %p\n", bp);
			vfs_buf_print(bp, full, pr);
		}

		(*pr)("dirty bufs:\n");
		LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
			(*pr)(" bp %p\n", bp);
			vfs_buf_print(bp, full, pr);
		}
	}
}

void
vfs_vnode_lock_print(void *vlock, int full, void (*pr)(const char *, ...))
{
	struct mount *mp;
	vnode_impl_t *vip;

	for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp)) {
		TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) {
			if (&vip->vi_lock == vlock ||
			    VIMPL_TO_VNODE(vip)->v_interlock == vlock)
				vfs_vnode_print(VIMPL_TO_VNODE(vip), full, pr);
		}
	}
}

void
vfs_mount_print_all(int full, void (*pr)(const char *, ...))
{
	struct mount *mp;

	for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp))
		vfs_mount_print(mp, full, pr);
}

void
vfs_mount_print(struct mount *mp, int full, void (*pr)(const char *, ...))
{
	char sbuf[256];

	(*pr)("vnodecovered = %p data = %p\n",
			mp->mnt_vnodecovered, mp->mnt_data);

	(*pr)("fs_bshift %d dev_bshift = %d\n",
			mp->mnt_fs_bshift, mp->mnt_dev_bshift);

	snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_flag);
	(*pr)("flag = %s\n", sbuf);

	snprintb(sbuf, sizeof(sbuf), __IMNT_FLAG_BITS, mp->mnt_iflag);
	(*pr)("iflag = %s\n", sbuf);

	(*pr)("refcnt = %d updating @ %p\n", mp->mnt_refcnt, mp->mnt_updating);

	(*pr)("statvfs cache:\n");
	(*pr)("\tbsize = %lu\n", mp->mnt_stat.f_bsize);
	(*pr)("\tfrsize = %lu\n", mp->mnt_stat.f_frsize);
	(*pr)("\tiosize = %lu\n", mp->mnt_stat.f_iosize);

	(*pr)("\tblocks = %"PRIu64"\n", mp->mnt_stat.f_blocks);
	(*pr)("\tbfree = %"PRIu64"\n", mp->mnt_stat.f_bfree);
	(*pr)("\tbavail = %"PRIu64"\n", mp->mnt_stat.f_bavail);
	(*pr)("\tbresvd = %"PRIu64"\n", mp->mnt_stat.f_bresvd);

	(*pr)("\tfiles = %"PRIu64"\n", mp->mnt_stat.f_files);
	(*pr)("\tffree = %"PRIu64"\n", mp->mnt_stat.f_ffree);
	(*pr)("\tfavail = %"PRIu64"\n", mp->mnt_stat.f_favail);
	(*pr)("\tfresvd = %"PRIu64"\n", mp->mnt_stat.f_fresvd);

	(*pr)("\tf_fsidx = { 0x%"PRIx32", 0x%"PRIx32" }\n",
			mp->mnt_stat.f_fsidx.__fsid_val[0],
			mp->mnt_stat.f_fsidx.__fsid_val[1]);

	(*pr)("\towner = %"PRIu32"\n", mp->mnt_stat.f_owner);
	(*pr)("\tnamemax = %lu\n", mp->mnt_stat.f_namemax);

	snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_stat.f_flag);

	(*pr)("\tflag = %s\n", sbuf);
	(*pr)("\tsyncwrites = %" PRIu64 "\n", mp->mnt_stat.f_syncwrites);
	(*pr)("\tasyncwrites = %" PRIu64 "\n", mp->mnt_stat.f_asyncwrites);
	(*pr)("\tsyncreads = %" PRIu64 "\n", mp->mnt_stat.f_syncreads);
	(*pr)("\tasyncreads = %" PRIu64 "\n", mp->mnt_stat.f_asyncreads);
	(*pr)("\tfstypename = %s\n", mp->mnt_stat.f_fstypename);
	(*pr)("\tmntonname = %s\n", mp->mnt_stat.f_mntonname);
	(*pr)("\tmntfromname = %s\n", mp->mnt_stat.f_mntfromname);

	{
		int cnt = 0;
		vnode_t *vp;
		vnode_impl_t *vip;
		(*pr)("locked vnodes =");
		TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) {
			vp = VIMPL_TO_VNODE(vip);
			if (VOP_ISLOCKED(vp)) {
				if ((++cnt % 6) == 0) {
					(*pr)(" %p,\n\t", vp);
				} else {
					(*pr)(" %p,", vp);
				}
			}
		}
		(*pr)("\n");
	}

	if (full) {
		int cnt = 0;
		vnode_t *vp;
		vnode_impl_t *vip;
		(*pr)("all vnodes =");
		TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) {
			vp = VIMPL_TO_VNODE(vip);
			if (!TAILQ_NEXT(vip, vi_mntvnodes)) {
				(*pr)(" %p", vp);
			} else if ((++cnt % 6) == 0) {
				(*pr)(" %p,\n\t", vp);
			} else {
				(*pr)(" %p,", vp);
			}
		}
		(*pr)("\n");
	}
}

/*
 * List all of the locked vnodes in the system.
 */
void printlockedvnodes(void);

void
printlockedvnodes(void)
{
	struct mount *mp;
	vnode_t *vp;
	vnode_impl_t *vip;

	printf("Locked vnodes\n");
	for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp)) {
		TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) {
			vp = VIMPL_TO_VNODE(vip);
			if (VOP_ISLOCKED(vp))
				vprint(NULL, vp);
		}
	}
}

#endif /* DDB || DEBUGPRINT */