/*	$NetBSD: vfs_mount.c,v 1.105 2024/04/19 00:45:41 riastradh Exp $	*/

/*-
 * Copyright (c) 1997-2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_mount.c,v 1.105 2024/04/19 00:45:41 riastradh Exp $");

#include "veriexec.h"

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/device.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/fstrans.h>
#include <sys/namei.h>
#include <sys/extattr.h>
#include <sys/verified_exec.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vfs_syscalls.h>
#include <sys/vnode_impl.h>

#include <miscfs/deadfs/deadfs.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>

#include <uvm/uvm_swap.h>

enum mountlist_type {
	ME_MOUNT,
	ME_MARKER
};
struct mountlist_entry {
	TAILQ_ENTRY(mountlist_entry) me_list;	/* Mount list. */
	struct mount *me_mount;			/* Mount if ME_MOUNT, else the
						   currently busied mount. */
	enum mountlist_type me_type;		/* Mount or marker. */
};
struct mount_iterator {
	struct mountlist_entry mi_entry;
};

static struct vnode *vfs_vnode_iterator_next1(struct vnode_iterator *,
    bool (*)(void *, struct vnode *), void *, bool);

/* Root filesystem. */
vnode_t *			rootvnode;

/* Mounted filesystem list. */
static TAILQ_HEAD(mountlist, mountlist_entry) mountlist;
static kmutex_t			mountlist_lock __cacheline_aligned;
int vnode_offset_next_by_lru	/* XXX: ugly hack for pstat.c */
    = offsetof(vnode_impl_t, vi_lrulist.tqe_next);

kmutex_t			vfs_list_lock __cacheline_aligned;

static specificdata_domain_t	mount_specificdata_domain;
static kmutex_t			mntid_lock;

static kmutex_t			mountgen_lock __cacheline_aligned;
static uint64_t			mountgen;

void
vfs_mount_sysinit(void)
{

	TAILQ_INIT(&mountlist);
	mutex_init(&mountlist_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&vfs_list_lock, MUTEX_DEFAULT, IPL_NONE);

	mount_specificdata_domain = specificdata_domain_create();
	mutex_init(&mntid_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&mountgen_lock, MUTEX_DEFAULT, IPL_NONE);
	mountgen = 0;
}

struct mount *
vfs_mountalloc(struct vfsops *vfsops, vnode_t *vp)
{
	struct mount *mp;
	int error __diagused;

	mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
	mp->mnt_op = vfsops;
	mp->mnt_refcnt = 1;
	TAILQ_INIT(&mp->mnt_vnodelist);
	mp->mnt_renamelock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
	mp->mnt_vnodelock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
	mp->mnt_updating = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
	mp->mnt_vnodecovered = vp;
	mount_initspecific(mp);

	error = fstrans_mount(mp);
	KASSERT(error == 0);

	mutex_enter(&mountgen_lock);
	mp->mnt_gen = mountgen++;
	mutex_exit(&mountgen_lock);

	return mp;
}

/*
 * vfs_rootmountalloc: look up a filesystem type, and if found allocate and
 * initialize a mount structure for it.
 *
 * Devname is usually updated by mount(8) after booting.
 */
int
vfs_rootmountalloc(const char *fstypename, const char *devname,
    struct mount **mpp)
{
	struct vfsops *vfsp = NULL;
	struct mount *mp;
	int error __diagused;

	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(vfsp, &vfs_list, vfs_list)
		if (!strncmp(vfsp->vfs_name, fstypename,
		    sizeof(mp->mnt_stat.f_fstypename)))
			break;
	if (vfsp == NULL) {
		mutex_exit(&vfs_list_lock);
		return (ENODEV);
	}
	vfsp->vfs_refcount++;
	mutex_exit(&vfs_list_lock);

	if ((mp = vfs_mountalloc(vfsp, NULL)) == NULL)
		return ENOMEM;
	error = vfs_busy(mp);
	KASSERT(error == 0);
	mp->mnt_flag = MNT_RDONLY;
	(void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
	    sizeof(mp->mnt_stat.f_fstypename));
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = '\0';
	mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
	    '\0';
	(void)copystr(devname, mp->mnt_stat.f_mntfromname,
	    sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
	*mpp = mp;
	return 0;
}

/*
 * vfs_getnewfsid: get a new unique fsid.
 */
void
vfs_getnewfsid(struct mount *mp)
{
	static u_short xxxfs_mntid;
	struct mountlist_entry *me;
	fsid_t tfsid;
	int mtype;

	mutex_enter(&mntid_lock);
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	mtype = makefstype(mp->mnt_op->vfs_name);
	tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid);
	tfsid.__fsid_val[1] = mtype;
	/* Always increment to not return the same fsid to parallel mounts. */
	xxxfs_mntid++;

	/*
	 * Directly walk mountlist to prevent deadlock through
	 * mountlist_iterator_next() -> vfs_busy().
	 */
	mutex_enter(&mountlist_lock);
	for (me = TAILQ_FIRST(&mountlist); me != TAILQ_END(&mountlist); ) {
		if (me->me_type == ME_MOUNT &&
		    me->me_mount->mnt_stat.f_fsidx.__fsid_val[0] ==
		    tfsid.__fsid_val[0] &&
		    me->me_mount->mnt_stat.f_fsidx.__fsid_val[1] ==
		    tfsid.__fsid_val[1]) {
			tfsid.__fsid_val[0]++;
			xxxfs_mntid++;
			me = TAILQ_FIRST(&mountlist);
		} else {
			me = TAILQ_NEXT(me, me_list);
		}
	}
	mutex_exit(&mountlist_lock);

	mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
	mp->mnt_stat.f_fsidx.__fsid_val[1] = tfsid.__fsid_val[1];
	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
	mutex_exit(&mntid_lock);
}

/*
 * Lookup a mount point by filesystem identifier.
 *
 * XXX Needs to add a reference to the mount point.
 */
struct mount *
vfs_getvfs(fsid_t *fsid)
{
	mount_iterator_t *iter;
	struct mount *mp;

	mountlist_iterator_init(&iter);
	while ((mp = mountlist_iterator_next(iter)) != NULL) {
		if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
		    mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
			mountlist_iterator_destroy(iter);
			return mp;
		}
	}
	mountlist_iterator_destroy(iter);
	return NULL;
}

/*
 * Take a reference to a mount structure.
 */
void
vfs_ref(struct mount *mp)
{

	KASSERT(mp->mnt_refcnt > 0 || mutex_owned(&mountlist_lock));

	atomic_inc_uint(&mp->mnt_refcnt);
}

/*
 * Drop a reference to a mount structure, freeing if the last reference.
 */
void
vfs_rele(struct mount *mp)
{

	membar_release();
	if (__predict_true((int)atomic_dec_uint_nv(&mp->mnt_refcnt) > 0)) {
		return;
	}
	membar_acquire();

	/*
	 * Nothing else has visibility of the mount: we can now
	 * free the data structures.
	 */
	KASSERT(mp->mnt_refcnt == 0);
	specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
	mutex_obj_free(mp->mnt_updating);
	mutex_obj_free(mp->mnt_renamelock);
	mutex_obj_free(mp->mnt_vnodelock);
	if (mp->mnt_op != NULL) {
		vfs_delref(mp->mnt_op);
	}
	fstrans_unmount(mp);
	/*
	 * The final free of mp is done from fstrans_mount_dtor().
	 *
	 * This prevents the memory from being reused as a mount before
	 * fstrans has released all references to it.
	 */
}

/*
 * Mark a mount point as busy, and gain a new reference to it.  Used to
 * prevent the file system from being unmounted during critical sections.
 *
 * vfs_busy can be called multiple times and by multiple threads
 * and must be accompanied by the same number of vfs_unbusy calls.
 *
 * => The caller must hold a pre-existing reference to the mount.
 * => Will fail if the file system is being unmounted, or is unmounted.
 */
static inline int
_vfs_busy(struct mount *mp, bool wait)
{

	KASSERT(mp->mnt_refcnt > 0);

	if (wait) {
		fstrans_start(mp);
	} else {
		if (fstrans_start_nowait(mp))
			return EBUSY;
	}
	if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) {
		fstrans_done(mp);
		return ENOENT;
	}
	vfs_ref(mp);
	return 0;
}

int
vfs_busy(struct mount *mp)
{

	return _vfs_busy(mp, true);
}

int
vfs_trybusy(struct mount *mp)
{

	return _vfs_busy(mp, false);
}

/*
 * Unbusy a busy filesystem.
 *
 * Every successful vfs_busy() call must be undone by a vfs_unbusy() call.
 */
void
vfs_unbusy(struct mount *mp)
{

	KASSERT(mp->mnt_refcnt > 0);

	fstrans_done(mp);
	vfs_rele(mp);
}

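/*
 * Illustrative sketch (not part of this file): the busy/unbusy discipline
 * described above, as applied by a caller that already holds a reference
 * to "mp".  Error handling is reduced to the minimum.
 *
 *	error = vfs_busy(mp);
 *	if (error == 0) {
 *		// ... operate on the file system; it cannot finish
 *		// unmounting until the matching vfs_unbusy() ...
 *		vfs_unbusy(mp);	// drops the reference vfs_busy() took
 *	}
 */
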
/*
 * Change a file system's lower mount.
 * Both the current and the new lower mount may be NULL.  The caller
 * guarantees exclusive access to the mount and holds a pre-existing
 * reference to the new lower mount.
 */
int
vfs_set_lowermount(struct mount *mp, struct mount *lowermp)
{
	struct mount *oldlowermp;
	int error;

#ifdef DEBUG
	/*
	 * Limit the depth of file system stack so kernel sanitizers
	 * may stress mount/unmount without exhausting the kernel stack.
	 */
	int depth;
	struct mount *mp2;

	for (depth = 0, mp2 = lowermp; mp2; depth++, mp2 = mp2->mnt_lower) {
		if (depth == 23)
			return EINVAL;
	}
#endif

	if (lowermp) {
		if (lowermp == dead_rootmount)
			return ENOENT;
		error = vfs_busy(lowermp);
		if (error)
			return error;
		vfs_ref(lowermp);
	}

	oldlowermp = mp->mnt_lower;
	mp->mnt_lower = lowermp;

	if (lowermp)
		vfs_unbusy(lowermp);

	if (oldlowermp)
		vfs_rele(oldlowermp);

	return 0;
}

struct vnode_iterator {
	vnode_impl_t vi_vnode;
};

void
vfs_vnode_iterator_init(struct mount *mp, struct vnode_iterator **vnip)
{
	vnode_t *vp;
	vnode_impl_t *vip;

	vp = vnalloc_marker(mp);
	vip = VNODE_TO_VIMPL(vp);

	mutex_enter(mp->mnt_vnodelock);
	TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vip, vi_mntvnodes);
	vp->v_usecount = 1;
	mutex_exit(mp->mnt_vnodelock);

	*vnip = (struct vnode_iterator *)vip;
}

void
vfs_vnode_iterator_destroy(struct vnode_iterator *vni)
{
	vnode_impl_t *mvip = &vni->vi_vnode;
	vnode_t *mvp = VIMPL_TO_VNODE(mvip);
	kmutex_t *lock;

	KASSERT(vnis_marker(mvp));
	if (vrefcnt(mvp) != 0) {
		lock = mvp->v_mount->mnt_vnodelock;
		mutex_enter(lock);
		TAILQ_REMOVE(&mvp->v_mount->mnt_vnodelist, mvip, vi_mntvnodes);
		mvp->v_usecount = 0;
		mutex_exit(lock);
	}
	vnfree_marker(mvp);
}

static struct vnode *
vfs_vnode_iterator_next1(struct vnode_iterator *vni,
    bool (*f)(void *, struct vnode *), void *cl, bool do_wait)
{
	vnode_impl_t *mvip = &vni->vi_vnode;
	struct mount *mp = VIMPL_TO_VNODE(mvip)->v_mount;
	vnode_t *vp;
	vnode_impl_t *vip;
	kmutex_t *lock;
	int error;

	KASSERT(vnis_marker(VIMPL_TO_VNODE(mvip)));

	lock = mp->mnt_vnodelock;
	do {
		mutex_enter(lock);
		vip = TAILQ_NEXT(mvip, vi_mntvnodes);
		TAILQ_REMOVE(&mp->mnt_vnodelist, mvip, vi_mntvnodes);
		VIMPL_TO_VNODE(mvip)->v_usecount = 0;
again:
		if (vip == NULL) {
			mutex_exit(lock);
			return NULL;
		}
		vp = VIMPL_TO_VNODE(vip);
		KASSERT(vp != NULL);
		mutex_enter(vp->v_interlock);
		if (vnis_marker(vp) ||
		    vdead_check(vp, (do_wait ? 0 : VDEAD_NOWAIT)) ||
		    (f && !(*f)(cl, vp))) {
			mutex_exit(vp->v_interlock);
			vip = TAILQ_NEXT(vip, vi_mntvnodes);
			goto again;
		}

		TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vip, mvip, vi_mntvnodes);
		VIMPL_TO_VNODE(mvip)->v_usecount = 1;
		mutex_exit(lock);
		error = vcache_vget(vp);
		KASSERT(error == 0 || error == ENOENT);
	} while (error != 0);

	return vp;
}

struct vnode *
vfs_vnode_iterator_next(struct vnode_iterator *vni,
    bool (*f)(void *, struct vnode *), void *cl)
{

	return vfs_vnode_iterator_next1(vni, f, cl, false);
}

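/*
 * Illustrative sketch (not part of this file): the usual way the vnode
 * iterator above is consumed.  Each vnode returned has been referenced
 * via vcache_vget() and must be released by the caller, here with vrele().
 *
 *	struct vnode_iterator *marker;
 *	vnode_t *vp;
 *
 *	vfs_vnode_iterator_init(mp, &marker);
 *	while ((vp = vfs_vnode_iterator_next(marker, NULL, NULL)) != NULL) {
 *		// ... examine or flush vp ...
 *		vrele(vp);
 *	}
 *	vfs_vnode_iterator_destroy(marker);
 */
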
/*
 * Move a vnode from one mount queue to another.
 */
void
vfs_insmntque(vnode_t *vp, struct mount *mp)
{
	vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
	struct mount *omp;
	kmutex_t *lock;

	KASSERT(mp == NULL || (mp->mnt_iflag & IMNT_UNMOUNT) == 0 ||
	    vp->v_tag == VT_VFS);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if ((omp = vp->v_mount) != NULL) {
		lock = omp->mnt_vnodelock;
		mutex_enter(lock);
		TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vip, vi_mntvnodes);
		mutex_exit(lock);
	}

	/*
	 * Insert into list of vnodes for the new mount point, if
	 * available.  The caller must take a reference on the mount
	 * structure and donate to the vnode.
	 */
	if ((vp->v_mount = mp) != NULL) {
		lock = mp->mnt_vnodelock;
		mutex_enter(lock);
		TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vip, vi_mntvnodes);
		mutex_exit(lock);
	}

	if (omp != NULL) {
		/* Release reference to old mount. */
		vfs_rele(omp);
	}
}

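/*
 * Illustrative sketch (not part of this file): moving a vnode onto a mount
 * donates a mount reference, as the comment above notes, so a caller that
 * needs to keep its own reference takes a new one first.
 *
 *	vfs_ref(mp);			// reference donated to the vnode
 *	vfs_insmntque(vp, mp);		// vp is now on mp's vnode list
 */
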
/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If FORCECLOSE is not specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If FORCECLOSE is specified, detach any active vnodes
 * that are found.
 *
 * If WRITECLOSE is set, only flush out regular file vnodes open for
 * writing.
 *
 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
 */
#ifdef DEBUG
int busyprt = 0;	/* print out busy vnodes */
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif

static vnode_t *
vflushnext(struct vnode_iterator *marker, int *when)
{
	if (getticks() > *when) {
		yield();
		*when = getticks() + hz / 10;
	}
	preempt_point();
	return vfs_vnode_iterator_next1(marker, NULL, NULL, true);
}

/*
 * Flush one vnode.  Referenced on entry, unreferenced on return.
 */
static int
vflush_one(vnode_t *vp, vnode_t *skipvp, int flags)
{
	int error;
	struct vattr vattr;

	if (vp == skipvp ||
	    ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM))) {
		vrele(vp);
		return 0;
	}
	/*
	 * If WRITECLOSE is set, only flush out regular file
	 * vnodes open for writing or open and unlinked.
	 */
	if ((flags & WRITECLOSE)) {
		if (vp->v_type != VREG) {
			vrele(vp);
			return 0;
		}
		error = vn_lock(vp, LK_EXCLUSIVE);
		if (error) {
			KASSERT(error == ENOENT);
			vrele(vp);
			return 0;
		}
		error = VOP_FSYNC(vp, curlwp->l_cred, FSYNC_WAIT, 0, 0);
		if (error == 0)
			error = VOP_GETATTR(vp, &vattr, curlwp->l_cred);
		VOP_UNLOCK(vp);
		if (error) {
			vrele(vp);
			return error;
		}
		if (vp->v_writecount == 0 && vattr.va_nlink > 0) {
			vrele(vp);
			return 0;
		}
	}
	/*
	 * First try to recycle the vnode.
	 */
	if (vrecycle(vp))
		return 0;
	/*
	 * If FORCECLOSE is set, forcibly close the vnode.
	 * For block or character devices, revert to an
	 * anonymous device.  For all other files, just
	 * kill them.
	 */
	if (flags & FORCECLOSE) {
		if (vrefcnt(vp) > 1 &&
		    (vp->v_type == VBLK || vp->v_type == VCHR))
			vcache_make_anon(vp);
		else
			vgone(vp);
		return 0;
	}
	vrele(vp);
	return EBUSY;
}

int
vflush(struct mount *mp, vnode_t *skipvp, int flags)
{
	vnode_t *vp;
	struct vnode_iterator *marker;
	int busy, error, when, retries = 2;

	do {
		busy = error = when = 0;

		/*
		 * First, flush out any vnode references from the
		 * deferred vrele list.
		 */
		vrele_flush(mp);

		vfs_vnode_iterator_init(mp, &marker);

		while ((vp = vflushnext(marker, &when)) != NULL) {
			error = vflush_one(vp, skipvp, flags);
			if (error == EBUSY) {
				error = 0;
				busy++;
#ifdef DEBUG
				if (busyprt && retries == 0)
					vprint("vflush: busy vnode", vp);
#endif
			} else if (error != 0) {
				break;
			}
		}

		vfs_vnode_iterator_destroy(marker);
	} while (error == 0 && busy > 0 && retries-- > 0);

	if (error)
		return error;
	if (busy)
		return EBUSY;
	return 0;
}

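/*
 * Illustrative sketch (not part of this file): a file system's unmount
 * routine typically flushes its vnodes with vflush(), forcing the issue
 * only when MNT_FORCE was requested.  Details vary per file system.
 *
 *	int fflags = (mntflags & MNT_FORCE) ? FORCECLOSE : 0;
 *
 *	error = vflush(mp, NULLVP, fflags);
 *	if (error != 0)
 *		return error;	// e.g. EBUSY: active vnodes remain
 */
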
/*
 * Mount a file system.
 */

/*
 * Scan all active processes to see if any of them have a current or root
 * directory onto which the new filesystem has just been mounted.  If so,
 * replace them with the new mount point.
 */
static void
mount_checkdirs(vnode_t *olddp)
{
	vnode_t *newdp, *rele1, *rele2;
	struct cwdinfo *cwdi;
	struct proc *p;
	bool retry;

	if (vrefcnt(olddp) == 1) {
		return;
	}
	if (VFS_ROOT(olddp->v_mountedhere, LK_EXCLUSIVE, &newdp))
		panic("mount: lost mount");

	do {
		retry = false;
		mutex_enter(&proc_lock);
		PROCLIST_FOREACH(p, &allproc) {
			if ((cwdi = p->p_cwdi) == NULL)
				continue;
			/*
			 * Cannot change to the old directory any more,
			 * so even if we see a stale value it is not a
			 * problem.
			 */
			if (cwdi->cwdi_cdir != olddp &&
			    cwdi->cwdi_rdir != olddp)
				continue;
			retry = true;
			rele1 = NULL;
			rele2 = NULL;
			atomic_inc_uint(&cwdi->cwdi_refcnt);
			mutex_exit(&proc_lock);
			rw_enter(&cwdi->cwdi_lock, RW_WRITER);
			if (cwdi->cwdi_cdir == olddp) {
				rele1 = cwdi->cwdi_cdir;
				vref(newdp);
				cwdi->cwdi_cdir = newdp;
			}
			if (cwdi->cwdi_rdir == olddp) {
				rele2 = cwdi->cwdi_rdir;
				vref(newdp);
				cwdi->cwdi_rdir = newdp;
			}
			rw_exit(&cwdi->cwdi_lock);
			cwdfree(cwdi);
			if (rele1 != NULL)
				vrele(rele1);
			if (rele2 != NULL)
				vrele(rele2);
			mutex_enter(&proc_lock);
			break;
		}
		mutex_exit(&proc_lock);
	} while (retry);

	if (rootvnode == olddp) {
		vrele(rootvnode);
		vref(newdp);
		rootvnode = newdp;
	}
	vput(newdp);
}

/*
 * Start extended attributes
 */
static int
start_extattr(struct mount *mp)
{
	int error;

	error = VFS_EXTATTRCTL(mp, EXTATTR_CMD_START, NULL, 0, NULL);
	if (error)
		printf("%s: failed to start extattr: error = %d\n",
		       mp->mnt_stat.f_mntonname, error);

	return error;
}

int
mount_domount(struct lwp *l, vnode_t **vpp, struct vfsops *vfsops,
    const char *path, int flags, void *data, size_t *data_len)
{
	vnode_t *vp = *vpp;
	struct mount *mp;
	struct pathbuf *pb;
	struct nameidata nd;
	int error, error2;

	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
	    KAUTH_REQ_SYSTEM_MOUNT_NEW, vp, KAUTH_ARG(flags), data);
	if (error) {
		vfs_delref(vfsops);
		return error;
	}

	/* Cannot make a non-dir a mount-point (from here anyway). */
	if (vp->v_type != VDIR) {
		vfs_delref(vfsops);
		return ENOTDIR;
	}

	if (flags & MNT_EXPORTED) {
		vfs_delref(vfsops);
		return EINVAL;
	}

	if ((mp = vfs_mountalloc(vfsops, vp)) == NULL) {
		vfs_delref(vfsops);
		return ENOMEM;
	}

	mp->mnt_stat.f_owner = kauth_cred_geteuid(l->l_cred);

	/*
	 * The underlying file system may refuse the mount for
	 * various reasons.  Allow the user to force it to happen.
	 *
	 * Set the mount level flags.
	 */
	mp->mnt_flag = flags & (MNT_BASIC_FLAGS | MNT_FORCE | MNT_IGNORE);

	error = VFS_MOUNT(mp, path, data, data_len);
	mp->mnt_flag &= ~MNT_OP_FLAGS;

	if (error != 0) {
		vfs_rele(mp);
		return error;
	}

	/* Suspend new file system before taking mnt_updating. */
	do {
		error2 = vfs_suspend(mp, 0);
	} while (error2 == EINTR || error2 == ERESTART);
	KASSERT(error2 == 0 || error2 == EOPNOTSUPP);
	mutex_enter(mp->mnt_updating);

	/*
	 * Validate and prepare the mount point.
	 */
	error = pathbuf_copyin(path, &pb);
	if (error != 0) {
		goto err_mounted;
	}
	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
	error = namei(&nd);
	pathbuf_destroy(pb);
	if (error != 0) {
		goto err_mounted;
	}
	if (nd.ni_vp != vp) {
		vput(nd.ni_vp);
		error = EINVAL;
		goto err_mounted;
	}
	if (vp->v_mountedhere != NULL) {
		vput(nd.ni_vp);
		error = EBUSY;
		goto err_mounted;
	}
	error = vinvalbuf(vp, V_SAVE, l->l_cred, l, 0, 0);
	if (error != 0) {
		vput(nd.ni_vp);
		goto err_mounted;
	}

	/*
	 * Put the new filesystem on the mount list after root.
	 */
	cache_purge(vp);
	mp->mnt_iflag &= ~IMNT_WANTRDWR;

	mountlist_append(mp);
	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
		vfs_syncer_add_to_worklist(mp);
	vp->v_mountedhere = mp;
	vput(nd.ni_vp);

	mount_checkdirs(vp);
	mutex_exit(mp->mnt_updating);
	if (error2 == 0)
		vfs_resume(mp);

	/* Hold an additional reference to the mount across VFS_START(). */
	vfs_ref(mp);
	(void) VFS_STATVFS(mp, &mp->mnt_stat);
	error = VFS_START(mp, 0);
	if (error) {
		vrele(vp);
	} else if (flags & MNT_EXTATTR) {
		if (start_extattr(mp) != 0)
			mp->mnt_flag &= ~MNT_EXTATTR;
	}
	/* Drop reference held for VFS_START(). */
	vfs_rele(mp);
	*vpp = NULL;
	return error;

err_mounted:
	if (VFS_UNMOUNT(mp, MNT_FORCE) != 0)
		panic("Unmounting fresh file system failed");
	mutex_exit(mp->mnt_updating);
	if (error2 == 0)
		vfs_resume(mp);
	vfs_set_lowermount(mp, NULL);
	vfs_rele(mp);

	return error;
}

/*
 * Do the actual file system unmount.  The file system is assumed to have
 * been locked by the caller.
 *
 * => The caller holds a reference to the mount, explicitly for dounmount().
 */
int
dounmount(struct mount *mp, int flags, struct lwp *l)
{
	struct vnode *coveredvp, *vp;
	struct vnode_impl *vip;
	int error, async, used_syncer, used_extattr;
	const bool was_suspended = fstrans_is_owner(mp);

#if NVERIEXEC > 0
	error = veriexec_unmountchk(mp);
	if (error)
		return (error);
#endif /* NVERIEXEC > 0 */

	if (!was_suspended) {
		error = vfs_suspend(mp, 0);
		if (error) {
			return error;
		}
	}

	KASSERT((mp->mnt_iflag & IMNT_GONE) == 0);

	used_syncer = (mp->mnt_iflag & IMNT_ONWORKLIST) != 0;
	used_extattr = mp->mnt_flag & MNT_EXTATTR;

	mp->mnt_iflag |= IMNT_UNMOUNT;
	mutex_enter(mp->mnt_updating);
	async = mp->mnt_flag & MNT_ASYNC;
	mp->mnt_flag &= ~MNT_ASYNC;
	cache_purgevfs(mp);	/* remove cache entries for this file sys */
	if (used_syncer)
		vfs_syncer_remove_from_worklist(mp);
	error = 0;
	if (((mp->mnt_flag & MNT_RDONLY) == 0) && ((flags & MNT_FORCE) == 0)) {
		error = VFS_SYNC(mp, MNT_WAIT, l->l_cred);
	}
	if (error == 0 || (flags & MNT_FORCE)) {
		error = VFS_UNMOUNT(mp, flags);
	}
	if (error) {
		mp->mnt_iflag &= ~IMNT_UNMOUNT;
		if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
			vfs_syncer_add_to_worklist(mp);
		mp->mnt_flag |= async;
		mutex_exit(mp->mnt_updating);
		if (!was_suspended)
			vfs_resume(mp);
		if (used_extattr) {
			if (start_extattr(mp) != 0)
				mp->mnt_flag &= ~MNT_EXTATTR;
			else
				mp->mnt_flag |= MNT_EXTATTR;
		}
		return (error);
	}
	mutex_exit(mp->mnt_updating);

	/*
	 * Mark the file system as gone to prevent further unmount
	 * attempts; this also prevents vfs_busy() from succeeding.
	 */
	mp->mnt_iflag |= IMNT_GONE;
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		coveredvp->v_mountedhere = NULL;
	}
	if (!was_suspended)
		vfs_resume(mp);

	mountlist_remove(mp);

	if ((vip = TAILQ_FIRST(&mp->mnt_vnodelist)) != NULL) {
		vp = VIMPL_TO_VNODE(vip);
		vprint("dangling", vp);
		panic("unmount: dangling vnode");
	}
	vfs_hooks_unmount(mp);

	vfs_set_lowermount(mp, NULL);
	vfs_rele(mp);	/* reference from mount() */
	if (coveredvp != NULLVP) {
		vrele(coveredvp);
	}
	return (0);
}

/*
 * Unmount all file systems.
 * We traverse the list in reverse order under the assumption that doing so
 * will avoid needing to worry about dependencies.
 */
bool
vfs_unmountall(struct lwp *l)
{

	printf("unmounting file systems...\n");
	return vfs_unmountall1(l, true, true);
}

static void
vfs_unmount_print(struct mount *mp, const char *pfx)
{

	aprint_verbose("%sunmounted %s on %s type %s\n", pfx,
	    mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname,
	    mp->mnt_stat.f_fstypename);
}

/*
 * Return the mount with the highest generation less than "gen".
 */
static struct mount *
vfs_unmount_next(uint64_t gen)
{
	mount_iterator_t *iter;
	struct mount *mp, *nmp;

	nmp = NULL;

	mountlist_iterator_init(&iter);
	while ((mp = mountlist_iterator_next(iter)) != NULL) {
		if ((nmp == NULL || mp->mnt_gen > nmp->mnt_gen) &&
		    mp->mnt_gen < gen) {
			if (nmp != NULL)
				vfs_rele(nmp);
			nmp = mp;
			vfs_ref(nmp);
		}
	}
	mountlist_iterator_destroy(iter);

	return nmp;
}

bool
vfs_unmount_forceone(struct lwp *l)
{
	struct mount *mp;
	int error;

	mp = vfs_unmount_next(mountgen);
	if (mp == NULL) {
		return false;
	}

#ifdef DEBUG
	printf("forcefully unmounting %s (%s)...\n",
	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
#endif
	if ((error = dounmount(mp, MNT_FORCE, l)) == 0) {
		vfs_unmount_print(mp, "forcefully ");
		return true;
	} else {
		vfs_rele(mp);
	}

#ifdef DEBUG
	printf("forceful unmount of %s failed with error %d\n",
	    mp->mnt_stat.f_mntonname, error);
#endif

	return false;
}

bool
vfs_unmountall1(struct lwp *l, bool force, bool verbose)
{
	struct mount *mp;
	mount_iterator_t *iter;
	bool any_error = false, progress = false;
	uint64_t gen;
	int error;

	gen = mountgen;
	for (;;) {
		mp = vfs_unmount_next(gen);
		if (mp == NULL)
			break;
		gen = mp->mnt_gen;

#ifdef DEBUG
		printf("unmounting %p %s (%s)...\n",
		    (void *)mp, mp->mnt_stat.f_mntonname,
		    mp->mnt_stat.f_mntfromname);
#endif
		if ((error = dounmount(mp, force ? MNT_FORCE : 0, l)) == 0) {
			vfs_unmount_print(mp, "");
			progress = true;
		} else {
			vfs_rele(mp);
			if (verbose) {
				printf("unmount of %s failed with error %d\n",
				    mp->mnt_stat.f_mntonname, error);
			}
			any_error = true;
		}
	}
	if (verbose) {
		printf("unmounting done\n");
	}
	if (any_error && verbose) {
		printf("WARNING: some file systems would not unmount\n");
	}
	/* If the mountlist is empty it is time to remove swap. */
	mountlist_iterator_init(&iter);
	if (mountlist_iterator_next(iter) == NULL) {
		uvm_swap_shutdown(l);
	}
	mountlist_iterator_destroy(iter);

	return progress;
}

void
vfs_sync_all(struct lwp *l)
{
	printf("syncing disks... ");

	/* remove user processes from run queue */
	suspendsched();
	(void)spl0();

	/* avoid coming back this way again if we panic. */
	doing_shutdown = 1;

	do_sys_sync(l);

	/* Wait for sync to finish. */
	if (vfs_syncwait() != 0) {
#if defined(DDB) && defined(DEBUG_HALT_BUSY)
		Debugger();
#endif
		printf("giving up\n");
		return;
	} else
		printf("done\n");
}

/*
 * Sync and unmount file systems before shutting down.
 */
void
vfs_shutdown(void)
{
	lwp_t *l = curlwp;

	vfs_sync_all(l);

	/*
	 * If we have panicked - do not make the situation potentially
	 * worse by unmounting the file systems.
	 */
	if (panicstr != NULL) {
		return;
	}

	/* Unmount file systems. */
	vfs_unmountall(l);
}

/*
 * Print a list of supported file system types (used by vfs_mountroot)
 */
static void
vfs_print_fstypes(void)
{
	struct vfsops *v;
	int cnt = 0;

	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(v, &vfs_list, vfs_list)
		++cnt;
	mutex_exit(&vfs_list_lock);

	if (cnt == 0) {
		printf("WARNING: No file system modules have been loaded.\n");
		return;
	}

	printf("Supported file systems:");
	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		printf(" %s", v->vfs_name);
	}
	mutex_exit(&vfs_list_lock);
	printf("\n");
}

/*
 * Mount the root file system.  If the operator didn't specify a
 * file system to use, try all possible file systems until one
 * succeeds.
 */
int
vfs_mountroot(void)
{
	struct vfsops *v;
	int error = ENODEV;

	if (root_device == NULL)
		panic("vfs_mountroot: root device unknown");

	switch (device_class(root_device)) {
	case DV_IFNET:
		if (rootdev != NODEV)
			panic("vfs_mountroot: rootdev set for DV_IFNET "
			    "(0x%llx -> %llu,%llu)",
			    (unsigned long long)rootdev,
			    (unsigned long long)major(rootdev),
			    (unsigned long long)minor(rootdev));
		break;

	case DV_DISK:
		if (rootdev == NODEV)
			panic("vfs_mountroot: rootdev not set for DV_DISK");
		if (bdevvp(rootdev, &rootvp))
			panic("vfs_mountroot: can't get vnode for rootdev");
		vn_lock(rootvp, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_OPEN(rootvp, FREAD, FSCRED);
		VOP_UNLOCK(rootvp);
		if (error) {
			printf("vfs_mountroot: can't open root device\n");
			return (error);
		}
		break;

	case DV_VIRTUAL:
		break;

	default:
		printf("%s: inappropriate for root file system\n",
		    device_xname(root_device));
		return (ENODEV);
	}

	/*
	 * If user specified a root fs type, use it.  Make sure the
	 * specified type exists and has a mount_root()
	 */
	if (strcmp(rootfstype, ROOT_FSTYPE_ANY) != 0) {
		v = vfs_getopsbyname(rootfstype);
		error = EFTYPE;
		if (v != NULL) {
			if (v->vfs_mountroot != NULL) {
				error = (v->vfs_mountroot)();
			}
			v->vfs_refcount--;
		}
		goto done;
	}

	/*
	 * Try each file system currently configured into the kernel.
	 */
	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (v->vfs_mountroot == NULL)
			continue;
#ifdef DEBUG
		aprint_normal("mountroot: trying %s...\n", v->vfs_name);
#endif
		v->vfs_refcount++;
		mutex_exit(&vfs_list_lock);
		error = (*v->vfs_mountroot)();
		mutex_enter(&vfs_list_lock);
		v->vfs_refcount--;
		if (!error) {
			aprint_normal("root file system type: %s\n",
			    v->vfs_name);
			break;
		}
	}
	mutex_exit(&vfs_list_lock);

	if (v == NULL) {
		vfs_print_fstypes();
		printf("no file system for %s", device_xname(root_device));
		if (device_class(root_device) == DV_DISK)
			printf(" (dev 0x%llx)", (unsigned long long)rootdev);
		printf("\n");
		error = EFTYPE;
	}

done:
	if (error && device_class(root_device) == DV_DISK) {
		vn_lock(rootvp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(rootvp, FREAD, FSCRED);
		VOP_UNLOCK(rootvp);
		vrele(rootvp);
	}
	if (error == 0) {
		mount_iterator_t *iter;
		struct mount *mp;

		mountlist_iterator_init(&iter);
		mp = mountlist_iterator_next(iter);
		KASSERT(mp != NULL);
		mountlist_iterator_destroy(iter);

		mp->mnt_flag |= MNT_ROOTFS;
		mp->mnt_op->vfs_refcount++;

		/*
		 * Get the vnode for '/'.  Set cwdi0.cwdi_cdir to
		 * reference it, and donate it the reference grabbed
		 * with VFS_ROOT().
		 */
		error = VFS_ROOT(mp, LK_NONE, &rootvnode);
		if (error)
			panic("cannot find root vnode, error=%d", error);
		cwdi0.cwdi_cdir = rootvnode;
		cwdi0.cwdi_rdir = NULL;

		/*
		 * Now that root is mounted, we can fixup initproc's CWD
		 * info.  All other processes are kthreads, which merely
		 * share proc0's CWD info.
		 */
		initproc->p_cwdi->cwdi_cdir = rootvnode;
		vref(initproc->p_cwdi->cwdi_cdir);
		initproc->p_cwdi->cwdi_rdir = NULL;
		/*
		 * Enable loading of modules from the filesystem
		 */
		module_load_vfs_init();

	}
	return (error);
}

/*
 * mount_specific_key_create --
 *	Create a key for subsystem mount-specific data.
 */
int
mount_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
{

	return specificdata_key_create(mount_specificdata_domain, keyp, dtor);
}

/*
 * mount_specific_key_delete --
 *	Delete a key for subsystem mount-specific data.
 */
void
mount_specific_key_delete(specificdata_key_t key)
{

	specificdata_key_delete(mount_specificdata_domain, key);
}

/*
 * mount_initspecific --
 *	Initialize a mount's specificdata container.
 */
void
mount_initspecific(struct mount *mp)
{
	int error __diagused;

	error = specificdata_init(mount_specificdata_domain,
				  &mp->mnt_specdataref);
	KASSERT(error == 0);
}

/*
 * mount_finispecific --
 *	Finalize a mount's specificdata container.
 */
void
mount_finispecific(struct mount *mp)
{

	specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
}

/*
 * mount_getspecific --
 *	Return mount-specific data corresponding to the specified key.
 */
void *
mount_getspecific(struct mount *mp, specificdata_key_t key)
{

	return specificdata_getspecific(mount_specificdata_domain,
					 &mp->mnt_specdataref, key);
}

/*
 * mount_setspecific --
 *	Set mount-specific data corresponding to the specified key.
 */
void
mount_setspecific(struct mount *mp, specificdata_key_t key, void *data)
{

	specificdata_setspecific(mount_specificdata_domain,
				 &mp->mnt_specdataref, key, data);
}

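/*
 * Illustrative sketch (not part of this file): how a subsystem would use
 * the mount-specific data interface above.  "example_key" and
 * "example_dtor" are hypothetical names.
 *
 *	static specificdata_key_t example_key;
 *
 *	// once, at subsystem initialization
 *	error = mount_specific_key_create(&example_key, example_dtor);
 *
 *	// attach per-mount data, and read it back later
 *	mount_setspecific(mp, example_key, data);
 *	data = mount_getspecific(mp, example_key);
 */
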
/*
 * Check to see if a filesystem is mounted on a block device.
 */
int
vfs_mountedon(vnode_t *vp)
{
	vnode_t *vq;
	int error = 0;

	if (vp->v_type != VBLK)
		return ENOTBLK;
	if (spec_node_getmountedfs(vp) != NULL)
		return EBUSY;
	if (spec_node_lookup_by_dev(vp->v_type, vp->v_rdev, VDEAD_NOWAIT, &vq)
	    == 0) {
		if (spec_node_getmountedfs(vq) != NULL)
			error = EBUSY;
		vrele(vq);
	}

	return error;
}

/*
 * Check if a device pointed to by vp is mounted.
 *
 * Returns:
 *   EINVAL	if it's not a disk
 *   EBUSY	if it's a disk and mounted
 *   0		if it's a disk and not mounted
 */
int
rawdev_mounted(vnode_t *vp, vnode_t **bvpp)
{
	vnode_t *bvp;
	dev_t dev;
	int d_type;

	bvp = NULL;
	d_type = D_OTHER;

	if (iskmemvp(vp))
		return EINVAL;

	switch (vp->v_type) {
	case VCHR: {
		const struct cdevsw *cdev;

		dev = vp->v_rdev;
		cdev = cdevsw_lookup(dev);
		if (cdev != NULL) {
			dev_t blkdev;

			blkdev = devsw_chr2blk(dev);
			if (blkdev != NODEV) {
				if (vfinddev(blkdev, VBLK, &bvp) != 0) {
					d_type = (cdev->d_flag & D_TYPEMASK);
					/* XXX: what if bvp disappears? */
					vrele(bvp);
				}
			}
		}

		break;
		}

	case VBLK: {
		const struct bdevsw *bdev;

		dev = vp->v_rdev;
		bdev = bdevsw_lookup(dev);
		if (bdev != NULL)
			d_type = (bdev->d_flag & D_TYPEMASK);

		bvp = vp;

		break;
		}

	default:
		break;
	}

	if (d_type != D_DISK)
		return EINVAL;

	if (bvpp != NULL)
		*bvpp = bvp;

	/*
	 * XXX: This is bogus. We should be failing the request
	 * XXX: not only if this specific slice is mounted, but
	 * XXX: if it's on a disk with any other mounted slice.
	 */
	if (vfs_mountedon(bvp))
		return EBUSY;

	return 0;
}

/*
 * Make a 'unique' number from a mount type name.
 */
long
makefstype(const char *type)
{
	long rv;

	for (rv = 0; *type; type++) {
		rv <<= 2;
		rv ^= *type;
	}
	return rv;
}

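/*
 * Worked example (illustrative only): for the name "ffs" the loop above
 * computes, with 'f' == 0x66 and 's' == 0x73,
 *
 *	rv = 0x66				after the first 'f'
 *	rv = (0x66 << 2) ^ 0x66 = 0x1fe		after the second 'f'
 *	rv = (0x1fe << 2) ^ 0x73 = 0x78b	after 's'
 *
 * so makefstype("ffs") == 0x78b.  Different names may collide; the result
 * is only "unique" in the informal sense noted above.
 */
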
static struct mountlist_entry *
mountlist_alloc(enum mountlist_type type, struct mount *mp)
{
	struct mountlist_entry *me;

	me = kmem_zalloc(sizeof(*me), KM_SLEEP);
	me->me_mount = mp;
	me->me_type = type;

	return me;
}

static void
mountlist_free(struct mountlist_entry *me)
{

	kmem_free(me, sizeof(*me));
}

void
mountlist_iterator_init(mount_iterator_t **mip)
{
	struct mountlist_entry *me;

	me = mountlist_alloc(ME_MARKER, NULL);
	mutex_enter(&mountlist_lock);
	TAILQ_INSERT_HEAD(&mountlist, me, me_list);
	mutex_exit(&mountlist_lock);
	*mip = (mount_iterator_t *)me;
}

void
mountlist_iterator_destroy(mount_iterator_t *mi)
{
	struct mountlist_entry *marker = &mi->mi_entry;

	if (marker->me_mount != NULL)
		vfs_unbusy(marker->me_mount);

	mutex_enter(&mountlist_lock);
	TAILQ_REMOVE(&mountlist, marker, me_list);
	mutex_exit(&mountlist_lock);

	mountlist_free(marker);
}

/*
 * Return the next mount or NULL for this iterator.
 * Mark it busy on success.
 */
static inline struct mount *
_mountlist_iterator_next(mount_iterator_t *mi, bool wait)
{
	struct mountlist_entry *me, *marker = &mi->mi_entry;
	struct mount *mp;
	int error;

	if (marker->me_mount != NULL) {
		vfs_unbusy(marker->me_mount);
		marker->me_mount = NULL;
	}

	mutex_enter(&mountlist_lock);
	for (;;) {
		KASSERT(marker->me_type == ME_MARKER);

		me = TAILQ_NEXT(marker, me_list);
		if (me == NULL) {
			/* End of list: keep marker and return. */
			mutex_exit(&mountlist_lock);
			return NULL;
		}
		TAILQ_REMOVE(&mountlist, marker, me_list);
		TAILQ_INSERT_AFTER(&mountlist, me, marker, me_list);

		/* Skip other markers. */
		if (me->me_type != ME_MOUNT)
			continue;

		/* Take an initial reference for vfs_busy() below. */
		mp = me->me_mount;
		KASSERT(mp != NULL);
		vfs_ref(mp);
		mutex_exit(&mountlist_lock);

		/* Try to mark this mount busy and return on success. */
		if (wait)
			error = vfs_busy(mp);
		else
			error = vfs_trybusy(mp);
		if (error == 0) {
			vfs_rele(mp);
			marker->me_mount = mp;
			return mp;
		}
		vfs_rele(mp);
		mutex_enter(&mountlist_lock);
	}
}

struct mount *
mountlist_iterator_next(mount_iterator_t *mi)
{

	return _mountlist_iterator_next(mi, true);
}

struct mount *
mountlist_iterator_trynext(mount_iterator_t *mi)
{

	return _mountlist_iterator_next(mi, false);
}

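/*
 * Illustrative sketch (not part of this file): walking the mount list with
 * the iterator above.  Each mount returned is busied; the busy reference is
 * dropped automatically by the next mountlist_iterator_next() call or by
 * mountlist_iterator_destroy(), so the caller does not call vfs_unbusy()
 * itself.
 *
 *	mount_iterator_t *iter;
 *	struct mount *mp;
 *
 *	mountlist_iterator_init(&iter);
 *	while ((mp = mountlist_iterator_next(iter)) != NULL) {
 *		// ... use mp; it stays busy for this iteration ...
 *	}
 *	mountlist_iterator_destroy(iter);
 */
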
/*
 * Attach new mount to the end of the mount list.
 */
void
mountlist_append(struct mount *mp)
{
	struct mountlist_entry *me;

	me = mountlist_alloc(ME_MOUNT, mp);
	mutex_enter(&mountlist_lock);
	TAILQ_INSERT_TAIL(&mountlist, me, me_list);
	mutex_exit(&mountlist_lock);
}

/*
 * Remove mount from mount list.
 */
void
mountlist_remove(struct mount *mp)
{
	struct mountlist_entry *me;

	mutex_enter(&mountlist_lock);
	TAILQ_FOREACH(me, &mountlist, me_list)
		if (me->me_type == ME_MOUNT && me->me_mount == mp)
			break;
	KASSERT(me != NULL);
	TAILQ_REMOVE(&mountlist, me, me_list);
	mutex_exit(&mountlist_lock);
	mountlist_free(me);
}

/*
 * Unlocked variant to traverse the mountlist.
 * To be used from DDB only.
 */
struct mount *
_mountlist_next(struct mount *mp)
{
	struct mountlist_entry *me;

	if (mp == NULL) {
		me = TAILQ_FIRST(&mountlist);
	} else {
		TAILQ_FOREACH(me, &mountlist, me_list)
			if (me->me_type == ME_MOUNT && me->me_mount == mp)
				break;
		if (me != NULL)
			me = TAILQ_NEXT(me, me_list);
	}

	while (me != NULL && me->me_type != ME_MOUNT)
		me = TAILQ_NEXT(me, me_list);

	return (me ? me->me_mount : NULL);
}