1/*	$NetBSD$	*/
2
3/*-
4 * Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33/*
34 * Copyright (c) 1989, 1993
35 *	The Regents of the University of California.  All rights reserved.
36 * (c) UNIX System Laboratories, Inc.
37 * All or some portions of this file are derived from material licensed
38 * to the University of California by American Telephone and Telegraph
39 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
40 * the permission of UNIX System Laboratories, Inc.
41 *
42 * Redistribution and use in source and binary forms, with or without
43 * modification, are permitted provided that the following conditions
44 * are met:
45 * 1. Redistributions of source code must retain the above copyright
46 *    notice, this list of conditions and the following disclaimer.
47 * 2. Redistributions in binary form must reproduce the above copyright
48 *    notice, this list of conditions and the following disclaimer in the
49 *    documentation and/or other materials provided with the distribution.
50 * 3. Neither the name of the University nor the names of its contributors
51 *    may be used to endorse or promote products derived from this software
52 *    without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 *	@(#)vfs_subr.c	8.13 (Berkeley) 4/18/94
67 */
68
69#include <sys/cdefs.h>
70__KERNEL_RCSID(0, "$NetBSD$");
71
72#include <sys/param.h>
73#include <sys/kernel.h>
74
75#include <sys/atomic.h>
76#include <sys/buf.h>
77#include <sys/conf.h>
78#include <sys/fcntl.h>
79#include <sys/filedesc.h>
80#include <sys/device.h>
81#include <sys/kauth.h>
82#include <sys/kmem.h>
83#include <sys/module.h>
84#include <sys/mount.h>
85#include <sys/namei.h>
86#include <sys/extattr.h>
87#include <sys/syscallargs.h>
88#include <sys/sysctl.h>
89#include <sys/systm.h>
90#include <sys/vfs_syscalls.h>
91#include <sys/vnode.h>
92
93#include <miscfs/genfs/genfs.h>
94#include <miscfs/syncfs/syncfs.h>
95#include <miscfs/specfs/specdev.h>
96
/* Root filesystem and device. */
vnode_t *			rootvnode;
struct device *			root_device;

/* Mounted filesystem list. */
struct mntlist			mountlist;
kmutex_t			mountlist_lock;

/*
 * mntvnode_lock is held while a mount's mnt_vnodelist is walked or
 * modified; vfs_list_lock is held while vfs_list is walked and while
 * vfs_refcount is adjusted in the lookup paths of this file.
 */
kmutex_t			mntvnode_lock;
kmutex_t			vfs_list_lock;

/*
 * Domain for per-mount specificdata, and the lock that serializes
 * fsid assignment in vfs_getnewfsid().
 */
static specificdata_domain_t	mount_specificdata_domain;
static kmutex_t			mntid_lock;

/*
 * Mount generation counter; each mount created by vfs_mountalloc() takes
 * the current value under mountgen_lock and the counter is then bumped.
 */
static kmutex_t			mountgen_lock;
static uint64_t			mountgen;
113
114void
115vfs_mount_sysinit(void)
116{
117
118	CIRCLEQ_INIT(&mountlist);
119	mutex_init(&mountlist_lock, MUTEX_DEFAULT, IPL_NONE);
120	mutex_init(&mntvnode_lock, MUTEX_DEFAULT, IPL_NONE);
121	mutex_init(&vfs_list_lock, MUTEX_DEFAULT, IPL_NONE);
122
123	mount_specificdata_domain = specificdata_domain_create();
124	mutex_init(&mntid_lock, MUTEX_DEFAULT, IPL_NONE);
125	mutex_init(&mountgen_lock, MUTEX_DEFAULT, IPL_NONE);
126	mountgen = 0;
127}
128
129struct mount *
130vfs_mountalloc(struct vfsops *vfsops, vnode_t *vp)
131{
132	struct mount *mp;
133	int error;
134
135	mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
136	if (mp == NULL)
137		return NULL;
138
139	mp->mnt_op = vfsops;
140	mp->mnt_refcnt = 1;
141	TAILQ_INIT(&mp->mnt_vnodelist);
142	rw_init(&mp->mnt_unmounting);
143	mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE);
144	mutex_init(&mp->mnt_updating, MUTEX_DEFAULT, IPL_NONE);
145	error = vfs_busy(mp, NULL);
146	KASSERT(error == 0);
147	mp->mnt_vnodecovered = vp;
148	mount_initspecific(mp);
149
150	mutex_enter(&mountgen_lock);
151	mp->mnt_gen = mountgen++;
152	mutex_exit(&mountgen_lock);
153
154	return mp;
155}
156
157/*
158 * vfs_rootmountalloc: lookup a filesystem type, and if found allocate and
159 * initialize a mount structure for it.
160 *
161 * Devname is usually updated by mount(8) after booting.
162 */
163int
164vfs_rootmountalloc(const char *fstypename, const char *devname,
165    struct mount **mpp)
166{
167	struct vfsops *vfsp = NULL;
168	struct mount *mp;
169
170	mutex_enter(&vfs_list_lock);
171	LIST_FOREACH(vfsp, &vfs_list, vfs_list)
172		if (!strncmp(vfsp->vfs_name, fstypename,
173		    sizeof(mp->mnt_stat.f_fstypename)))
174			break;
175	if (vfsp == NULL) {
176		mutex_exit(&vfs_list_lock);
177		return (ENODEV);
178	}
179	vfsp->vfs_refcount++;
180	mutex_exit(&vfs_list_lock);
181
182	if ((mp = vfs_mountalloc(vfsp, NULL)) == NULL)
183		return ENOMEM;
184	mp->mnt_flag = MNT_RDONLY;
185	(void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
186	    sizeof(mp->mnt_stat.f_fstypename));
187	mp->mnt_stat.f_mntonname[0] = '/';
188	mp->mnt_stat.f_mntonname[1] = '\0';
189	mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
190	    '\0';
191	(void)copystr(devname, mp->mnt_stat.f_mntfromname,
192	    sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
193	*mpp = mp;
194	return 0;
195}
196
197/*
198 * vfs_getnewfsid: get a new unique fsid.
199 */
200void
201vfs_getnewfsid(struct mount *mp)
202{
203	static u_short xxxfs_mntid;
204	fsid_t tfsid;
205	int mtype;
206
207	mutex_enter(&mntid_lock);
208	mtype = makefstype(mp->mnt_op->vfs_name);
209	mp->mnt_stat.f_fsidx.__fsid_val[0] = makedev(mtype, 0);
210	mp->mnt_stat.f_fsidx.__fsid_val[1] = mtype;
211	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
212	if (xxxfs_mntid == 0)
213		++xxxfs_mntid;
214	tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid);
215	tfsid.__fsid_val[1] = mtype;
216	if (!CIRCLEQ_EMPTY(&mountlist)) {
217		while (vfs_getvfs(&tfsid)) {
218			tfsid.__fsid_val[0]++;
219			xxxfs_mntid++;
220		}
221	}
222	mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
223	mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
224	mutex_exit(&mntid_lock);
225}
226
227/*
228 * Lookup a mount point by filesystem identifier.
229 *
230 * XXX Needs to add a reference to the mount point.
231 */
232struct mount *
233vfs_getvfs(fsid_t *fsid)
234{
235	struct mount *mp;
236
237	mutex_enter(&mountlist_lock);
238	CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) {
239		if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
240		    mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
241			mutex_exit(&mountlist_lock);
242			return (mp);
243		}
244	}
245	mutex_exit(&mountlist_lock);
246	return NULL;
247}
248
249/*
250 * Drop a reference to a mount structure, freeing if the last reference.
251 */
252void
253vfs_destroy(struct mount *mp)
254{
255
256	if (__predict_true((int)atomic_dec_uint_nv(&mp->mnt_refcnt) > 0)) {
257		return;
258	}
259
260	/*
261	 * Nothing else has visibility of the mount: we can now
262	 * free the data structures.
263	 */
264	KASSERT(mp->mnt_refcnt == 0);
265	specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
266	rw_destroy(&mp->mnt_unmounting);
267	mutex_destroy(&mp->mnt_updating);
268	mutex_destroy(&mp->mnt_renamelock);
269	if (mp->mnt_op != NULL) {
270		vfs_delref(mp->mnt_op);
271	}
272	kmem_free(mp, sizeof(*mp));
273}
274
275/*
276 * Mark a mount point as busy, and gain a new reference to it.  Used to
277 * prevent the file system from being unmounted during critical sections.
278 *
279 * => The caller must hold a pre-existing reference to the mount.
280 * => Will fail if the file system is being unmounted, or is unmounted.
281 */
282int
283vfs_busy(struct mount *mp, struct mount **nextp)
284{
285
286	KASSERT(mp->mnt_refcnt > 0);
287
288	if (__predict_false(!rw_tryenter(&mp->mnt_unmounting, RW_READER))) {
289		if (nextp != NULL) {
290			KASSERT(mutex_owned(&mountlist_lock));
291			*nextp = CIRCLEQ_NEXT(mp, mnt_list);
292		}
293		return EBUSY;
294	}
295	if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) {
296		rw_exit(&mp->mnt_unmounting);
297		if (nextp != NULL) {
298			KASSERT(mutex_owned(&mountlist_lock));
299			*nextp = CIRCLEQ_NEXT(mp, mnt_list);
300		}
301		return ENOENT;
302	}
303	if (nextp != NULL) {
304		mutex_exit(&mountlist_lock);
305	}
306	atomic_inc_uint(&mp->mnt_refcnt);
307	return 0;
308}
309
310/*
311 * Unbusy a busy filesystem.
312 *
313 * => If keepref is true, preserve reference added by vfs_busy().
314 * => If nextp != NULL, acquire mountlist_lock.
315 */
316void
317vfs_unbusy(struct mount *mp, bool keepref, struct mount **nextp)
318{
319
320	KASSERT(mp->mnt_refcnt > 0);
321
322	if (nextp != NULL) {
323		mutex_enter(&mountlist_lock);
324	}
325	rw_exit(&mp->mnt_unmounting);
326	if (!keepref) {
327		vfs_destroy(mp);
328	}
329	if (nextp != NULL) {
330		KASSERT(mutex_owned(&mountlist_lock));
331		*nextp = CIRCLEQ_NEXT(mp, mnt_list);
332	}
333}
334
335/*
336 * Insert a marker vnode into a mount's vnode list, after the
337 * specified vnode.  mntvnode_lock must be held.
338 */
339void
340vmark(vnode_t *mvp, vnode_t *vp)
341{
342	struct mount *mp = mvp->v_mount;
343
344	KASSERT(mutex_owned(&mntvnode_lock));
345	KASSERT((mvp->v_iflag & VI_MARKER) != 0);
346	KASSERT(vp->v_mount == mp);
347
348	TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vp, mvp, v_mntvnodes);
349}
350
351/*
352 * Remove a marker vnode from a mount's vnode list, and return
353 * a pointer to the next vnode in the list.  mntvnode_lock must
354 * be held.
355 */
356vnode_t *
357vunmark(vnode_t *mvp)
358{
359	struct mount *mp = mvp->v_mount;
360	vnode_t *vp;
361
362	KASSERT(mutex_owned(&mntvnode_lock));
363	KASSERT((mvp->v_iflag & VI_MARKER) != 0);
364
365	vp = TAILQ_NEXT(mvp, v_mntvnodes);
366	TAILQ_REMOVE(&mp->mnt_vnodelist, mvp, v_mntvnodes);
367
368	KASSERT(vp == NULL || vp->v_mount == mp);
369
370	return vp;
371}
372
373/*
374 * Move a vnode from one mount queue to another.
375 */
376void
377vfs_insmntque(vnode_t *vp, struct mount *mp)
378{
379	struct mount *omp;
380
381	KASSERT(mp == NULL || (mp->mnt_iflag & IMNT_UNMOUNT) == 0 ||
382	    vp->v_tag == VT_VFS);
383
384	mutex_enter(&mntvnode_lock);
385	/*
386	 * Delete from old mount point vnode list, if on one.
387	 */
388	if ((omp = vp->v_mount) != NULL)
389		TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
390	/*
391	 * Insert into list of vnodes for the new mount point, if
392	 * available.  The caller must take a reference on the mount
393	 * structure and donate to the vnode.
394	 */
395	if ((vp->v_mount = mp) != NULL)
396		TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
397	mutex_exit(&mntvnode_lock);
398
399	if (omp != NULL) {
400		/* Release reference to old mount. */
401		vfs_destroy(omp);
402	}
403}
404
405/*
406 * Remove any vnodes in the vnode table belonging to mount point mp.
407 *
408 * If FORCECLOSE is not specified, there should not be any active ones,
409 * return error if any are found (nb: this is a user error, not a
410 * system error). If FORCECLOSE is specified, detach any active vnodes
411 * that are found.
412 *
413 * If WRITECLOSE is set, only flush out regular file vnodes open for
414 * writing.
415 *
416 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
417 */
#ifdef DEBUG
/* When non-zero, vflush() calls vprint() on every busy vnode it finds. */
int busyprt = 0;	/* print out busy vnodes */
/* Debug control entry pairing the name "busyprt" with the variable. */
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif
422
423static vnode_t *
424vflushnext(vnode_t *mvp, int *when)
425{
426
427	if (hardclock_ticks > *when) {
428		mutex_exit(&mntvnode_lock);
429		yield();
430		mutex_enter(&mntvnode_lock);
431		*when = hardclock_ticks + hz / 10;
432	}
433	return vunmark(mvp);
434}
435
/*
 * vflush: clean out the vnodes on mount mp's vnode list.
 *
 * => skipvp, if it is on the list, is left alone.
 * => flags may contain SKIPSYSTEM, WRITECLOSE and FORCECLOSE.
 * => Returns EBUSY if at least one vnode with a non-zero use count was
 *    left alone because FORCECLOSE was not given; otherwise returns 0.
 */
int
vflush(struct mount *mp, vnode_t *skipvp, int flags)
{
	vnode_t *vp, *mvp;
	int busy = 0, when = 0;

	/* First, flush out any vnode references from vrele_list. */
	vrele_flush();

	/* Allocate a marker vnode. */
	mvp = vnalloc(mp);

	/*
	 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
	 * and vclean() are called.
	 *
	 * The marker is inserted after the current vnode at the top of
	 * every iteration; vflushnext() removes it again and returns the
	 * vnode that followed it.  This lets the walk resume at the right
	 * place even though mntvnode_lock is dropped inside the loop body.
	 */
	mutex_enter(&mntvnode_lock);
	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp != NULL;
	    vp = vflushnext(mvp, &when)) {
		vmark(mvp, vp);
		/* Ignore vnodes of other mounts and other walkers' markers. */
		if (vp->v_mount != mp || vismarker(vp))
			continue;
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;
		mutex_enter(vp->v_interlock);
		/*
		 * Ignore clean but still referenced vnodes.
		 */
		if ((vp->v_iflag & VI_CLEAN) != 0) {
			mutex_exit(vp->v_interlock);
			continue;
		}
		/*
		 * Skip over vnodes marked VV_SYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
			mutex_exit(vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file
		 * vnodes open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			mutex_exit(vp->v_interlock);
			continue;
		}
		/*
		 * With v_usecount == 0, all we need to do is clear
		 * out the vnode data structures and we are done.
		 *
		 * NOTE(review): the interlock taken above is not released
		 * explicitly on this path; presumably vclean()/vrelel()
		 * consume it -- confirm against their definitions.
		 */
		if (vp->v_usecount == 0) {
			mutex_exit(&mntvnode_lock);
			vremfree(vp);
			vp->v_usecount = 1;
			vclean(vp, DOCLOSE);
			vrelel(vp, 0);
			mutex_enter(&mntvnode_lock);
			continue;
		}
		/*
		 * If FORCECLOSE is set, forcibly close the vnode.
		 * For block or character devices, revert to an
		 * anonymous device.  For all other files, just
		 * kill them.
		 */
		if (flags & FORCECLOSE) {
			mutex_exit(&mntvnode_lock);
			/* Take our own use count for the duration. */
			atomic_inc_uint(&vp->v_usecount);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vclean(vp, DOCLOSE);
				vrelel(vp, 0);
			} else {
				vclean(vp, 0);
				vp->v_op = spec_vnodeop_p; /* XXXSMP */
				mutex_exit(vp->v_interlock);
				/*
				 * The vnode isn't clean, but still resides
				 * on the mount list.  Remove it. XXX This
				 * is a bit dodgy.
				 */
				vfs_insmntque(vp, NULL);
				vrele(vp);
			}
			mutex_enter(&mntvnode_lock);
			continue;
		}
#ifdef DEBUG
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		/* In use and not forced: leave it alone and count it. */
		mutex_exit(vp->v_interlock);
		busy++;
	}
	mutex_exit(&mntvnode_lock);
	vnfree(mvp);
	if (busy)
		return (EBUSY);
	return (0);
}
540
541/*
542 * Remove clean vnodes from a mountpoint's vnode list.
543 */
544void
545vfs_scrubvnlist(struct mount *mp)
546{
547	vnode_t *vp, *nvp;
548
549retry:
550	mutex_enter(&mntvnode_lock);
551	for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
552		nvp = TAILQ_NEXT(vp, v_mntvnodes);
553		mutex_enter(vp->v_interlock);
554		if ((vp->v_iflag & VI_CLEAN) != 0) {
555			TAILQ_REMOVE(&mp->mnt_vnodelist, vp, v_mntvnodes);
556			vp->v_mount = NULL;
557			mutex_exit(&mntvnode_lock);
558			mutex_exit(vp->v_interlock);
559			vfs_destroy(mp);
560			goto retry;
561		}
562		mutex_exit(vp->v_interlock);
563	}
564	mutex_exit(&mntvnode_lock);
565}
566
567/*
568 * Mount a file system.
569 */
570
/*
 * Scan all active processes to see if any of them have a current or root
 * directory onto which the new filesystem has just been mounted. If so,
 * replace them with the new mount point.
 *
 * => olddp is the vnode that was just covered; its v_mountedhere must
 *    already point to the new mount.
 * => Panics if the root vnode of the new file system cannot be obtained.
 * => rootvnode is switched over as well if it was olddp.
 */
static void
mount_checkdirs(vnode_t *olddp)
{
	vnode_t *newdp, *rele1, *rele2;
	struct cwdinfo *cwdi;
	struct proc *p;
	bool retry;

	/* A use count of one means there is nothing to redirect. */
	if (olddp->v_usecount == 1) {
		return;
	}
	if (VFS_ROOT(olddp->v_mountedhere, &newdp))
		panic("mount: lost mount");

	/*
	 * proc_lock has to be dropped to fix up a process, so after each
	 * fix the inner loop is left and the scan of allproc starts over.
	 * The outer loop ends once a full pass finds nothing to fix.
	 */
	do {
		retry = false;
		mutex_enter(proc_lock);
		PROCLIST_FOREACH(p, &allproc) {
			if ((cwdi = p->p_cwdi) == NULL)
				continue;
			/*
			 * Cannot change to the old directory any more,
			 * so even if we see a stale value it is not a
			 * problem.
			 */
			if (cwdi->cwdi_cdir != olddp &&
			    cwdi->cwdi_rdir != olddp)
				continue;
			retry = true;
			rele1 = NULL;
			rele2 = NULL;
			/*
			 * Hold the cwdinfo so it stays valid once proc_lock
			 * is released; cwdfree() below drops this hold.
			 */
			atomic_inc_uint(&cwdi->cwdi_refcnt);
			mutex_exit(proc_lock);
			rw_enter(&cwdi->cwdi_lock, RW_WRITER);
			/* Re-check under cwdi_lock before swapping. */
			if (cwdi->cwdi_cdir == olddp) {
				rele1 = cwdi->cwdi_cdir;
				vref(newdp);
				cwdi->cwdi_cdir = newdp;
			}
			if (cwdi->cwdi_rdir == olddp) {
				rele2 = cwdi->cwdi_rdir;
				vref(newdp);
				cwdi->cwdi_rdir = newdp;
			}
			rw_exit(&cwdi->cwdi_lock);
			cwdfree(cwdi);
			/* Release the old directory references unlocked. */
			if (rele1 != NULL)
				vrele(rele1);
			if (rele2 != NULL)
				vrele(rele2);
			mutex_enter(proc_lock);
			break;
		}
		mutex_exit(proc_lock);
	} while (retry);

	if (rootvnode == olddp) {
		vrele(rootvnode);
		vref(newdp);
		rootvnode = newdp;
	}
	/*
	 * NOTE(review): presumably VFS_ROOT() returned newdp locked and
	 * referenced, which this vput() undoes -- confirm.
	 */
	vput(newdp);
}
639
640int
641mount_domount(struct lwp *l, vnode_t **vpp, struct vfsops *vfsops,
642    const char *path, int flags, void *data, size_t *data_len)
643{
644	vnode_t *vp = *vpp;
645	struct mount *mp;
646	struct vattr va;
647	struct pathbuf *pb;
648	struct nameidata nd;
649	int error;
650
651	error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
652	    KAUTH_REQ_SYSTEM_MOUNT_NEW, vp, KAUTH_ARG(flags), data);
653	if (error) {
654		vfs_delref(vfsops);
655		return error;
656	}
657
658	/* Cannot make a non-dir a mount-point (from here anyway). */
659	if (vp->v_type != VDIR) {
660		vfs_delref(vfsops);
661		return ENOTDIR;
662	}
663
664	/*
665	 * If the user is not root, ensure that they own the directory
666	 * onto which we are attempting to mount.
667	 */
668	vn_lock(vp, LK_SHARED | LK_RETRY);
669	error = VOP_GETATTR(vp, &va, l->l_cred);
670	VOP_UNLOCK(vp);
671	if (error != 0) {
672		vfs_delref(vfsops);
673		return error;
674	}
675	if ((va.va_uid != kauth_cred_geteuid(l->l_cred) &&
676	    (error = kauth_authorize_generic(l->l_cred,
677	    KAUTH_GENERIC_ISSUSER, NULL)) != 0)) {
678		vfs_delref(vfsops);
679		return error;
680	}
681
682	if (flags & MNT_EXPORTED) {
683		vfs_delref(vfsops);
684		return EINVAL;
685	}
686
687	if ((mp = vfs_mountalloc(vfsops, vp)) == NULL) {
688		vfs_delref(vfsops);
689		return ENOMEM;
690	}
691
692	mp->mnt_stat.f_owner = kauth_cred_geteuid(l->l_cred);
693
694	/*
695	 * The underlying file system may refuse the mount for
696	 * various reasons.  Allow the user to force it to happen.
697	 *
698	 * Set the mount level flags.
699	 */
700	mp->mnt_flag = flags & (MNT_BASIC_FLAGS | MNT_FORCE | MNT_IGNORE);
701
702	mutex_enter(&mp->mnt_updating);
703	error = VFS_MOUNT(mp, path, data, data_len);
704	mp->mnt_flag &= ~MNT_OP_FLAGS;
705
706	if (error != 0)
707		goto err_unmounted;
708
709	/*
710	 * Validate and prepare the mount point.
711	 */
712	error = pathbuf_copyin(path, &pb);
713	if (error != 0) {
714		goto err_mounted;
715	}
716	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
717	error = namei(&nd);
718	pathbuf_destroy(pb);
719	if (error != 0) {
720		goto err_mounted;
721	}
722	if (nd.ni_vp != vp) {
723		vput(nd.ni_vp);
724		error = EINVAL;
725		goto err_mounted;
726	}
727	if (vp->v_mountedhere != NULL) {
728		vput(nd.ni_vp);
729		error = EBUSY;
730		goto err_mounted;
731	}
732	error = vinvalbuf(vp, V_SAVE, l->l_cred, l, 0, 0);
733	if (error != 0) {
734		vput(nd.ni_vp);
735		goto err_mounted;
736	}
737
738	/*
739	 * Put the new filesystem on the mount list after root.
740	 */
741	cache_purge(vp);
742	mp->mnt_iflag &= ~IMNT_WANTRDWR;
743
744	mutex_enter(&mountlist_lock);
745	CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
746	mutex_exit(&mountlist_lock);
747	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
748		error = vfs_allocate_syncvnode(mp);
749	if (error == 0)
750		vp->v_mountedhere = mp;
751	vput(nd.ni_vp);
752	if (error != 0)
753		goto err_onmountlist;
754
755	mount_checkdirs(vp);
756	mutex_exit(&mp->mnt_updating);
757
758	/* Hold an additional reference to the mount across VFS_START(). */
759	vfs_unbusy(mp, true, NULL);
760	(void) VFS_STATVFS(mp, &mp->mnt_stat);
761	error = VFS_START(mp, 0);
762       if (error) {
763		vrele(vp);
764       } else if (flags & MNT_EXTATTR) {
765	       error = VFS_EXTATTRCTL(vp->v_mountedhere,
766		   EXTATTR_CMD_START, NULL, 0, NULL);
767	       if (error)
768		       printf("%s: failed to start extattr: error = %d\n",
769			   vp->v_mountedhere->mnt_stat.f_mntonname, error);
770       }
771	/* Drop reference held for VFS_START(). */
772	vfs_destroy(mp);
773	*vpp = NULL;
774	return error;
775
776err_onmountlist:
777	mutex_enter(&mountlist_lock);
778	CIRCLEQ_REMOVE(&mountlist, mp, mnt_list);
779	mp->mnt_iflag |= IMNT_GONE;
780	mutex_exit(&mountlist_lock);
781
782err_mounted:
783	if (VFS_UNMOUNT(mp, MNT_FORCE) != 0)
784		panic("Unmounting fresh file system failed");
785
786err_unmounted:
787	vp->v_mountedhere = NULL;
788	mutex_exit(&mp->mnt_updating);
789	vfs_unbusy(mp, false, NULL);
790	vfs_destroy(mp);
791
792	return error;
793}
794
/*
 * Do the actual file system unmount.  File system is assumed to have
 * been locked by the caller.
 *
 * => Caller hold reference to the mount, explicitly for dounmount().
 * => flags are passed to VFS_UNMOUNT(); with MNT_FORCE the unmount is
 *    attempted even if the preceding VFS_SYNC() failed.
 * => Returns ENOENT if the mount is already gone, the error from
 *    VFS_SYNC()/VFS_UNMOUNT() if the unmount fails (the mount is then
 *    restored to service), or 0 on success.
 */
int
dounmount(struct mount *mp, int flags, struct lwp *l)
{
	vnode_t *coveredvp;
	int error, async, used_syncer;

#if NVERIEXEC > 0
	error = veriexec_unmountchk(mp);
	if (error)
		return (error);
#endif /* NVERIEXEC > 0 */

	/*
	 * XXX Freeze syncer.  Must do this before locking the
	 * mount point.  See the comment further down for details.
	 */
	mutex_enter(&syncer_mutex);
	/* Taking the lock as writer makes vfs_busy() fail with EBUSY. */
	rw_enter(&mp->mnt_unmounting, RW_WRITER);
	if ((mp->mnt_iflag & IMNT_GONE) != 0) {
		rw_exit(&mp->mnt_unmounting);
		mutex_exit(&syncer_mutex);
		return ENOENT;
	}

	used_syncer = (mp->mnt_syncer != NULL);

	/*
	 * XXX Syncer must be frozen when we get here.  This should really
	 * be done on a per-mountpoint basis, but the syncer doesn't work
	 * like that.
	 *
	 * The caller of dounmount() must acquire syncer_mutex because
	 * the syncer itself acquires locks in syncer_mutex -> vfs_busy
	 * order, and we must preserve that order to avoid deadlock.
	 *
	 * So, if the file system did not use the syncer, now is
	 * the time to release the syncer_mutex.
	 */
	if (used_syncer == 0) {
		mutex_exit(&syncer_mutex);
	}
	mp->mnt_iflag |= IMNT_UNMOUNT;
	/* Remember MNT_ASYNC so it can be restored if the unmount fails. */
	async = mp->mnt_flag & MNT_ASYNC;
	mp->mnt_flag &= ~MNT_ASYNC;
	cache_purgevfs(mp);	/* remove cache entries for this file sys */
	if (mp->mnt_syncer != NULL)
		vfs_deallocate_syncvnode(mp);
	error = 0;
	if ((mp->mnt_flag & MNT_RDONLY) == 0) {
		error = VFS_SYNC(mp, MNT_WAIT, l->l_cred);
	}
	vfs_scrubvnlist(mp);
	if (error == 0 || (flags & MNT_FORCE)) {
		error = VFS_UNMOUNT(mp, flags);
	}
	if (error) {
		/* Unmount failed: put the mount back into service. */
		if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
			(void) vfs_allocate_syncvnode(mp);
		mp->mnt_iflag &= ~IMNT_UNMOUNT;
		mp->mnt_flag |= async;
		rw_exit(&mp->mnt_unmounting);
		if (used_syncer)
			mutex_exit(&syncer_mutex);
		return (error);
	}
	vfs_scrubvnlist(mp);
	/* Detach from the covered vnode and from the mount list. */
	mutex_enter(&mountlist_lock);
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP)
		coveredvp->v_mountedhere = NULL;
	CIRCLEQ_REMOVE(&mountlist, mp, mnt_list);
	mp->mnt_iflag |= IMNT_GONE;
	mutex_exit(&mountlist_lock);
	if (TAILQ_FIRST(&mp->mnt_vnodelist) != NULL)
		panic("unmount: dangling vnode");
	if (used_syncer)
		mutex_exit(&syncer_mutex);
	vfs_hooks_unmount(mp);
	rw_exit(&mp->mnt_unmounting);
	vfs_destroy(mp);	/* reference from mount() */
	if (coveredvp != NULLVP) {
		vrele(coveredvp);
	}
	return (0);
}
885
886/*
887 * Unmount all file systems.
888 * We traverse the list in reverse order under the assumption that doing so
889 * will avoid needing to worry about dependencies.
890 */
891bool
892vfs_unmountall(struct lwp *l)
893{
894
895	printf("unmounting file systems...");
896	return vfs_unmountall1(l, true, true);
897}
898
899static void
900vfs_unmount_print(struct mount *mp, const char *pfx)
901{
902
903	aprint_verbose("%sunmounted %s on %s type %s\n", pfx,
904	    mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname,
905	    mp->mnt_stat.f_fstypename);
906}
907
908bool
909vfs_unmount_forceone(struct lwp *l)
910{
911	struct mount *mp, *nmp;
912	int error;
913
914	nmp = NULL;
915
916	CIRCLEQ_FOREACH_REVERSE(mp, &mountlist, mnt_list) {
917		if (nmp == NULL || mp->mnt_gen > nmp->mnt_gen) {
918			nmp = mp;
919		}
920	}
921	if (nmp == NULL) {
922		return false;
923	}
924
925#ifdef DEBUG
926	printf("\nforcefully unmounting %s (%s)...",
927	    nmp->mnt_stat.f_mntonname, nmp->mnt_stat.f_mntfromname);
928#endif
929	atomic_inc_uint(&nmp->mnt_refcnt);
930	if ((error = dounmount(nmp, MNT_FORCE, l)) == 0) {
931		vfs_unmount_print(nmp, "forcefully ");
932		return true;
933	} else {
934		vfs_destroy(nmp);
935	}
936
937#ifdef DEBUG
938	printf("forceful unmount of %s failed with error %d\n",
939	    nmp->mnt_stat.f_mntonname, error);
940#endif
941
942	return false;
943}
944
945bool
946vfs_unmountall1(struct lwp *l, bool force, bool verbose)
947{
948	struct mount *mp, *nmp;
949	bool any_error = false, progress = false;
950	int error;
951
952	for (mp = CIRCLEQ_LAST(&mountlist);
953	     mp != (void *)&mountlist;
954	     mp = nmp) {
955		nmp = CIRCLEQ_PREV(mp, mnt_list);
956#ifdef DEBUG
957		printf("\nunmounting %p %s (%s)...",
958		    (void *)mp, mp->mnt_stat.f_mntonname,
959		    mp->mnt_stat.f_mntfromname);
960#endif
961		atomic_inc_uint(&mp->mnt_refcnt);
962		if ((error = dounmount(mp, force ? MNT_FORCE : 0, l)) == 0) {
963			vfs_unmount_print(mp, "");
964			progress = true;
965		} else {
966			vfs_destroy(mp);
967			if (verbose) {
968				printf("unmount of %s failed with error %d\n",
969				    mp->mnt_stat.f_mntonname, error);
970			}
971			any_error = true;
972		}
973	}
974	if (verbose) {
975		printf(" done\n");
976	}
977	if (any_error && verbose) {
978		printf("WARNING: some file systems would not unmount\n");
979	}
980	return progress;
981}
982
983void
984vfs_sync_all(struct lwp *l)
985{
986	printf("syncing disks... ");
987
988	/* remove user processes from run queue */
989	suspendsched();
990	(void)spl0();
991
992	/* avoid coming back this way again if we panic. */
993	doing_shutdown = 1;
994
995	do_sys_sync(l);
996
997	/* Wait for sync to finish. */
998	if (buf_syncwait() != 0) {
999#if defined(DDB) && defined(DEBUG_HALT_BUSY)
1000		Debugger();
1001#endif
1002		printf("giving up\n");
1003		return;
1004	} else
1005		printf("done\n");
1006}
1007
1008/*
1009 * Sync and unmount file systems before shutting down.
1010 */
1011void
1012vfs_shutdown(void)
1013{
1014	lwp_t *l = curlwp;
1015
1016	vfs_sync_all(l);
1017
1018	/*
1019	 * If we have paniced - do not make the situation potentially
1020	 * worse by unmounting the file systems.
1021	 */
1022	if (panicstr != NULL) {
1023		return;
1024	}
1025
1026	/* Unmount file systems. */
1027	vfs_unmountall(l);
1028}
1029
1030/*
1031 * Print a list of supported file system types (used by vfs_mountroot)
1032 */
1033static void
1034vfs_print_fstypes(void)
1035{
1036	struct vfsops *v;
1037	int cnt = 0;
1038
1039	mutex_enter(&vfs_list_lock);
1040	LIST_FOREACH(v, &vfs_list, vfs_list)
1041		++cnt;
1042	mutex_exit(&vfs_list_lock);
1043
1044	if (cnt == 0) {
1045		printf("WARNING: No file system modules have been loaded.\n");
1046		return;
1047	}
1048
1049	printf("Supported file systems:");
1050	mutex_enter(&vfs_list_lock);
1051	LIST_FOREACH(v, &vfs_list, vfs_list) {
1052		printf(" %s", v->vfs_name);
1053	}
1054	mutex_exit(&vfs_list_lock);
1055	printf("\n");
1056}
1057
/*
 * Mount the root file system.  If the operator didn't specify a
 * file system to use, try all possible file systems until one
 * succeeds.
 *
 * => Panics if root_device is unknown or inconsistent with rootdev.
 * => For a disk root device, the device vnode rootvp is obtained and
 *    opened first; it is closed and released again if mounting fails.
 * => On success, rootvnode is set and installed as the current directory
 *    of cwdi0 and of initproc, and module loading from the file system
 *    is enabled.
 * => Returns 0 on success or an errno value.
 */
int
vfs_mountroot(void)
{
	struct vfsops *v;
	int error = ENODEV;

	if (root_device == NULL)
		panic("vfs_mountroot: root device unknown");

	/* Sanity-check rootdev against the class of the root device. */
	switch (device_class(root_device)) {
	case DV_IFNET:
		if (rootdev != NODEV)
			panic("vfs_mountroot: rootdev set for DV_IFNET "
			    "(0x%llx -> %llu,%llu)",
			    (unsigned long long)rootdev,
			    (unsigned long long)major(rootdev),
			    (unsigned long long)minor(rootdev));
		break;

	case DV_DISK:
		if (rootdev == NODEV)
			panic("vfs_mountroot: rootdev not set for DV_DISK");
	        if (bdevvp(rootdev, &rootvp))
	                panic("vfs_mountroot: can't get vnode for rootdev");
		error = VOP_OPEN(rootvp, FREAD, FSCRED);
		if (error) {
			printf("vfs_mountroot: can't open root device\n");
			return (error);
		}
		break;

	case DV_VIRTUAL:
		break;

	default:
		printf("%s: inappropriate for root file system\n",
		    device_xname(root_device));
		return (ENODEV);
	}

	/*
	 * If user specified a root fs type, use it.  Make sure the
	 * specified type exists and has a mount_root()
	 */
	if (strcmp(rootfstype, ROOT_FSTYPE_ANY) != 0) {
		v = vfs_getopsbyname(rootfstype);
		error = EFTYPE;
		if (v != NULL) {
			if (v->vfs_mountroot != NULL) {
				error = (v->vfs_mountroot)();
			}
			/*
			 * NOTE(review): presumably this undoes a reference
			 * taken by vfs_getopsbyname() -- confirm.
			 */
			v->vfs_refcount--;
		}
		goto done;
	}

	/*
	 * Try each file system currently configured into the kernel.
	 */
	mutex_enter(&vfs_list_lock);
	LIST_FOREACH(v, &vfs_list, vfs_list) {
		if (v->vfs_mountroot == NULL)
			continue;
#ifdef DEBUG
		aprint_normal("mountroot: trying %s...\n", v->vfs_name);
#endif
		/* Hold v while vfs_list_lock is dropped for the attempt. */
		v->vfs_refcount++;
		mutex_exit(&vfs_list_lock);
		error = (*v->vfs_mountroot)();
		mutex_enter(&vfs_list_lock);
		v->vfs_refcount--;
		if (!error) {
			aprint_normal("root file system type: %s\n",
			    v->vfs_name);
			break;
		}
	}
	mutex_exit(&vfs_list_lock);

	/* v is NULL only if the loop ran off the end without a success. */
	if (v == NULL) {
		vfs_print_fstypes();
		printf("no file system for %s", device_xname(root_device));
		if (device_class(root_device) == DV_DISK)
			printf(" (dev 0x%llx)", (unsigned long long)rootdev);
		printf("\n");
		error = EFTYPE;
	}

done:
	/* On failure, undo the VOP_OPEN() of the root disk device. */
	if (error && device_class(root_device) == DV_DISK) {
		VOP_CLOSE(rootvp, FREAD, FSCRED);
		vrele(rootvp);
	}
	if (error == 0) {
		extern struct cwdinfo cwdi0;

		/* The first entry on the mount list is the root mount. */
		CIRCLEQ_FIRST(&mountlist)->mnt_flag |= MNT_ROOTFS;
		CIRCLEQ_FIRST(&mountlist)->mnt_op->vfs_refcount++;

		/*
		 * Get the vnode for '/'.  Set cwdi0.cwdi_cdir to
		 * reference it.
		 */
		error = VFS_ROOT(CIRCLEQ_FIRST(&mountlist), &rootvnode);
		if (error)
			panic("cannot find root vnode, error=%d", error);
		cwdi0.cwdi_cdir = rootvnode;
		vref(cwdi0.cwdi_cdir);
		VOP_UNLOCK(rootvnode);
		cwdi0.cwdi_rdir = NULL;

		/*
		 * Now that root is mounted, we can fixup initproc's CWD
		 * info.  All other processes are kthreads, which merely
		 * share proc0's CWD info.
		 */
		initproc->p_cwdi->cwdi_cdir = rootvnode;
		vref(initproc->p_cwdi->cwdi_cdir);
		initproc->p_cwdi->cwdi_rdir = NULL;
		/*
		 * Enable loading of modules from the filesystem
		 */
		module_load_vfs_init();

	}
	return (error);
}
1190
1191/*
1192 * mount_specific_key_create --
1193 *	Create a key for subsystem mount-specific data.
1194 */
1195int
1196mount_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
1197{
1198
1199	return specificdata_key_create(mount_specificdata_domain, keyp, dtor);
1200}
1201
1202/*
1203 * mount_specific_key_delete --
1204 *	Delete a key for subsystem mount-specific data.
1205 */
1206void
1207mount_specific_key_delete(specificdata_key_t key)
1208{
1209
1210	specificdata_key_delete(mount_specificdata_domain, key);
1211}
1212
1213/*
1214 * mount_initspecific --
1215 *	Initialize a mount's specificdata container.
1216 */
1217void
1218mount_initspecific(struct mount *mp)
1219{
1220	int error;
1221
1222	error = specificdata_init(mount_specificdata_domain,
1223				  &mp->mnt_specdataref);
1224	KASSERT(error == 0);
1225}
1226
1227/*
1228 * mount_finispecific --
1229 *	Finalize a mount's specificdata container.
1230 */
1231void
1232mount_finispecific(struct mount *mp)
1233{
1234
1235	specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
1236}
1237
1238/*
1239 * mount_getspecific --
1240 *	Return mount-specific data corresponding to the specified key.
1241 */
1242void *
1243mount_getspecific(struct mount *mp, specificdata_key_t key)
1244{
1245
1246	return specificdata_getspecific(mount_specificdata_domain,
1247					 &mp->mnt_specdataref, key);
1248}
1249
1250/*
1251 * mount_setspecific --
1252 *	Set mount-specific data corresponding to the specified key.
1253 */
1254void
1255mount_setspecific(struct mount *mp, specificdata_key_t key, void *data)
1256{
1257
1258	specificdata_setspecific(mount_specificdata_domain,
1259				 &mp->mnt_specdataref, key, data);
1260}
1261
1262/*
1263 * Check to see if a filesystem is mounted on a block device.
1264 */
1265int
1266vfs_mountedon(vnode_t *vp)
1267{
1268	vnode_t *vq;
1269	int error = 0;
1270
1271	if (vp->v_type != VBLK)
1272		return ENOTBLK;
1273	if (vp->v_specmountpoint != NULL)
1274		return (EBUSY);
1275	mutex_enter(&device_lock);
1276	for (vq = specfs_hash[SPECHASH(vp->v_rdev)]; vq != NULL;
1277	    vq = vq->v_specnext) {
1278		if (vq->v_type != vp->v_type || vq->v_rdev != vp->v_rdev)
1279			continue;
1280		if (vq->v_specmountpoint != NULL) {
1281			error = EBUSY;
1282			break;
1283		}
1284	}
1285	mutex_exit(&device_lock);
1286	return (error);
1287}
1288
1289/*
1290 * Check if a device pointed to by vp is mounted.
1291 *
1292 * Returns:
1293 *   EINVAL	if it's not a disk
1294 *   EBUSY	if it's a disk and mounted
1295 *   0		if it's a disk and not mounted
1296 */
1297int
1298rawdev_mounted(vnode_t *vp, vnode_t **bvpp)
1299{
1300	vnode_t *bvp;
1301	dev_t dev;
1302	int d_type;
1303
1304	bvp = NULL;
1305	d_type = D_OTHER;
1306
1307	if (iskmemvp(vp))
1308		return EINVAL;
1309
1310	switch (vp->v_type) {
1311	case VCHR: {
1312		const struct cdevsw *cdev;
1313
1314		dev = vp->v_rdev;
1315		cdev = cdevsw_lookup(dev);
1316		if (cdev != NULL) {
1317			dev_t blkdev;
1318
1319			blkdev = devsw_chr2blk(dev);
1320			if (blkdev != NODEV) {
1321				if (vfinddev(blkdev, VBLK, &bvp) != 0) {
1322					d_type = (cdev->d_flag & D_TYPEMASK);
1323					/* XXX: what if bvp disappears? */
1324					vrele(bvp);
1325				}
1326			}
1327		}
1328
1329		break;
1330		}
1331
1332	case VBLK: {
1333		const struct bdevsw *bdev;
1334
1335		dev = vp->v_rdev;
1336		bdev = bdevsw_lookup(dev);
1337		if (bdev != NULL)
1338			d_type = (bdev->d_flag & D_TYPEMASK);
1339
1340		bvp = vp;
1341
1342		break;
1343		}
1344
1345	default:
1346		break;
1347	}
1348
1349	if (d_type != D_DISK)
1350		return EINVAL;
1351
1352	if (bvpp != NULL)
1353		*bvpp = bvp;
1354
1355	/*
1356	 * XXX: This is bogus. We should be failing the request
1357	 * XXX: not only if this specific slice is mounted, but
1358	 * XXX: if it's on a disk with any other mounted slice.
1359	 */
1360	if (vfs_mountedon(bvp))
1361		return EBUSY;
1362
1363	return 0;
1364}
1365
1366/*
1367 * Make a 'unique' number from a mount type name.
1368 */
1369long
1370makefstype(const char *type)
1371{
1372	long rv;
1373
1374	for (rv = 0; *type; type++) {
1375		rv <<= 2;
1376		rv ^= *type;
1377	}
1378	return rv;
1379}
1380