vfs_mount.c revision 179670
1/*-
2 * Copyright (c) 1999-2004 Poul-Henning Kamp
3 * Copyright (c) 1999 Michael Smith
4 * Copyright (c) 1989, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 * (c) UNIX System Laboratories, Inc.
7 * All or some portions of this file are derived from material licensed
8 * to the University of California by American Telephone and Telegraph
9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 * the permission of UNIX System Laboratories, Inc.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 *    notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 *    notice, this list of conditions and the following disclaimer in the
19 *    documentation and/or other materials provided with the distribution.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: head/sys/kern/vfs_mount.c 179670 2008-06-09 10:31:38Z kib $");
39
40#include <sys/param.h>
41#include <sys/conf.h>
42#include <sys/fcntl.h>
43#include <sys/jail.h>
44#include <sys/kernel.h>
45#include <sys/libkern.h>
46#include <sys/malloc.h>
47#include <sys/mount.h>
48#include <sys/mutex.h>
49#include <sys/namei.h>
50#include <sys/priv.h>
51#include <sys/proc.h>
52#include <sys/filedesc.h>
53#include <sys/reboot.h>
54#include <sys/syscallsubr.h>
55#include <sys/sysproto.h>
56#include <sys/sx.h>
57#include <sys/sysctl.h>
58#include <sys/sysent.h>
59#include <sys/systm.h>
60#include <sys/vnode.h>
61#include <vm/uma.h>
62
63#include <geom/geom.h>
64
65#include <machine/stdarg.h>
66
67#include <security/audit/audit.h>
68#include <security/mac/mac_framework.h>
69
70#include "opt_rootdevname.h"
71#include "opt_mac.h"
72
/* Name used for the root device. */
#define	ROOTNAME		"root_device"
/* Cap, in bytes, on the memory one set of mount options may consume. */
#define	VFS_MOUNTARG_SIZE_MAX	(64 * 1024)
75
/* Forward declarations of helpers private to this file. */
static void	free_mntarg(struct mntarg *ma);
static int	vfs_domount(struct thread *td, const char *fstype, char *fspath,
		    int fsflags, void *fsdata);
static int	vfs_donmount(struct thread *td, int fsflags,
		    struct uio *fsoptions);
static int	vfs_getopt_pos(struct vfsoptlist *opts, const char *name);
static int	vfs_mountroot_ask(void);
static int	vfs_mountroot_try(const char *mountfrom);
84
85static int	usermount = 0;
86SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
87    "Unprivileged users may mount and unmount file systems");
88
89MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
90MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
91static uma_zone_t mount_zone;
92
93/* List of mounted filesystems. */
94struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
95
96/* For any iteration/modification of mountlist */
97struct mtx mountlist_mtx;
98MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF);
99
100TAILQ_HEAD(vfsoptlist, vfsopt);
101struct vfsopt {
102	TAILQ_ENTRY(vfsopt) link;
103	char	*name;
104	void	*value;
105	int	len;
106};
107
/*
 * Vnode of the system-wide root directory, i.e. "/" as seen by a
 * process that is not chrooted.
 */
struct vnode	*rootvnode;
113
114/*
115 * The root filesystem is detailed in the kernel environment variable
116 * vfs.root.mountfrom, which is expected to be in the general format
117 *
118 * <vfsname>:[<path>]
119 * vfsname   := the name of a VFS known to the kernel and capable
120 *              of being mounted as root
121 * path      := disk device name or other data used by the filesystem
122 *              to locate its physical store
123 */
124
/*
 * Option names that every filesystem accepts.  The table is
 * NULL-terminated.
 */
static const char *global_opts[] = {
	"errmsg", "fstype", "fspath",
	"ro", "rw",
	"nosuid", "noexec",
	"update",
	NULL
};
139
/*
 * Root specifiers, in "<vfsname>:<path>" form, to be tried in order
 * when RB_CDROM is specified.  The table is NULL-terminated.
 */
static char *cdrom_rootdevnames[] = {
	"cd9660:cd0", "cd9660:acd0",
	NULL
};
148
/*
 * Legacy find-root support: two root device name slots, both empty
 * until something fills them in, plus the compile-time root device
 * name (NULL unless the kernel configuration defines ROOTDEVNAME).
 */
char		*rootdevnames[2] = { NULL, NULL };
#ifndef ROOTDEVNAME
#  define ROOTDEVNAME NULL
#endif
static const char	*ctrootdevname = ROOTDEVNAME;
155
156/*
157 * ---------------------------------------------------------------------
158 * Functions for building and sanitizing the mount options
159 */
160
161/* Remove one mount option. */
162static void
163vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt)
164{
165
166	TAILQ_REMOVE(opts, opt, link);
167	free(opt->name, M_MOUNT);
168	if (opt->value != NULL)
169		free(opt->value, M_MOUNT);
170#ifdef INVARIANTS
171	else if (opt->len != 0)
172		panic("%s: mount option with NULL value but length != 0",
173		    __func__);
174#endif
175	free(opt, M_MOUNT);
176}
177
178/* Release all resources related to the mount options. */
179void
180vfs_freeopts(struct vfsoptlist *opts)
181{
182	struct vfsopt *opt;
183
184	while (!TAILQ_EMPTY(opts)) {
185		opt = TAILQ_FIRST(opts);
186		vfs_freeopt(opts, opt);
187	}
188	free(opts, M_MOUNT);
189}
190
191void
192vfs_deleteopt(struct vfsoptlist *opts, const char *name)
193{
194	struct vfsopt *opt, *temp;
195
196	TAILQ_FOREACH_SAFE(opt, opts, link, temp)  {
197		if (strcmp(opt->name, name) == 0)
198			vfs_freeopt(opts, opt);
199	}
200}
201
/*
 * Report whether two option names denote the same option, treating a
 * leading "no" on either side as insignificant: "foo" matches "foo",
 * "nofoo" matches "foo", and "foo" matches "nofoo".
 *
 * Returns 1 on a match and 0 otherwise.
 */
static int
vfs_equalopts(const char *opt1, const char *opt2)
{
	int negated1, negated2;

	negated1 = (strncmp(opt1, "no", 2) == 0);
	negated2 = (strncmp(opt2, "no", 2) == 0);

	/*
	 * The "+ 2" offsets are only evaluated once the corresponding
	 * string is known to begin with "no", so they stay in bounds.
	 */
	return (strcmp(opt1, opt2) == 0 ||
	    (negated1 && strcmp(opt1 + 2, opt2) == 0) ||
	    (negated2 && strcmp(opt1, opt2 + 2) == 0));
}
220
221/*
222 * If a mount option is specified several times,
223 * (with or without the "no" prefix) only keep
224 * the last occurence of it.
225 */
226static void
227vfs_sanitizeopts(struct vfsoptlist *opts)
228{
229	struct vfsopt *opt, *opt2, *tmp;
230
231	TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) {
232		opt2 = TAILQ_PREV(opt, vfsoptlist, link);
233		while (opt2 != NULL) {
234			if (vfs_equalopts(opt->name, opt2->name)) {
235				tmp = TAILQ_PREV(opt2, vfsoptlist, link);
236				vfs_freeopt(opts, opt2);
237				opt2 = tmp;
238			} else {
239				opt2 = TAILQ_PREV(opt2, vfsoptlist, link);
240			}
241		}
242	}
243}
244
245/*
246 * Build a linked list of mount options from a struct uio.
247 */
248static int
249vfs_buildopts(struct uio *auio, struct vfsoptlist **options)
250{
251	struct vfsoptlist *opts;
252	struct vfsopt *opt;
253	size_t memused;
254	unsigned int i, iovcnt;
255	int error, namelen, optlen;
256
257	opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
258	TAILQ_INIT(opts);
259	memused = 0;
260	iovcnt = auio->uio_iovcnt;
261	for (i = 0; i < iovcnt; i += 2) {
262		opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
263		namelen = auio->uio_iov[i].iov_len;
264		optlen = auio->uio_iov[i + 1].iov_len;
265		opt->name = malloc(namelen, M_MOUNT, M_WAITOK);
266		opt->value = NULL;
267		opt->len = 0;
268
269		/*
270		 * Do this early, so jumps to "bad" will free the current
271		 * option.
272		 */
273		TAILQ_INSERT_TAIL(opts, opt, link);
274		memused += sizeof(struct vfsopt) + optlen + namelen;
275
276		/*
277		 * Avoid consuming too much memory, and attempts to overflow
278		 * memused.
279		 */
280		if (memused > VFS_MOUNTARG_SIZE_MAX ||
281		    optlen > VFS_MOUNTARG_SIZE_MAX ||
282		    namelen > VFS_MOUNTARG_SIZE_MAX) {
283			error = EINVAL;
284			goto bad;
285		}
286
287		if (auio->uio_segflg == UIO_SYSSPACE) {
288			bcopy(auio->uio_iov[i].iov_base, opt->name, namelen);
289		} else {
290			error = copyin(auio->uio_iov[i].iov_base, opt->name,
291			    namelen);
292			if (error)
293				goto bad;
294		}
295		/* Ensure names are null-terminated strings. */
296		if (opt->name[namelen - 1] != '\0') {
297			error = EINVAL;
298			goto bad;
299		}
300		if (optlen != 0) {
301			opt->len = optlen;
302			opt->value = malloc(optlen, M_MOUNT, M_WAITOK);
303			if (auio->uio_segflg == UIO_SYSSPACE) {
304				bcopy(auio->uio_iov[i + 1].iov_base, opt->value,
305				    optlen);
306			} else {
307				error = copyin(auio->uio_iov[i + 1].iov_base,
308				    opt->value, optlen);
309				if (error)
310					goto bad;
311			}
312		}
313	}
314	vfs_sanitizeopts(opts);
315	*options = opts;
316	return (0);
317bad:
318	vfs_freeopts(opts);
319	return (error);
320}
321
322/*
323 * Merge the old mount options with the new ones passed
324 * in the MNT_UPDATE case.
325 *
326 * XXX This function will keep a "nofoo" option in the
327 *     new options if there is no matching "foo" option
328 *     to be cancelled in the old options.  This is a bug
329 *     if the option's canonical name is "foo".  E.g., "noro"
330 *     shouldn't end up in the mount point's active options,
331 *     but it can.
332 */
333static void
334vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *opts)
335{
336	struct vfsopt *opt, *opt2, *new;
337
338	TAILQ_FOREACH(opt, opts, link) {
339		/*
340		 * Check that this option hasn't been redefined
341		 * nor cancelled with a "no" mount option.
342		 */
343		opt2 = TAILQ_FIRST(toopts);
344		while (opt2 != NULL) {
345			if (strcmp(opt2->name, opt->name) == 0)
346				goto next;
347			if (strncmp(opt2->name, "no", 2) == 0 &&
348			    strcmp(opt2->name + 2, opt->name) == 0) {
349				vfs_freeopt(toopts, opt2);
350				goto next;
351			}
352			opt2 = TAILQ_NEXT(opt2, link);
353		}
354		/* We want this option, duplicate it. */
355		new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
356		new->name = malloc(strlen(opt->name) + 1, M_MOUNT, M_WAITOK);
357		strcpy(new->name, opt->name);
358		if (opt->len != 0) {
359			new->value = malloc(opt->len, M_MOUNT, M_WAITOK);
360			bcopy(opt->value, new->value, opt->len);
361		} else {
362			new->value = NULL;
363		}
364		new->len = opt->len;
365		TAILQ_INSERT_TAIL(toopts, new, link);
366next:
367		continue;
368	}
369}
370
371/*
372 * Mount a filesystem.
373 */
374int
375nmount(td, uap)
376	struct thread *td;
377	struct nmount_args /* {
378		struct iovec *iovp;
379		unsigned int iovcnt;
380		int flags;
381	} */ *uap;
382{
383	struct uio *auio;
384	struct iovec *iov;
385	unsigned int i;
386	int error;
387	u_int iovcnt;
388
389	AUDIT_ARG(fflags, uap->flags);
390
391	/*
392	 * Filter out MNT_ROOTFS.  We do not want clients of nmount() in
393	 * userspace to set this flag, but we must filter it out if we want
394	 * MNT_UPDATE on the root file system to work.
395	 * MNT_ROOTFS should only be set in the kernel in vfs_mountroot_try().
396	 */
397	uap->flags &= ~MNT_ROOTFS;
398
399	iovcnt = uap->iovcnt;
400	/*
401	 * Check that we have an even number of iovec's
402	 * and that we have at least two options.
403	 */
404	if ((iovcnt & 1) || (iovcnt < 4))
405		return (EINVAL);
406
407	error = copyinuio(uap->iovp, iovcnt, &auio);
408	if (error)
409		return (error);
410	iov = auio->uio_iov;
411	for (i = 0; i < iovcnt; i++) {
412		if (iov->iov_len > MMAXOPTIONLEN) {
413			free(auio, M_IOV);
414			return (EINVAL);
415		}
416		iov++;
417	}
418	error = vfs_donmount(td, uap->flags, auio);
419
420	free(auio, M_IOV);
421	return (error);
422}
423
424/*
425 * ---------------------------------------------------------------------
426 * Various utility functions
427 */
428
/*
 * Take a reference on mount point "mp" via MNT_REF(), holding the
 * mount interlock around the operation.
 */
void
vfs_ref(struct mount *mp)
{

	MNT_ILOCK(mp);
	MNT_REF(mp);
	MNT_IUNLOCK(mp);
}
437
/*
 * Drop a reference on mount point "mp" via MNT_REL(), holding the
 * mount interlock around the operation.
 */
void
vfs_rel(struct mount *mp)
{

	MNT_ILOCK(mp);
	MNT_REL(mp);
	MNT_IUNLOCK(mp);
}
446
447static int
448mount_init(void *mem, int size, int flags)
449{
450	struct mount *mp;
451
452	mp = (struct mount *)mem;
453	mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
454	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
455	lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
456	return (0);
457}
458
459static void
460mount_fini(void *mem, int size)
461{
462	struct mount *mp;
463
464	mp = (struct mount *)mem;
465	lockdestroy(&mp->mnt_explock);
466	lockdestroy(&mp->mnt_lock);
467	mtx_destroy(&mp->mnt_mtx);
468}
469
470/*
471 * Allocate and initialize the mount point struct.
472 */
473struct mount *
474vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp,
475    const char *fspath, struct thread *td)
476{
477	struct mount *mp;
478
479	mp = uma_zalloc(mount_zone, M_WAITOK);
480	bzero(&mp->mnt_startzero,
481	    __rangeof(struct mount, mnt_startzero, mnt_endzero));
482	TAILQ_INIT(&mp->mnt_nvnodelist);
483	mp->mnt_nvnodelistsize = 0;
484	mp->mnt_ref = 0;
485	(void) vfs_busy(mp, LK_NOWAIT, 0, td);
486	mp->mnt_op = vfsp->vfc_vfsops;
487	mp->mnt_vfc = vfsp;
488	vfsp->vfc_refcount++;	/* XXX Unlocked */
489	mp->mnt_stat.f_type = vfsp->vfc_typenum;
490	mp->mnt_gen++;
491	strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
492	mp->mnt_vnodecovered = vp;
493	mp->mnt_cred = crdup(td->td_ucred);
494	mp->mnt_stat.f_owner = td->td_ucred->cr_uid;
495	strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN);
496	mp->mnt_iosize_max = DFLTPHYS;
497#ifdef MAC
498	mac_mount_init(mp);
499	mac_mount_create(td->td_ucred, mp);
500#endif
501	arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
502	return (mp);
503}
504
505/*
506 * Destroy the mount struct previously allocated by vfs_mount_alloc().
507 */
508void
509vfs_mount_destroy(struct mount *mp)
510{
511	int i;
512
513	MNT_ILOCK(mp);
514	for (i = 0; mp->mnt_ref && i < 3; i++)
515		msleep(mp, MNT_MTX(mp), PVFS, "mntref", hz);
516	/*
517	 * This will always cause a 3 second delay in rebooting due to
518	 * refs on the root mountpoint that never go away.  Most of these
519	 * are held by init which never exits.
520	 */
521	if (i == 3 && (!rebooting || bootverbose))
522		printf("Mount point %s had %d dangling refs\n",
523		    mp->mnt_stat.f_mntonname, mp->mnt_ref);
524	if (mp->mnt_holdcnt != 0) {
525		printf("Waiting for mount point to be unheld\n");
526		while (mp->mnt_holdcnt != 0) {
527			mp->mnt_holdcntwaiters++;
528			msleep(&mp->mnt_holdcnt, MNT_MTX(mp),
529			       PZERO, "mntdestroy", 0);
530			mp->mnt_holdcntwaiters--;
531		}
532		printf("mount point unheld\n");
533	}
534	if (mp->mnt_writeopcount > 0) {
535		printf("Waiting for mount point write ops\n");
536		while (mp->mnt_writeopcount > 0) {
537			mp->mnt_kern_flag |= MNTK_SUSPEND;
538			msleep(&mp->mnt_writeopcount,
539			       MNT_MTX(mp),
540			       PZERO, "mntdestroy2", 0);
541		}
542		printf("mount point write ops completed\n");
543	}
544	if (mp->mnt_secondary_writes > 0) {
545		printf("Waiting for mount point secondary write ops\n");
546		while (mp->mnt_secondary_writes > 0) {
547			mp->mnt_kern_flag |= MNTK_SUSPEND;
548			msleep(&mp->mnt_secondary_writes,
549			       MNT_MTX(mp),
550			       PZERO, "mntdestroy3", 0);
551		}
552		printf("mount point secondary write ops completed\n");
553	}
554	MNT_IUNLOCK(mp);
555	mp->mnt_vfc->vfc_refcount--;
556	if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) {
557		struct vnode *vp;
558
559		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes)
560			vprint("", vp);
561		panic("unmount: dangling vnode");
562	}
563	MNT_ILOCK(mp);
564	if (mp->mnt_kern_flag & MNTK_MWAIT)
565		wakeup(mp);
566	if (mp->mnt_writeopcount != 0)
567		panic("vfs_mount_destroy: nonzero writeopcount");
568	if (mp->mnt_secondary_writes != 0)
569		panic("vfs_mount_destroy: nonzero secondary_writes");
570	if (mp->mnt_nvnodelistsize != 0)
571		panic("vfs_mount_destroy: nonzero nvnodelistsize");
572	mp->mnt_writeopcount = -1000;
573	mp->mnt_nvnodelistsize = -1000;
574	mp->mnt_secondary_writes = -1000;
575	MNT_IUNLOCK(mp);
576#ifdef MAC
577	mac_mount_destroy(mp);
578#endif
579	if (mp->mnt_opt != NULL)
580		vfs_freeopts(mp->mnt_opt);
581	crfree(mp->mnt_cred);
582	uma_zfree(mount_zone, mp);
583}
584
585static int
586vfs_donmount(struct thread *td, int fsflags, struct uio *fsoptions)
587{
588	struct vfsoptlist *optlist;
589	struct vfsopt *opt, *noro_opt;
590	char *fstype, *fspath, *errmsg;
591	int error, fstypelen, fspathlen, errmsg_len, errmsg_pos;
592	int has_rw, has_noro;
593
594	errmsg = NULL;
595	errmsg_len = 0;
596	errmsg_pos = -1;
597	has_rw = 0;
598	has_noro = 0;
599
600	error = vfs_buildopts(fsoptions, &optlist);
601	if (error)
602		return (error);
603
604	if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0)
605		errmsg_pos = vfs_getopt_pos(optlist, "errmsg");
606
607	/*
608	 * We need these two options before the others,
609	 * and they are mandatory for any filesystem.
610	 * Ensure they are NUL terminated as well.
611	 */
612	fstypelen = 0;
613	error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen);
614	if (error || fstype[fstypelen - 1] != '\0') {
615		error = EINVAL;
616		if (errmsg != NULL)
617			strncpy(errmsg, "Invalid fstype", errmsg_len);
618		goto bail;
619	}
620	fspathlen = 0;
621	error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen);
622	if (error || fspath[fspathlen - 1] != '\0') {
623		error = EINVAL;
624		if (errmsg != NULL)
625			strncpy(errmsg, "Invalid fspath", errmsg_len);
626		goto bail;
627	}
628
629	/*
630	 * We need to see if we have the "update" option
631	 * before we call vfs_domount(), since vfs_domount() has special
632	 * logic based on MNT_UPDATE.  This is very important
633	 * when we want to update the root filesystem.
634	 */
635	TAILQ_FOREACH(opt, optlist, link) {
636		if (strcmp(opt->name, "update") == 0)
637			fsflags |= MNT_UPDATE;
638		else if (strcmp(opt->name, "async") == 0)
639			fsflags |= MNT_ASYNC;
640		else if (strcmp(opt->name, "force") == 0)
641			fsflags |= MNT_FORCE;
642		else if (strcmp(opt->name, "multilabel") == 0)
643			fsflags |= MNT_MULTILABEL;
644		else if (strcmp(opt->name, "noasync") == 0)
645			fsflags &= ~MNT_ASYNC;
646		else if (strcmp(opt->name, "noatime") == 0)
647			fsflags |= MNT_NOATIME;
648		else if (strcmp(opt->name, "atime") == 0) {
649			free(opt->name, M_MOUNT);
650			opt->name = strdup("nonoatime", M_MOUNT);
651		}
652		else if (strcmp(opt->name, "noclusterr") == 0)
653			fsflags |= MNT_NOCLUSTERR;
654		else if (strcmp(opt->name, "clusterr") == 0) {
655			free(opt->name, M_MOUNT);
656			opt->name = strdup("nonoclusterr", M_MOUNT);
657		}
658		else if (strcmp(opt->name, "noclusterw") == 0)
659			fsflags |= MNT_NOCLUSTERW;
660		else if (strcmp(opt->name, "clusterw") == 0) {
661			free(opt->name, M_MOUNT);
662			opt->name = strdup("nonoclusterw", M_MOUNT);
663		}
664		else if (strcmp(opt->name, "noexec") == 0)
665			fsflags |= MNT_NOEXEC;
666		else if (strcmp(opt->name, "exec") == 0) {
667			free(opt->name, M_MOUNT);
668			opt->name = strdup("nonoexec", M_MOUNT);
669		}
670		else if (strcmp(opt->name, "nosuid") == 0)
671			fsflags |= MNT_NOSUID;
672		else if (strcmp(opt->name, "suid") == 0) {
673			free(opt->name, M_MOUNT);
674			opt->name = strdup("nonosuid", M_MOUNT);
675		}
676		else if (strcmp(opt->name, "nosymfollow") == 0)
677			fsflags |= MNT_NOSYMFOLLOW;
678		else if (strcmp(opt->name, "symfollow") == 0) {
679			free(opt->name, M_MOUNT);
680			opt->name = strdup("nonosymfollow", M_MOUNT);
681		}
682		else if (strcmp(opt->name, "noro") == 0) {
683			fsflags &= ~MNT_RDONLY;
684			has_noro = 1;
685		}
686		else if (strcmp(opt->name, "rw") == 0) {
687			fsflags &= ~MNT_RDONLY;
688			has_rw = 1;
689		}
690		else if (strcmp(opt->name, "ro") == 0)
691			fsflags |= MNT_RDONLY;
692		else if (strcmp(opt->name, "rdonly") == 0) {
693			free(opt->name, M_MOUNT);
694			opt->name = strdup("ro", M_MOUNT);
695			fsflags |= MNT_RDONLY;
696		}
697		else if (strcmp(opt->name, "suiddir") == 0)
698			fsflags |= MNT_SUIDDIR;
699		else if (strcmp(opt->name, "sync") == 0)
700			fsflags |= MNT_SYNCHRONOUS;
701		else if (strcmp(opt->name, "union") == 0)
702			fsflags |= MNT_UNION;
703	}
704
705	/*
706	 * If "rw" was specified as a mount option, and we
707	 * are trying to update a mount-point from "ro" to "rw",
708	 * we need a mount option "noro", since in vfs_mergeopts(),
709	 * "noro" will cancel "ro", but "rw" will not do anything.
710	 */
711	if (has_rw && !has_noro) {
712		noro_opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
713		noro_opt->name = strdup("noro", M_MOUNT);
714		noro_opt->value = NULL;
715		noro_opt->len = 0;
716		TAILQ_INSERT_TAIL(optlist, noro_opt, link);
717	}
718
719	/*
720	 * Be ultra-paranoid about making sure the type and fspath
721	 * variables will fit in our mp buffers, including the
722	 * terminating NUL.
723	 */
724	if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) {
725		error = ENAMETOOLONG;
726		goto bail;
727	}
728
729	mtx_lock(&Giant);
730	error = vfs_domount(td, fstype, fspath, fsflags, optlist);
731	mtx_unlock(&Giant);
732bail:
733	/* copyout the errmsg */
734	if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt)
735	    && errmsg_len > 0 && errmsg != NULL) {
736		if (fsoptions->uio_segflg == UIO_SYSSPACE) {
737			bcopy(errmsg,
738			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
739			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
740		} else {
741			copyout(errmsg,
742			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
743			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
744		}
745	}
746
747	if (error != 0)
748		vfs_freeopts(optlist);
749	return (error);
750}
751
752/*
753 * Old mount API.
754 */
755#ifndef _SYS_SYSPROTO_H_
756struct mount_args {
757	char	*type;
758	char	*path;
759	int	flags;
760	caddr_t	data;
761};
762#endif
763/* ARGSUSED */
764int
765mount(td, uap)
766	struct thread *td;
767	struct mount_args /* {
768		char *type;
769		char *path;
770		int flags;
771		caddr_t data;
772	} */ *uap;
773{
774	char *fstype;
775	struct vfsconf *vfsp = NULL;
776	struct mntarg *ma = NULL;
777	int error;
778
779	AUDIT_ARG(fflags, uap->flags);
780
781	/*
782	 * Filter out MNT_ROOTFS.  We do not want clients of mount() in
783	 * userspace to set this flag, but we must filter it out if we want
784	 * MNT_UPDATE on the root file system to work.
785	 * MNT_ROOTFS should only be set in the kernel in vfs_mountroot_try().
786	 */
787	uap->flags &= ~MNT_ROOTFS;
788
789	fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK);
790	error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL);
791	if (error) {
792		free(fstype, M_TEMP);
793		return (error);
794	}
795
796	AUDIT_ARG(text, fstype);
797	mtx_lock(&Giant);
798	vfsp = vfs_byname_kld(fstype, td, &error);
799	free(fstype, M_TEMP);
800	if (vfsp == NULL) {
801		mtx_unlock(&Giant);
802		return (ENOENT);
803	}
804	if (vfsp->vfc_vfsops->vfs_cmount == NULL) {
805		mtx_unlock(&Giant);
806		return (EOPNOTSUPP);
807	}
808
809	ma = mount_argsu(ma, "fstype", uap->type, MNAMELEN);
810	ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN);
811	ma = mount_argb(ma, uap->flags & MNT_RDONLY, "noro");
812	ma = mount_argb(ma, !(uap->flags & MNT_NOSUID), "nosuid");
813	ma = mount_argb(ma, !(uap->flags & MNT_NOEXEC), "noexec");
814
815	error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, uap->flags, td);
816	mtx_unlock(&Giant);
817	return (error);
818}
819
820
821/*
822 * vfs_domount(): actually attempt a filesystem mount.
823 */
824static int
825vfs_domount(
826	struct thread *td,	/* Calling thread. */
827	const char *fstype,	/* Filesystem type. */
828	char *fspath,		/* Mount path. */
829	int fsflags,		/* Flags common to all filesystems. */
830	void *fsdata		/* Options local to the filesystem. */
831	)
832{
833	struct vnode *vp;
834	struct mount *mp;
835	struct vfsconf *vfsp;
836	struct export_args export;
837	int error, flag = 0;
838	struct vattr va;
839	struct nameidata nd;
840
841	mtx_assert(&Giant, MA_OWNED);
842	/*
843	 * Be ultra-paranoid about making sure the type and fspath
844	 * variables will fit in our mp buffers, including the
845	 * terminating NUL.
846	 */
847	if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
848		return (ENAMETOOLONG);
849
850	if (jailed(td->td_ucred) || usermount == 0) {
851		if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0)
852			return (error);
853	}
854
855	/*
856	 * Do not allow NFS export or MNT_SUIDDIR by unprivileged users.
857	 */
858	if (fsflags & MNT_EXPORTED) {
859		error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED);
860		if (error)
861			return (error);
862	}
863	if (fsflags & MNT_SUIDDIR) {
864		error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR);
865		if (error)
866			return (error);
867	}
868	/*
869	 * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users.
870	 */
871	if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) {
872		if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0)
873			fsflags |= MNT_NOSUID | MNT_USER;
874	}
875
876	/* Load KLDs before we lock the covered vnode to avoid reversals. */
877	vfsp = NULL;
878	if ((fsflags & MNT_UPDATE) == 0) {
879		/* Don't try to load KLDs if we're mounting the root. */
880		if (fsflags & MNT_ROOTFS)
881			vfsp = vfs_byname(fstype);
882		else
883			vfsp = vfs_byname_kld(fstype, td, &error);
884		if (vfsp == NULL)
885			return (ENODEV);
886		if (jailed(td->td_ucred) && !(vfsp->vfc_flags & VFCF_JAIL))
887			return (EPERM);
888	}
889	/*
890	 * Get vnode to be covered
891	 */
892	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE,
893	    fspath, td);
894	if ((error = namei(&nd)) != 0)
895		return (error);
896	NDFREE(&nd, NDF_ONLY_PNBUF);
897	vp = nd.ni_vp;
898	if (fsflags & MNT_UPDATE) {
899		if ((vp->v_vflag & VV_ROOT) == 0) {
900			vput(vp);
901			return (EINVAL);
902		}
903		mp = vp->v_mount;
904		MNT_ILOCK(mp);
905		flag = mp->mnt_flag;
906		/*
907		 * We only allow the filesystem to be reloaded if it
908		 * is currently mounted read-only.
909		 */
910		if ((fsflags & MNT_RELOAD) &&
911		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
912			MNT_IUNLOCK(mp);
913			vput(vp);
914			return (EOPNOTSUPP);	/* Needs translation */
915		}
916		MNT_IUNLOCK(mp);
917		/*
918		 * Only privileged root, or (if MNT_USER is set) the user that
919		 * did the original mount is permitted to update it.
920		 */
921		error = vfs_suser(mp, td);
922		if (error) {
923			vput(vp);
924			return (error);
925		}
926		if (vfs_busy(mp, LK_NOWAIT, 0, td)) {
927			vput(vp);
928			return (EBUSY);
929		}
930		VI_LOCK(vp);
931		if ((vp->v_iflag & VI_MOUNT) != 0 ||
932		    vp->v_mountedhere != NULL) {
933			VI_UNLOCK(vp);
934			vfs_unbusy(mp, td);
935			vput(vp);
936			return (EBUSY);
937		}
938		vp->v_iflag |= VI_MOUNT;
939		VI_UNLOCK(vp);
940		MNT_ILOCK(mp);
941		mp->mnt_flag |= fsflags &
942		    (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT | MNT_ROOTFS);
943		MNT_IUNLOCK(mp);
944		VOP_UNLOCK(vp, 0);
945		mp->mnt_optnew = fsdata;
946		vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
947	} else {
948		/*
949		 * If the user is not root, ensure that they own the directory
950		 * onto which we are attempting to mount.
951		 */
952		error = VOP_GETATTR(vp, &va, td->td_ucred, td);
953		if (error) {
954			vput(vp);
955			return (error);
956		}
957		if (va.va_uid != td->td_ucred->cr_uid) {
958			error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN,
959			    0);
960			if (error) {
961				vput(vp);
962				return (error);
963			}
964		}
965		error = vinvalbuf(vp, V_SAVE, td, 0, 0);
966		if (error != 0) {
967			vput(vp);
968			return (error);
969		}
970		if (vp->v_type != VDIR) {
971			vput(vp);
972			return (ENOTDIR);
973		}
974		VI_LOCK(vp);
975		if ((vp->v_iflag & VI_MOUNT) != 0 ||
976		    vp->v_mountedhere != NULL) {
977			VI_UNLOCK(vp);
978			vput(vp);
979			return (EBUSY);
980		}
981		vp->v_iflag |= VI_MOUNT;
982		VI_UNLOCK(vp);
983
984		/*
985		 * Allocate and initialize the filesystem.
986		 */
987		mp = vfs_mount_alloc(vp, vfsp, fspath, td);
988		VOP_UNLOCK(vp, 0);
989
990		/* XXXMAC: pass to vfs_mount_alloc? */
991		mp->mnt_optnew = fsdata;
992	}
993
994	/*
995	 * Set the mount level flags.
996	 */
997	MNT_ILOCK(mp);
998	mp->mnt_flag = (mp->mnt_flag & ~MNT_UPDATEMASK) |
999		(fsflags & (MNT_UPDATEMASK | MNT_FORCE | MNT_ROOTFS |
1000			    MNT_RDONLY));
1001	if ((mp->mnt_flag & MNT_ASYNC) == 0)
1002		mp->mnt_kern_flag &= ~MNTK_ASYNC;
1003	MNT_IUNLOCK(mp);
1004	/*
1005	 * Mount the filesystem.
1006	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
1007	 * get.  No freeing of cn_pnbuf.
1008	 */
1009        error = VFS_MOUNT(mp, td);
1010
1011	/*
1012	 * Process the export option only if we are
1013	 * updating mount options.
1014	 */
1015	if (!error && (fsflags & MNT_UPDATE)) {
1016		if (vfs_copyopt(mp->mnt_optnew, "export", &export,
1017		    sizeof(export)) == 0)
1018			error = vfs_export(mp, &export);
1019	}
1020
1021	if (!error) {
1022		if (mp->mnt_opt != NULL)
1023			vfs_freeopts(mp->mnt_opt);
1024		mp->mnt_opt = mp->mnt_optnew;
1025		(void)VFS_STATFS(mp, &mp->mnt_stat, td);
1026	}
1027	/*
1028	 * Prevent external consumers of mount options from reading
1029	 * mnt_optnew.
1030	*/
1031	mp->mnt_optnew = NULL;
1032	if (mp->mnt_flag & MNT_UPDATE) {
1033		MNT_ILOCK(mp);
1034		if (error)
1035			mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) |
1036				(flag & ~MNT_QUOTA);
1037		else
1038			mp->mnt_flag &=	~(MNT_UPDATE | MNT_RELOAD |
1039					  MNT_FORCE | MNT_SNAPSHOT);
1040		if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
1041			mp->mnt_kern_flag |= MNTK_ASYNC;
1042		else
1043			mp->mnt_kern_flag &= ~MNTK_ASYNC;
1044		MNT_IUNLOCK(mp);
1045		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1046			if (mp->mnt_syncer == NULL)
1047				error = vfs_allocate_syncvnode(mp);
1048		} else {
1049			if (mp->mnt_syncer != NULL)
1050				vrele(mp->mnt_syncer);
1051			mp->mnt_syncer = NULL;
1052		}
1053		vfs_unbusy(mp, td);
1054		VI_LOCK(vp);
1055		vp->v_iflag &= ~VI_MOUNT;
1056		VI_UNLOCK(vp);
1057		vrele(vp);
1058		return (error);
1059	}
1060	MNT_ILOCK(mp);
1061	if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
1062		mp->mnt_kern_flag |= MNTK_ASYNC;
1063	else
1064		mp->mnt_kern_flag &= ~MNTK_ASYNC;
1065	MNT_IUNLOCK(mp);
1066	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1067	/*
1068	 * Put the new filesystem on the mount list after root.
1069	 */
1070	cache_purge(vp);
1071	if (!error) {
1072		struct vnode *newdp;
1073
1074		VI_LOCK(vp);
1075		vp->v_iflag &= ~VI_MOUNT;
1076		VI_UNLOCK(vp);
1077		vp->v_mountedhere = mp;
1078		mtx_lock(&mountlist_mtx);
1079		TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
1080		mtx_unlock(&mountlist_mtx);
1081		vfs_event_signal(NULL, VQ_MOUNT, 0);
1082		if (VFS_ROOT(mp, LK_EXCLUSIVE, &newdp, td))
1083			panic("mount: lost mount");
1084		mountcheckdirs(vp, newdp);
1085		vput(newdp);
1086		VOP_UNLOCK(vp, 0);
1087		if ((mp->mnt_flag & MNT_RDONLY) == 0)
1088			error = vfs_allocate_syncvnode(mp);
1089		vfs_unbusy(mp, td);
1090		if (error)
1091			vrele(vp);
1092	} else {
1093		VI_LOCK(vp);
1094		vp->v_iflag &= ~VI_MOUNT;
1095		VI_UNLOCK(vp);
1096		vfs_unbusy(mp, td);
1097		vfs_mount_destroy(mp);
1098		vput(vp);
1099	}
1100	return (error);
1101}
1102
1103/*
1104 * Unmount a filesystem.
1105 *
1106 * Note: unmount takes a path to the vnode mounted on as argument, not
1107 * special file (as before).
1108 */
#ifndef _SYS_SYSPROTO_H_
/*
 * Arguments of the unmount() system call.  Declared here only when
 * _SYS_SYSPROTO_H_ is not defined.
 */
struct unmount_args {
	char	*path;		/* copied in with copyinstr(); mount point name, or "FSID:%d:%d" with MNT_BYFSID */
	int	flags;		/* MNT_* flags; MNT_BYFSID selects lookup by filesystem id */
};
#endif
1115/* ARGSUSED */
1116int
1117unmount(td, uap)
1118	struct thread *td;
1119	register struct unmount_args /* {
1120		char *path;
1121		int flags;
1122	} */ *uap;
1123{
1124	struct mount *mp;
1125	char *pathbuf;
1126	int error, id0, id1;
1127
1128	if (jailed(td->td_ucred) || usermount == 0) {
1129		error = priv_check(td, PRIV_VFS_UNMOUNT);
1130		if (error)
1131			return (error);
1132	}
1133
1134	pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
1135	error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL);
1136	if (error) {
1137		free(pathbuf, M_TEMP);
1138		return (error);
1139	}
1140	AUDIT_ARG(upath, td, pathbuf, ARG_UPATH1);
1141	mtx_lock(&Giant);
1142	if (uap->flags & MNT_BYFSID) {
1143		/* Decode the filesystem ID. */
1144		if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) {
1145			mtx_unlock(&Giant);
1146			free(pathbuf, M_TEMP);
1147			return (EINVAL);
1148		}
1149
1150		mtx_lock(&mountlist_mtx);
1151		TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
1152			if (mp->mnt_stat.f_fsid.val[0] == id0 &&
1153			    mp->mnt_stat.f_fsid.val[1] == id1)
1154				break;
1155		}
1156		mtx_unlock(&mountlist_mtx);
1157	} else {
1158		mtx_lock(&mountlist_mtx);
1159		TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
1160			if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0)
1161				break;
1162		}
1163		mtx_unlock(&mountlist_mtx);
1164	}
1165	free(pathbuf, M_TEMP);
1166	if (mp == NULL) {
1167		/*
1168		 * Previously we returned ENOENT for a nonexistent path and
1169		 * EINVAL for a non-mountpoint.  We cannot tell these apart
1170		 * now, so in the !MNT_BYFSID case return the more likely
1171		 * EINVAL for compatibility.
1172		 */
1173		mtx_unlock(&Giant);
1174		return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL);
1175	}
1176
1177	/*
1178	 * Don't allow unmounting the root filesystem.
1179	 */
1180	if (mp->mnt_flag & MNT_ROOTFS) {
1181		mtx_unlock(&Giant);
1182		return (EINVAL);
1183	}
1184	error = dounmount(mp, uap->flags, td);
1185	mtx_unlock(&Giant);
1186	return (error);
1187}
1188
/*
 * Do the actual filesystem unmount.
 *
 * The caller must hold Giant (asserted below).  Of 'flags' only MNT_FORCE
 * is examined here; the full value is passed on to VFS_UNMOUNT().
 *
 * Returns 0 once mp has been taken off the mountlist and destroyed.
 * Returns EBUSY when the mount changed while waiting for the covered
 * vnode lock or when another unmount is already in progress; otherwise
 * the error from vfs_suser(), lockmgr(), VFS_SYNC() or VFS_UNMOUNT().
 * An ENXIO result from the sync/unmount step is not treated as a failure:
 * the teardown proceeds and 0 is returned.
 */
int
dounmount(mp, flags, td)
	struct mount *mp;
	int flags;
	struct thread *td;
{
	struct vnode *coveredvp, *fsrootvp;
	int error;
	int async_flag;
	int mnt_gen_r;

	mtx_assert(&Giant, MA_OWNED);

	if ((coveredvp = mp->mnt_vnodecovered) != NULL) {
		/* Remember the generation so a changed mount can be detected. */
		mnt_gen_r = mp->mnt_gen;
		VI_LOCK(coveredvp);
		/* Hold the vnode across the (possibly sleeping) lock acquisition. */
		vholdl(coveredvp);
		vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY);
		vdrop(coveredvp);
		/*
		 * Check for mp being unmounted while waiting for the
		 * covered vnode lock.
		 */
		if (coveredvp->v_mountedhere != mp ||
		    coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) {
			VOP_UNLOCK(coveredvp, 0);
			return (EBUSY);
		}
	}
	/*
	 * Only privileged root, or (if MNT_USER is set) the user that did the
	 * original mount is permitted to unmount this filesystem.
	 */
	error = vfs_suser(mp, td);
	if (error) {
		if (coveredvp)
			VOP_UNLOCK(coveredvp, 0);
		return (error);
	}

	MNT_ILOCK(mp);
	/* Only one unmount of a given mount may be in progress. */
	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		MNT_IUNLOCK(mp);
		if (coveredvp)
			VOP_UNLOCK(coveredvp, 0);
		return (EBUSY);
	}
	mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_NOINSMNTQ;
	/* Allow filesystems to detect that a forced unmount is in progress. */
	if (flags & MNT_FORCE)
		mp->mnt_kern_flag |= MNTK_UNMOUNTF;
	/*
	 * Drain mnt_lock; LK_INTERLOCK hands the mount interlock over to
	 * lockmgr().  A non-forced unmount does not wait (LK_NOWAIT).
	 */
	error = lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK |
	    ((flags & MNT_FORCE) ? 0 : LK_NOWAIT), MNT_MTX(mp));
	if (error) {
		/* Could not drain: clear the unmount flags and wake any waiters. */
		MNT_ILOCK(mp);
		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_NOINSMNTQ |
		    MNTK_UNMOUNTF);
		if (mp->mnt_kern_flag & MNTK_MWAIT)
			wakeup(mp);
		MNT_IUNLOCK(mp);
		if (coveredvp)
			VOP_UNLOCK(coveredvp, 0);
		return (error);
	}
	vn_start_write(NULL, &mp, V_WAIT);

	if (mp->mnt_flag & MNT_EXPUBLIC)
		vfs_setpublicfs(NULL, NULL, NULL);

	vfs_msync(mp, MNT_WAIT);
	MNT_ILOCK(mp);
	/* Save the async setting so it can be restored if the unmount fails. */
	async_flag = mp->mnt_flag & MNT_ASYNC;
	mp->mnt_flag &= ~MNT_ASYNC;
	mp->mnt_kern_flag &= ~MNTK_ASYNC;
	MNT_IUNLOCK(mp);
	cache_purgevfs(mp);	/* remove cache entries for this file sys */
	if (mp->mnt_syncer != NULL)
		vrele(mp->mnt_syncer);
	/*
	 * For forced unmounts, move process cdir/rdir refs on the fs root
	 * vnode to the covered vnode.  For non-forced unmounts we want
	 * such references to cause an EBUSY error.
	 */
	if ((flags & MNT_FORCE) &&
	    VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp, td) == 0) {
		if (mp->mnt_vnodecovered != NULL)
			mountcheckdirs(fsrootvp, mp->mnt_vnodecovered);
		if (fsrootvp == rootvnode) {
			vrele(rootvnode);
			rootvnode = NULL;
		}
		vput(fsrootvp);
	}
	/*
	 * Call VFS_UNMOUNT() when the filesystem is read-only, when the
	 * final sync succeeded, or unconditionally for a forced unmount.
	 */
	if (((mp->mnt_flag & MNT_RDONLY) ||
	     (error = VFS_SYNC(mp, MNT_WAIT, td)) == 0) ||
	    (flags & MNT_FORCE)) {
		error = VFS_UNMOUNT(mp, flags, td);
	}
	vn_finished_write(mp);
	/*
	 * If we failed to flush the dirty blocks for this mount point,
	 * undo all the cdir/rdir and rootvnode changes we made above.
	 * Unless we failed to do so because the device is reporting that
	 * it doesn't exist anymore.
	 */
	if (error && error != ENXIO) {
		if ((flags & MNT_FORCE) &&
		    VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp, td) == 0) {
			if (mp->mnt_vnodecovered != NULL)
				mountcheckdirs(mp->mnt_vnodecovered, fsrootvp);
			if (rootvnode == NULL) {
				rootvnode = fsrootvp;
				vref(rootvnode);
			}
			vput(fsrootvp);
		}
		MNT_ILOCK(mp);
		mp->mnt_kern_flag &= ~MNTK_NOINSMNTQ;
		/* Re-create the syncer vnode with the interlock dropped. */
		if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) {
			MNT_IUNLOCK(mp);
			(void) vfs_allocate_syncvnode(mp);
			MNT_ILOCK(mp);
		}
		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
		mp->mnt_flag |= async_flag;
		if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
			mp->mnt_kern_flag |= MNTK_ASYNC;
		lockmgr(&mp->mnt_lock, LK_RELEASE, NULL);
		if (mp->mnt_kern_flag & MNTK_MWAIT)
			wakeup(mp);
		MNT_IUNLOCK(mp);
		if (coveredvp)
			VOP_UNLOCK(coveredvp, 0);
		return (error);
	}
	/* Success (or ENXIO): detach the mount from the system and destroy it. */
	mtx_lock(&mountlist_mtx);
	TAILQ_REMOVE(&mountlist, mp, mnt_list);
	mtx_unlock(&mountlist_mtx);
	if (coveredvp != NULL) {
		coveredvp->v_mountedhere = NULL;
		vput(coveredvp);
	}
	vfs_event_signal(NULL, VQ_UNMOUNT, 0);
	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL);
	vfs_mount_destroy(mp);
	return (0);
}
1339
1340/*
1341 * ---------------------------------------------------------------------
1342 * Mounting of root filesystem
1343 *
1344 */
1345
/*
 * One outstanding hold on the root mount; allocated by root_mount_hold()
 * and freed by root_mount_rel().
 */
struct root_hold_token {
	const char			*who;	/* identifier supplied by the holder */
	LIST_ENTRY(root_hold_token)	list;	/* linkage on root_holds */
};

/* All outstanding holds; the list is modified with mountlist_mtx held. */
static LIST_HEAD(, root_hold_token)	root_holds =
    LIST_HEAD_INITIALIZER(&root_holds);

/* Set to 1 by root_mount_done(); read by root_mounted()/root_mount_wait(). */
static int root_mount_complete;
1355
1356/*
1357 * Hold root mount.
1358 */
1359struct root_hold_token *
1360root_mount_hold(const char *identifier)
1361{
1362	struct root_hold_token *h;
1363
1364	h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK);
1365	h->who = identifier;
1366	mtx_lock(&mountlist_mtx);
1367	LIST_INSERT_HEAD(&root_holds, h, list);
1368	mtx_unlock(&mountlist_mtx);
1369	return (h);
1370}
1371
1372/*
1373 * Release root mount.
1374 */
1375void
1376root_mount_rel(struct root_hold_token *h)
1377{
1378
1379	mtx_lock(&mountlist_mtx);
1380	LIST_REMOVE(h, list);
1381	wakeup(&root_holds);
1382	mtx_unlock(&mountlist_mtx);
1383	free(h, M_DEVBUF);
1384}
1385
1386/*
1387 * Wait for all subsystems to release root mount.
1388 */
1389static void
1390root_mount_prepare(void)
1391{
1392	struct root_hold_token *h;
1393
1394	for (;;) {
1395		DROP_GIANT();
1396		g_waitidle();
1397		PICKUP_GIANT();
1398		mtx_lock(&mountlist_mtx);
1399		if (LIST_EMPTY(&root_holds)) {
1400			mtx_unlock(&mountlist_mtx);
1401			break;
1402		}
1403		printf("Root mount waiting for:");
1404		LIST_FOREACH(h, &root_holds, list)
1405			printf(" %s", h->who);
1406		printf("\n");
1407		msleep(&root_holds, &mountlist_mtx, PZERO | PDROP, "roothold",
1408		    hz);
1409	}
1410}
1411
1412/*
1413 * Root was mounted, share the good news.
1414 */
1415static void
1416root_mount_done(void)
1417{
1418
1419	/*
1420	 * Use a mutex to prevent the wakeup being missed and waiting for
1421	 * an extra 1 second sleep.
1422	 */
1423	mtx_lock(&mountlist_mtx);
1424	root_mount_complete = 1;
1425	wakeup(&root_mount_complete);
1426	mtx_unlock(&mountlist_mtx);
1427}
1428
1429/*
1430 * Return true if root is already mounted.
1431 */
1432int
1433root_mounted(void)
1434{
1435
1436	/* No mutex is acquired here because int stores are atomic. */
1437	return (root_mount_complete);
1438}
1439
1440/*
1441 * Wait until root is mounted.
1442 */
1443void
1444root_mount_wait(void)
1445{
1446
1447	/*
1448	 * Panic on an obvious deadlock - the function can't be called from
1449	 * a thread which is doing the whole SYSINIT stuff.
1450	 */
1451	KASSERT(curthread->td_proc->p_pid != 0,
1452	    ("root_mount_wait: cannot be called from the swapper thread"));
1453	mtx_lock(&mountlist_mtx);
1454	while (!root_mount_complete) {
1455		msleep(&root_mount_complete, &mountlist_mtx, PZERO, "rootwait",
1456		    hz);
1457	}
1458	mtx_unlock(&mountlist_mtx);
1459}
1460
1461static void
1462set_rootvnode(struct thread *td)
1463{
1464	struct proc *p;
1465
1466	if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode, td))
1467		panic("Cannot find root vnode");
1468
1469	p = td->td_proc;
1470	FILEDESC_XLOCK(p->p_fd);
1471
1472	if (p->p_fd->fd_cdir != NULL)
1473		vrele(p->p_fd->fd_cdir);
1474	p->p_fd->fd_cdir = rootvnode;
1475	VREF(rootvnode);
1476
1477	if (p->p_fd->fd_rdir != NULL)
1478		vrele(p->p_fd->fd_rdir);
1479	p->p_fd->fd_rdir = rootvnode;
1480	VREF(rootvnode);
1481
1482	FILEDESC_XUNLOCK(p->p_fd);
1483
1484	VOP_UNLOCK(rootvnode, 0);
1485
1486	EVENTHANDLER_INVOKE(mountroot);
1487}
1488
1489/*
1490 * Mount /devfs as our root filesystem, but do not put it on the mountlist
1491 * yet.  Create a /dev -> / symlink so that absolute pathnames will lookup.
1492 */
1493
1494static void
1495devfs_first(void)
1496{
1497	struct thread *td = curthread;
1498	struct vfsoptlist *opts;
1499	struct vfsconf *vfsp;
1500	struct mount *mp = NULL;
1501	int error;
1502
1503	vfsp = vfs_byname("devfs");
1504	KASSERT(vfsp != NULL, ("Could not find devfs by name"));
1505	if (vfsp == NULL)
1506		return;
1507
1508	mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td);
1509
1510	error = VFS_MOUNT(mp, td);
1511	KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error));
1512	if (error)
1513		return;
1514
1515	opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
1516	TAILQ_INIT(opts);
1517	mp->mnt_opt = opts;
1518
1519	mtx_lock(&mountlist_mtx);
1520	TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list);
1521	mtx_unlock(&mountlist_mtx);
1522
1523	set_rootvnode(td);
1524
1525	error = kern_symlink(td, "/", "dev", UIO_SYSSPACE);
1526	if (error)
1527		printf("kern_symlink /dev -> / returns %d\n", error);
1528}
1529
1530/*
1531 * Surgically move our devfs to be mounted on /dev.
1532 */
1533
1534static void
1535devfs_fixup(struct thread *td)
1536{
1537	struct nameidata nd;
1538	int error;
1539	struct vnode *vp, *dvp;
1540	struct mount *mp;
1541
1542	/* Remove our devfs mount from the mountlist and purge the cache */
1543	mtx_lock(&mountlist_mtx);
1544	mp = TAILQ_FIRST(&mountlist);
1545	TAILQ_REMOVE(&mountlist, mp, mnt_list);
1546	mtx_unlock(&mountlist_mtx);
1547	cache_purgevfs(mp);
1548
1549	VFS_ROOT(mp, LK_EXCLUSIVE, &dvp, td);
1550	VI_LOCK(dvp);
1551	dvp->v_iflag &= ~VI_MOUNT;
1552	VI_UNLOCK(dvp);
1553	dvp->v_mountedhere = NULL;
1554
1555	/* Set up the real rootvnode, and purge the cache */
1556	TAILQ_FIRST(&mountlist)->mnt_vnodecovered = NULL;
1557	set_rootvnode(td);
1558	cache_purgevfs(rootvnode->v_mount);
1559
1560	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev", td);
1561	error = namei(&nd);
1562	if (error) {
1563		printf("Lookup of /dev for devfs, error: %d\n", error);
1564		return;
1565	}
1566	NDFREE(&nd, NDF_ONLY_PNBUF);
1567	vp = nd.ni_vp;
1568	if (vp->v_type != VDIR) {
1569		vput(vp);
1570	}
1571	error = vinvalbuf(vp, V_SAVE, td, 0, 0);
1572	if (error) {
1573		vput(vp);
1574	}
1575	cache_purge(vp);
1576	mp->mnt_vnodecovered = vp;
1577	vp->v_mountedhere = mp;
1578	mtx_lock(&mountlist_mtx);
1579	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
1580	mtx_unlock(&mountlist_mtx);
1581	VOP_UNLOCK(vp, 0);
1582	vput(dvp);
1583	vfs_unbusy(mp, td);
1584
1585	/* Unlink the no longer needed /dev/dev -> / symlink */
1586	kern_unlink(td, "/dev/dev", UIO_SYSSPACE);
1587}
1588
1589/*
1590 * Report errors during filesystem mounting.
1591 */
1592void
1593vfs_mount_error(struct mount *mp, const char *fmt, ...)
1594{
1595	struct vfsoptlist *moptlist = mp->mnt_optnew;
1596	va_list ap;
1597	int error, len;
1598	char *errmsg;
1599
1600	error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len);
1601	if (error || errmsg == NULL || len <= 0)
1602		return;
1603
1604	va_start(ap, fmt);
1605	vsnprintf(errmsg, (size_t)len, fmt, ap);
1606	va_end(ap);
1607}
1608
1609/*
1610 * Find and mount the root filesystem
1611 */
1612void
1613vfs_mountroot(void)
1614{
1615	char *cp;
1616	int error, i, asked = 0;
1617
1618	root_mount_prepare();
1619
1620	mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount),
1621	    NULL, NULL, mount_init, mount_fini,
1622	    UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
1623	devfs_first();
1624
1625	/*
1626	 * We are booted with instructions to prompt for the root filesystem.
1627	 */
1628	if (boothowto & RB_ASKNAME) {
1629		if (!vfs_mountroot_ask())
1630			goto mounted;
1631		asked = 1;
1632	}
1633
1634	/*
1635	 * The root filesystem information is compiled in, and we are
1636	 * booted with instructions to use it.
1637	 */
1638	if (ctrootdevname != NULL && (boothowto & RB_DFLTROOT)) {
1639		if (!vfs_mountroot_try(ctrootdevname))
1640			goto mounted;
1641		ctrootdevname = NULL;
1642	}
1643
1644	/*
1645	 * We've been given the generic "use CDROM as root" flag.  This is
1646	 * necessary because one media may be used in many different
1647	 * devices, so we need to search for them.
1648	 */
1649	if (boothowto & RB_CDROM) {
1650		for (i = 0; cdrom_rootdevnames[i] != NULL; i++) {
1651			if (!vfs_mountroot_try(cdrom_rootdevnames[i]))
1652				goto mounted;
1653		}
1654	}
1655
1656	/*
1657	 * Try to use the value read by the loader from /etc/fstab, or
1658	 * supplied via some other means.  This is the preferred
1659	 * mechanism.
1660	 */
1661	cp = getenv("vfs.root.mountfrom");
1662	if (cp != NULL) {
1663		error = vfs_mountroot_try(cp);
1664		freeenv(cp);
1665		if (!error)
1666			goto mounted;
1667	}
1668
1669	/*
1670	 * Try values that may have been computed by code during boot
1671	 */
1672	if (!vfs_mountroot_try(rootdevnames[0]))
1673		goto mounted;
1674	if (!vfs_mountroot_try(rootdevnames[1]))
1675		goto mounted;
1676
1677	/*
1678	 * If we (still) have a compiled-in default, try it.
1679	 */
1680	if (ctrootdevname != NULL)
1681		if (!vfs_mountroot_try(ctrootdevname))
1682			goto mounted;
1683	/*
1684	 * Everything so far has failed, prompt on the console if we haven't
1685	 * already tried that.
1686	 */
1687	if (!asked)
1688		if (!vfs_mountroot_ask())
1689			goto mounted;
1690
1691	panic("Root mount failed, startup aborted.");
1692
1693mounted:
1694	root_mount_done();
1695}
1696
1697/*
1698 * Mount (mountfrom) as the root filesystem.
1699 */
1700static int
1701vfs_mountroot_try(const char *mountfrom)
1702{
1703	struct mount	*mp;
1704	char		*vfsname, *path;
1705	time_t		timebase;
1706	int		error;
1707	char		patt[32];
1708
1709	vfsname = NULL;
1710	path    = NULL;
1711	mp      = NULL;
1712	error   = EINVAL;
1713
1714	if (mountfrom == NULL)
1715		return (error);		/* don't complain */
1716	printf("Trying to mount root from %s\n", mountfrom);
1717
1718	/* parse vfs name and path */
1719	vfsname = malloc(MFSNAMELEN, M_MOUNT, M_WAITOK);
1720	path = malloc(MNAMELEN, M_MOUNT, M_WAITOK);
1721	vfsname[0] = path[0] = 0;
1722	sprintf(patt, "%%%d[a-z0-9]:%%%ds", MFSNAMELEN, MNAMELEN);
1723	if (sscanf(mountfrom, patt, vfsname, path) < 1)
1724		goto out;
1725
1726	if (path[0] == '\0')
1727		strcpy(path, ROOTNAME);
1728
1729	error = kernel_vmount(
1730	    MNT_RDONLY | MNT_ROOTFS,
1731	    "fstype", vfsname,
1732	    "fspath", "/",
1733	    "from", path,
1734	    NULL);
1735	if (error == 0) {
1736		/*
1737		 * We mount devfs prior to mounting the / FS, so the first
1738		 * entry will typically be devfs.
1739		 */
1740		mp = TAILQ_FIRST(&mountlist);
1741		KASSERT(mp != NULL, ("%s: mountlist is empty", __func__));
1742
1743		/*
1744		 * Iterate over all currently mounted file systems and use
1745		 * the time stamp found to check and/or initialize the RTC.
1746		 * Typically devfs has no time stamp and the only other FS
1747		 * is the actual / FS.
1748		 * Call inittodr() only once and pass it the largest of the
1749		 * timestamps we encounter.
1750		 */
1751		timebase = 0;
1752		do {
1753			if (mp->mnt_time > timebase)
1754				timebase = mp->mnt_time;
1755			mp = TAILQ_NEXT(mp, mnt_list);
1756		} while (mp != NULL);
1757		inittodr(timebase);
1758
1759		devfs_fixup(curthread);
1760	}
1761out:
1762	free(path, M_MOUNT);
1763	free(vfsname, M_MOUNT);
1764	return (error);
1765}
1766
1767/*
1768 * ---------------------------------------------------------------------
1769 * Interactive root filesystem selection code.
1770 */
1771
/*
 * Prompt on the console for a root filesystem specification until one
 * mounts or the operator gives up.
 *
 * Returns 0 once a specification was mounted successfully and 1 when an
 * empty line was entered.  "?" lists the GEOM managed disk devices.
 */
static int
vfs_mountroot_ask(void)
{
	char line[128];

	for (;;) {
		printf("\nManual root filesystem specification:\n");
		printf("  <fstype>:<device>  Mount <device> using filesystem <fstype>\n");
#if defined(__amd64__) || defined(__i386__) || defined(__ia64__)
		printf("                       eg. ufs:da0s1a\n");
#else
		printf("                       eg. ufs:/dev/da0a\n");
#endif
		printf("  ?                  List valid disk boot devices\n");
		printf("  <empty line>       Abort manual input\n");
		printf("\nmountroot> ");
		gets(line, sizeof(line), 1);

		switch (line[0]) {
		case '\0':
			/* Empty line: the operator aborted. */
			return (1);
		case '?':
			printf("\nList of GEOM managed disk devices:\n  ");
			g_dev_print();
			continue;
		default:
			break;
		}
		if (vfs_mountroot_try(line) == 0)
			return (0);
	}
}
1800
1801/*
1802 * ---------------------------------------------------------------------
1803 * Functions for querying mount options/arguments from filesystems.
1804 */
1805
1806/*
1807 * Check that no unknown options are given
1808 */
1809int
1810vfs_filteropt(struct vfsoptlist *opts, const char **legal)
1811{
1812	struct vfsopt *opt;
1813	char errmsg[255];
1814	const char **t, *p, *q;
1815	int ret = 0;
1816
1817	TAILQ_FOREACH(opt, opts, link) {
1818		p = opt->name;
1819		q = NULL;
1820		if (p[0] == 'n' && p[1] == 'o')
1821			q = p + 2;
1822		for(t = global_opts; *t != NULL; t++) {
1823			if (strcmp(*t, p) == 0)
1824				break;
1825			if (q != NULL) {
1826				if (strcmp(*t, q) == 0)
1827					break;
1828			}
1829		}
1830		if (*t != NULL)
1831			continue;
1832		for(t = legal; *t != NULL; t++) {
1833			if (strcmp(*t, p) == 0)
1834				break;
1835			if (q != NULL) {
1836				if (strcmp(*t, q) == 0)
1837					break;
1838			}
1839		}
1840		if (*t != NULL)
1841			continue;
1842		sprintf(errmsg, "mount option <%s> is unknown", p);
1843		printf("%s\n", errmsg);
1844		ret = EINVAL;
1845	}
1846	if (ret != 0) {
1847		TAILQ_FOREACH(opt, opts, link) {
1848			if (strcmp(opt->name, "errmsg") == 0) {
1849				strncpy((char *)opt->value, errmsg, opt->len);
1850			}
1851		}
1852	}
1853	return (ret);
1854}
1855
1856/*
1857 * Get a mount option by its name.
1858 *
1859 * Return 0 if the option was found, ENOENT otherwise.
1860 * If len is non-NULL it will be filled with the length
1861 * of the option. If buf is non-NULL, it will be filled
1862 * with the address of the option.
1863 */
1864int
1865vfs_getopt(opts, name, buf, len)
1866	struct vfsoptlist *opts;
1867	const char *name;
1868	void **buf;
1869	int *len;
1870{
1871	struct vfsopt *opt;
1872
1873	KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
1874
1875	TAILQ_FOREACH(opt, opts, link) {
1876		if (strcmp(name, opt->name) == 0) {
1877			if (len != NULL)
1878				*len = opt->len;
1879			if (buf != NULL)
1880				*buf = opt->value;
1881			return (0);
1882		}
1883	}
1884	return (ENOENT);
1885}
1886
1887static int
1888vfs_getopt_pos(struct vfsoptlist *opts, const char *name)
1889{
1890	struct vfsopt *opt;
1891	int i;
1892
1893	if (opts == NULL)
1894		return (-1);
1895
1896	i = 0;
1897	TAILQ_FOREACH(opt, opts, link) {
1898		if (strcmp(name, opt->name) == 0)
1899			return (i);
1900		++i;
1901	}
1902	return (-1);
1903}
1904
1905char *
1906vfs_getopts(struct vfsoptlist *opts, const char *name, int *error)
1907{
1908	struct vfsopt *opt;
1909
1910	*error = 0;
1911	TAILQ_FOREACH(opt, opts, link) {
1912		if (strcmp(name, opt->name) != 0)
1913			continue;
1914		if (((char *)opt->value)[opt->len - 1] != '\0') {
1915			*error = EINVAL;
1916			return (NULL);
1917		}
1918		return (opt->value);
1919	}
1920	*error = ENOENT;
1921	return (NULL);
1922}
1923
1924int
1925vfs_flagopt(struct vfsoptlist *opts, const char *name, u_int *w, u_int val)
1926{
1927	struct vfsopt *opt;
1928
1929	TAILQ_FOREACH(opt, opts, link) {
1930		if (strcmp(name, opt->name) == 0) {
1931			if (w != NULL)
1932				*w |= val;
1933			return (1);
1934		}
1935	}
1936	if (w != NULL)
1937		*w &= ~val;
1938	return (0);
1939}
1940
1941int
1942vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...)
1943{
1944	va_list ap;
1945	struct vfsopt *opt;
1946	int ret;
1947
1948	KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
1949
1950	TAILQ_FOREACH(opt, opts, link) {
1951		if (strcmp(name, opt->name) != 0)
1952			continue;
1953		if (opt->len == 0 || opt->value == NULL)
1954			return (0);
1955		if (((char *)opt->value)[opt->len - 1] != '\0')
1956			return (0);
1957		va_start(ap, fmt);
1958		ret = vsscanf(opt->value, fmt, ap);
1959		va_end(ap);
1960		return (ret);
1961	}
1962	return (0);
1963}
1964
1965/*
1966 * Find and copy a mount option.
1967 *
1968 * The size of the buffer has to be specified
1969 * in len, if it is not the same length as the
1970 * mount option, EINVAL is returned.
1971 * Returns ENOENT if the option is not found.
1972 */
1973int
1974vfs_copyopt(opts, name, dest, len)
1975	struct vfsoptlist *opts;
1976	const char *name;
1977	void *dest;
1978	int len;
1979{
1980	struct vfsopt *opt;
1981
1982	KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL"));
1983
1984	TAILQ_FOREACH(opt, opts, link) {
1985		if (strcmp(name, opt->name) == 0) {
1986			if (len != opt->len)
1987				return (EINVAL);
1988			bcopy(opt->value, dest, opt->len);
1989			return (0);
1990		}
1991	}
1992	return (ENOENT);
1993}
1994
1995/*
1996 * This is a helper function for filesystems to traverse their
1997 * vnodes.  See MNT_VNODE_FOREACH() in sys/mount.h
1998 */
1999
/*
 * Advance a vnode list iteration.
 *
 * *mvp is the marker vnode of the iteration.  Returns the next
 * non-marker vnode following the marker on mp's vnode list and moves the
 * marker directly behind it.  When no such vnode is left, the marker is
 * released through __mnt_vnode_markerfree() and NULL is returned.
 *
 * The mount interlock must be held (asserted); it is dropped and
 * re-taken around uio_yield() when the marker's v_yield counter
 * reaches 500.
 */
struct vnode *
__mnt_vnode_next(struct vnode **mvp, struct mount *mp)
{
	struct vnode *vp;

	mtx_assert(MNT_MTX(mp), MA_OWNED);

	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
	/* Periodically yield with the interlock dropped. */
	if ((*mvp)->v_yield++ == 500) {
		MNT_IUNLOCK(mp);
		(*mvp)->v_yield = 0;
		uio_yield();
		MNT_ILOCK(mp);
	}
	/* Skip over markers (v_type == VMARKER), ours or anybody else's. */
	vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
	while (vp != NULL && vp->v_type == VMARKER)
		vp = TAILQ_NEXT(vp, v_nmntvnodes);

	/* Check if we are done */
	if (vp == NULL) {
		__mnt_vnode_markerfree(mvp, mp);
		return (NULL);
	}
	/* Re-insert our marker right after the vnode being returned. */
	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
	return (vp);
}
2027
/*
 * Start a vnode list iteration.
 *
 * Returns the first non-marker vnode on mp's vnode list and stores a
 * freshly allocated marker vnode, inserted right after it, in *mvp.
 * When the list holds no such vnode, *mvp is set to NULL and NULL is
 * returned.
 *
 * The mount interlock must be held (asserted).  It is dropped around the
 * marker allocation, so the list is scanned a second time afterwards.
 * mnt_holdcnt is raised before the interlock is dropped; it is lowered
 * again here only when the second scan finds nothing.
 */
struct vnode *
__mnt_vnode_first(struct vnode **mvp, struct mount *mp)
{
	struct vnode *vp;

	mtx_assert(MNT_MTX(mp), MA_OWNED);

	/* First scan: avoid allocating a marker when there is nothing to visit. */
	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
	while (vp != NULL && vp->v_type == VMARKER)
		vp = TAILQ_NEXT(vp, v_nmntvnodes);

	/* Check if we are done */
	if (vp == NULL) {
		*mvp = NULL;
		return (NULL);
	}
	mp->mnt_holdcnt++;
	/* malloc(M_WAITOK) is called with the interlock dropped. */
	MNT_IUNLOCK(mp);
	*mvp = (struct vnode *) malloc(sizeof(struct vnode),
				       M_VNODE_MARKER,
				       M_WAITOK | M_ZERO);
	MNT_ILOCK(mp);
	(*mvp)->v_type = VMARKER;

	/* Second scan: the interlock was dropped above, so look again. */
	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
	while (vp != NULL && vp->v_type == VMARKER)
		vp = TAILQ_NEXT(vp, v_nmntvnodes);

	/* Check if we are done */
	if (vp == NULL) {
		/* Nothing left: undo the allocation and the hold. */
		MNT_IUNLOCK(mp);
		free(*mvp, M_VNODE_MARKER);
		MNT_ILOCK(mp);
		*mvp = NULL;
		mp->mnt_holdcnt--;
		if (mp->mnt_holdcnt == 0 && mp->mnt_holdcntwaiters != 0)
			wakeup(&mp->mnt_holdcnt);
		return (NULL);
	}
	mp->mnt_markercnt++;
	(*mvp)->v_mount = mp;
	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
	return (vp);
}
2072
2073
/*
 * End a vnode list iteration.
 *
 * Removes the marker *mvp from mp's vnode list, frees it and sets *mvp
 * to NULL, then drops the marker count and the hold count, waking up
 * threads waiting on mnt_holdcnt when it reaches zero.  Does nothing if
 * *mvp is already NULL.
 *
 * The mount interlock must be held when a marker is present (asserted);
 * it is dropped and re-taken around free().
 */
void
__mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp)
{

	if (*mvp == NULL)
		return;

	mtx_assert(MNT_MTX(mp), MA_OWNED);

	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
	/* The marker is off the list now; free it with the interlock dropped. */
	MNT_IUNLOCK(mp);
	free(*mvp, M_VNODE_MARKER);
	MNT_ILOCK(mp);
	*mvp = NULL;

	mp->mnt_markercnt--;
	mp->mnt_holdcnt--;
	if (mp->mnt_holdcnt == 0 && mp->mnt_holdcntwaiters != 0)
		wakeup(&mp->mnt_holdcnt);
}
2095
2096
2097int
2098__vfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td)
2099{
2100	int error;
2101
2102	error = mp->mnt_op->vfs_statfs(mp, &mp->mnt_stat, td);
2103	if (sbp != &mp->mnt_stat)
2104		*sbp = mp->mnt_stat;
2105	return (error);
2106}
2107
2108void
2109vfs_mountedfrom(struct mount *mp, const char *from)
2110{
2111
2112	bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname);
2113	strlcpy(mp->mnt_stat.f_mntfromname, from,
2114	    sizeof mp->mnt_stat.f_mntfromname);
2115}
2116
2117/*
2118 * ---------------------------------------------------------------------
2119 * This is the api for building mount args and mounting filesystems from
2120 * inside the kernel.
2121 *
2122 * The API works by accumulation of individual args.  First error is
2123 * latched.
2124 *
2125 * XXX: should be documented in new manpage kernel_mount(9)
2126 */
2127
/*
 * A memory allocation which must be freed when we are done.
 * The payload is stored directly behind this header (see mount_argf()
 * and mount_argsu(), which allocate sizeof(header) + length).
 */
struct mntaarg {
	SLIST_ENTRY(mntaarg)	next;	/* linkage on mntarg.list */
};

/* The header for the mount arguments */
struct mntarg {
	struct iovec *v;	/* iovec array: a name entry followed by a value entry per argument */
	int len;		/* number of iovec entries in use */
	int error;		/* latched error; once set, further arguments are not added */
	SLIST_HEAD(, mntaarg)	list;	/* auxiliary allocations, freed by free_mntarg() */
};
2140
2141/*
2142 * Add a boolean argument.
2143 *
2144 * flag is the boolean value.
2145 * name must start with "no".
2146 */
2147struct mntarg *
2148mount_argb(struct mntarg *ma, int flag, const char *name)
2149{
2150
2151	KASSERT(name[0] == 'n' && name[1] == 'o',
2152	    ("mount_argb(...,%s): name must start with 'no'", name));
2153
2154	return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0));
2155}
2156
2157/*
2158 * Add an argument printf style
2159 */
2160struct mntarg *
2161mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...)
2162{
2163	va_list ap;
2164	struct mntaarg *maa;
2165	struct sbuf *sb;
2166	int len;
2167
2168	if (ma == NULL) {
2169		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
2170		SLIST_INIT(&ma->list);
2171	}
2172	if (ma->error)
2173		return (ma);
2174
2175	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
2176	    M_MOUNT, M_WAITOK);
2177	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
2178	ma->v[ma->len].iov_len = strlen(name) + 1;
2179	ma->len++;
2180
2181	sb = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND);
2182	va_start(ap, fmt);
2183	sbuf_vprintf(sb, fmt, ap);
2184	va_end(ap);
2185	sbuf_finish(sb);
2186	len = sbuf_len(sb) + 1;
2187	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
2188	SLIST_INSERT_HEAD(&ma->list, maa, next);
2189	bcopy(sbuf_data(sb), maa + 1, len);
2190	sbuf_delete(sb);
2191
2192	ma->v[ma->len].iov_base = maa + 1;
2193	ma->v[ma->len].iov_len = len;
2194	ma->len++;
2195
2196	return (ma);
2197}
2198
2199/*
2200 * Add an argument which is a userland string.
2201 */
2202struct mntarg *
2203mount_argsu(struct mntarg *ma, const char *name, const void *val, int len)
2204{
2205	struct mntaarg *maa;
2206	char *tbuf;
2207
2208	if (val == NULL)
2209		return (ma);
2210	if (ma == NULL) {
2211		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
2212		SLIST_INIT(&ma->list);
2213	}
2214	if (ma->error)
2215		return (ma);
2216	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
2217	SLIST_INSERT_HEAD(&ma->list, maa, next);
2218	tbuf = (void *)(maa + 1);
2219	ma->error = copyinstr(val, tbuf, len, NULL);
2220	return (mount_arg(ma, name, tbuf, -1));
2221}
2222
2223/*
2224 * Plain argument.
2225 *
2226 * If length is -1, treat value as a C string.
2227 */
2228struct mntarg *
2229mount_arg(struct mntarg *ma, const char *name, const void *val, int len)
2230{
2231
2232	if (ma == NULL) {
2233		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
2234		SLIST_INIT(&ma->list);
2235	}
2236	if (ma->error)
2237		return (ma);
2238
2239	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
2240	    M_MOUNT, M_WAITOK);
2241	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
2242	ma->v[ma->len].iov_len = strlen(name) + 1;
2243	ma->len++;
2244
2245	ma->v[ma->len].iov_base = (void *)(uintptr_t)val;
2246	if (len < 0)
2247		ma->v[ma->len].iov_len = strlen(val) + 1;
2248	else
2249		ma->v[ma->len].iov_len = len;
2250	ma->len++;
2251	return (ma);
2252}
2253
2254/*
2255 * Free a mntarg structure
2256 */
2257static void
2258free_mntarg(struct mntarg *ma)
2259{
2260	struct mntaarg *maa;
2261
2262	while (!SLIST_EMPTY(&ma->list)) {
2263		maa = SLIST_FIRST(&ma->list);
2264		SLIST_REMOVE_HEAD(&ma->list, next);
2265		free(maa, M_MOUNT);
2266	}
2267	free(ma->v, M_MOUNT);
2268	free(ma, M_MOUNT);
2269}
2270
2271/*
2272 * Mount a filesystem
2273 */
2274int
2275kernel_mount(struct mntarg *ma, int flags)
2276{
2277	struct uio auio;
2278	int error;
2279
2280	KASSERT(ma != NULL, ("kernel_mount NULL ma"));
2281	KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v"));
2282	KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len));
2283
2284	auio.uio_iov = ma->v;
2285	auio.uio_iovcnt = ma->len;
2286	auio.uio_segflg = UIO_SYSSPACE;
2287
2288	error = ma->error;
2289	if (!error)
2290		error = vfs_donmount(curthread, flags, &auio);
2291	free_mntarg(ma);
2292	return (error);
2293}
2294
2295/*
2296 * A printflike function to mount a filesystem.
2297 */
2298int
2299kernel_vmount(int flags, ...)
2300{
2301	struct mntarg *ma = NULL;
2302	va_list ap;
2303	const char *cp;
2304	const void *vp;
2305	int error;
2306
2307	va_start(ap, flags);
2308	for (;;) {
2309		cp = va_arg(ap, const char *);
2310		if (cp == NULL)
2311			break;
2312		vp = va_arg(ap, const void *);
2313		ma = mount_arg(ma, cp, vp, (vp != NULL ? -1 : 0));
2314	}
2315	va_end(ap);
2316
2317	error = kernel_mount(ma, flags);
2318	return (error);
2319}
2320