vfs_mount.c revision 184588
155992Swpaul/*-
255992Swpaul * Copyright (c) 1999-2004 Poul-Henning Kamp
355992Swpaul * Copyright (c) 1999 Michael Smith
455992Swpaul * Copyright (c) 1989, 1993
555992Swpaul *	The Regents of the University of California.  All rights reserved.
655992Swpaul * (c) UNIX System Laboratories, Inc.
755992Swpaul * All or some portions of this file are derived from material licensed
855992Swpaul * to the University of California by American Telephone and Telegraph
955992Swpaul * Co. or Unix System Laboratories, Inc. and are reproduced herein with
1055992Swpaul * the permission of UNIX System Laboratories, Inc.
1155992Swpaul *
1255992Swpaul * Redistribution and use in source and binary forms, with or without
1355992Swpaul * modification, are permitted provided that the following conditions
1455992Swpaul * are met:
1555992Swpaul * 1. Redistributions of source code must retain the above copyright
1655992Swpaul *    notice, this list of conditions and the following disclaimer.
1755992Swpaul * 2. Redistributions in binary form must reproduce the above copyright
1855992Swpaul *    notice, this list of conditions and the following disclaimer in the
1955992Swpaul *    documentation and/or other materials provided with the distribution.
2055992Swpaul * 4. Neither the name of the University nor the names of its contributors
2155992Swpaul *    may be used to endorse or promote products derived from this software
2255992Swpaul *    without specific prior written permission.
2355992Swpaul *
2455992Swpaul * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
2555992Swpaul * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
2655992Swpaul * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
2755992Swpaul * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
2855992Swpaul * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2955992Swpaul * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
3055992Swpaul * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
3155992Swpaul * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
3255992Swpaul * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
3355992Swpaul * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
3455992Swpaul * SUCH DAMAGE.
3555992Swpaul */
3655992Swpaul
3755992Swpaul#include <sys/cdefs.h>
3855992Swpaul__FBSDID("$FreeBSD: head/sys/kern/vfs_mount.c 184588 2008-11-03 10:38:00Z dfr $");
3955992Swpaul
4055992Swpaul#include <sys/param.h>
4155992Swpaul#include <sys/conf.h>
4255992Swpaul#include <sys/fcntl.h>
4355992Swpaul#include <sys/jail.h>
4455992Swpaul#include <sys/kernel.h>
4555992Swpaul#include <sys/libkern.h>
4655992Swpaul#include <sys/malloc.h>
4755992Swpaul#include <sys/mount.h>
4855992Swpaul#include <sys/mutex.h>
4955992Swpaul#include <sys/namei.h>
5055992Swpaul#include <sys/priv.h>
5155992Swpaul#include <sys/proc.h>
5255992Swpaul#include <sys/filedesc.h>
5355992Swpaul#include <sys/reboot.h>
5455992Swpaul#include <sys/syscallsubr.h>
5555992Swpaul#include <sys/sysproto.h>
5655992Swpaul#include <sys/sx.h>
5755992Swpaul#include <sys/sysctl.h>
5855992Swpaul#include <sys/sysent.h>
5955992Swpaul#include <sys/systm.h>
6055992Swpaul#include <sys/vnode.h>
6155992Swpaul#include <vm/uma.h>
6255992Swpaul
6355992Swpaul#include <geom/geom.h>
6455992Swpaul
6555992Swpaul#include <machine/stdarg.h>
6655992Swpaul
6755992Swpaul#include <security/audit/audit.h>
6855992Swpaul#include <security/mac/mac_framework.h>
6955992Swpaul
7055992Swpaul#include "opt_rootdevname.h"
7155992Swpaul#include "opt_mac.h"
7284811Sjhb
#define	ROOTNAME		"root_device"
/*
 * Upper bound, in bytes, that vfs_buildopts() applies to each option name,
 * to each option value and to the running total of one option list.
 */
#define	VFS_MOUNTARG_SIZE_MAX	(1024 * 64)

/* Forward declarations of helpers private to this file. */
static int	vfs_domount(struct thread *td, const char *fstype,
		    char *fspath, int fsflags, void *fsdata);
static int	vfs_mountroot_ask(void);
static int	vfs_mountroot_try(const char *mountfrom);
static void	free_mntarg(struct mntarg *ma);
static int	vfs_getopt_pos(struct vfsoptlist *opts, const char *name);

/*
 * vfs.usermount sysctl.  While it is zero, vfs_domount() demands
 * PRIV_VFS_MOUNT from every caller; jailed callers are always checked.
 */
static int	usermount = 0;
SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
    "Unprivileged users may mount and unmount file systems");

/* Malloc types used for mount-related allocations. */
MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
/* UMA zone backing struct mount; see vfs_mount_alloc()/vfs_mount_destroy(). */
static uma_zone_t mount_zone;

/* List of mounted filesystems. */
struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);

/* For any iteration/modification of mountlist */
struct mtx mountlist_mtx;
MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF);
9755992Swpaul
/*
 * A mount option list is a tail queue of struct vfsopt records.
 * Each record owns its name and value buffers (both M_MOUNT allocations,
 * released by vfs_freeopt()).
 */
TAILQ_HEAD(vfsoptlist, vfsopt);
struct vfsopt {
	TAILQ_ENTRY(vfsopt) link;	/* Linkage within the owning list. */
	char	*name;			/* NUL-terminated option name. */
	void	*value;			/* Option data; NULL when len is 0. */
	int	len;			/* Size of value in bytes. */
};
10555992Swpaul
10655992Swpaul/*
10755992Swpaul * The vnode of the system's root (/ in the filesystem, without chroot
10855992Swpaul * active.)
10981221Sbrooks */
11055992Swpaulstruct vnode	*rootvnode;
11155992Swpaul
11255992Swpaul/*
11355992Swpaul * The root filesystem is detailed in the kernel environment variable
11455992Swpaul * vfs.root.mountfrom, which is expected to be in the general format
11555992Swpaul *
11692739Salfred * <vfsname>:[<path>]
11792739Salfred * vfsname   := the name of a VFS known to the kernel and capable
11892739Salfred *              of being mounted as root
119110362Sambrisko * path      := disk device name or other data used by the filesystem
120110362Sambrisko *              to locate its physical store
12155992Swpaul */
12283270Sbrooks
/*
 * Global opts, taken by all filesystems.
 * The table is terminated by a NULL entry.
 */
static const char *global_opts[] = {
	"errmsg",
	"fstype",
	"fspath",
	"ro",
	"rw",
	"nosuid",
	"noexec",
	NULL
};
13655992Swpaul
/*
 * The root specifiers we will try if RB_CDROM is specified.
 * Each entry uses the "<vfsname>:<path>" form; the table is terminated
 * by a NULL entry.
 */
static char *cdrom_rootdevnames[] = {
	"cd9660:cd0",
	"cd9660:acd0",
	NULL
};
14555992Swpaul
/* legacy find-root code */
char		*rootdevnames[2] = {NULL, NULL};
/*
 * Compile-time root device name.  ROOTDEVNAME falls back to NULL when it
 * has not been defined before this point (presumably it comes from
 * opt_rootdevname.h -- confirm against the kernel configuration).
 */
#ifndef ROOTDEVNAME
#  define ROOTDEVNAME NULL
#endif
static const char	*ctrootdevname = ROOTDEVNAME;
15255992Swpaul
15355992Swpaul/*
15455992Swpaul * ---------------------------------------------------------------------
15555992Swpaul * Functions for building and sanitizing the mount options
15655992Swpaul */
15755992Swpaul
15855992Swpaul/* Remove one mount option. */
15983270Sbrooksstatic void
160108401Sambriskovfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt)
161108401Sambrisko{
162108401Sambrisko
163108401Sambrisko	TAILQ_REMOVE(opts, opt, link);
164108401Sambrisko	free(opt->name, M_MOUNT);
165108401Sambrisko	if (opt->value != NULL)
166108401Sambrisko		free(opt->value, M_MOUNT);
167108401Sambrisko#ifdef INVARIANTS
168108401Sambrisko	else if (opt->len != 0)
169108401Sambrisko		panic("%s: mount option with NULL value but length != 0",
170108401Sambrisko		    __func__);
171108401Sambrisko#endif
17255992Swpaul	free(opt, M_MOUNT);
173108401Sambrisko}
174108401Sambrisko
175108401Sambrisko/* Release all resources related to the mount options. */
176108401Sambriskovoid
177108401Sambriskovfs_freeopts(struct vfsoptlist *opts)
178108401Sambrisko{
17955992Swpaul	struct vfsopt *opt;
18055992Swpaul
18155992Swpaul	while (!TAILQ_EMPTY(opts)) {
18255992Swpaul		opt = TAILQ_FIRST(opts);
18355992Swpaul		vfs_freeopt(opts, opt);
18455992Swpaul	}
18555992Swpaul	free(opts, M_MOUNT);
18655992Swpaul}
18755992Swpaul
18855992Swpaulvoid
18955992Swpaulvfs_deleteopt(struct vfsoptlist *opts, const char *name)
190108401Sambrisko{
191108401Sambrisko	struct vfsopt *opt, *temp;
192108401Sambrisko
193108401Sambrisko	if (opts == NULL)
194108401Sambrisko		return;
195108401Sambrisko	TAILQ_FOREACH_SAFE(opt, opts, link, temp)  {
196108401Sambrisko		if (strcmp(opt->name, name) == 0)
197108401Sambrisko			vfs_freeopt(opts, opt);
198108401Sambrisko	}
199108401Sambrisko}
200108401Sambrisko
/*
 * Return non-zero when "negated" is exactly the string "no" followed
 * by "plain".
 */
static int
vfs_optnegates(const char *negated, const char *plain)
{

	return (strncmp(negated, "no", 2) == 0 &&
	    strcmp(negated + 2, plain) == 0);
}

/*
 * Decide whether two option names refer to the same option.
 * Names match when they are identical, or when one of them is the other
 * with a single leading "no" prefix.  Returns 1 on a match, 0 otherwise.
 */
static int
vfs_equalopts(const char *opt1, const char *opt2)
{

	/* Identical spelling: "opt"/"opt" or "noopt"/"noopt". */
	if (strcmp(opt1, opt2) == 0)
		return (1);
	/* One side is the "no"-prefixed form of the other. */
	if (vfs_optnegates(opt1, opt2) || vfs_optnegates(opt2, opt1))
		return (1);
	return (0);
}
219108401Sambrisko
220108401Sambrisko/*
221108401Sambrisko * If a mount option is specified several times,
222108401Sambrisko * (with or without the "no" prefix) only keep
223108401Sambrisko * the last occurence of it.
224108401Sambrisko */
225108401Sambriskostatic void
226108401Sambriskovfs_sanitizeopts(struct vfsoptlist *opts)
227108401Sambrisko{
228108401Sambrisko	struct vfsopt *opt, *opt2, *tmp;
229108401Sambrisko
23055992Swpaul	TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) {
23155992Swpaul		opt2 = TAILQ_PREV(opt, vfsoptlist, link);
23255992Swpaul		while (opt2 != NULL) {
23355992Swpaul			if (vfs_equalopts(opt->name, opt2->name)) {
23455992Swpaul				tmp = TAILQ_PREV(opt2, vfsoptlist, link);
23583270Sbrooks				vfs_freeopt(opts, opt2);
23655992Swpaul				opt2 = tmp;
23755992Swpaul			} else {
23855992Swpaul				opt2 = TAILQ_PREV(opt2, vfsoptlist, link);
23955992Swpaul			}
24055992Swpaul		}
24155992Swpaul	}
24267096Swpaul}
24355992Swpaul
24455992Swpaul/*
24555992Swpaul * Build a linked list of mount options from a struct uio.
246108401Sambrisko */
247108401Sambriskostatic int
24855992Swpaulvfs_buildopts(struct uio *auio, struct vfsoptlist **options)
24955992Swpaul{
25055992Swpaul	struct vfsoptlist *opts;
25183270Sbrooks	struct vfsopt *opt;
25255992Swpaul	size_t memused;
25355992Swpaul	unsigned int i, iovcnt;
25455992Swpaul	int error, namelen, optlen;
25555992Swpaul
25655992Swpaul	opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
25755992Swpaul	TAILQ_INIT(opts);
25877217Sphk	memused = 0;
259106937Ssam	iovcnt = auio->uio_iovcnt;
26055992Swpaul	for (i = 0; i < iovcnt; i += 2) {
26155992Swpaul		opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
26255992Swpaul		namelen = auio->uio_iov[i].iov_len;
26355992Swpaul		optlen = auio->uio_iov[i + 1].iov_len;
26455992Swpaul		opt->name = malloc(namelen, M_MOUNT, M_WAITOK);
26555992Swpaul		opt->value = NULL;
266110362Sambrisko		opt->len = 0;
267110362Sambrisko
268110362Sambrisko		/*
269110362Sambrisko		 * Do this early, so jumps to "bad" will free the current
270110362Sambrisko		 * option.
271110362Sambrisko		 */
272110362Sambrisko		TAILQ_INSERT_TAIL(opts, opt, link);
273110362Sambrisko		memused += sizeof(struct vfsopt) + optlen + namelen;
274110362Sambrisko
275110362Sambrisko		/*
276110362Sambrisko		 * Avoid consuming too much memory, and attempts to overflow
277110362Sambrisko		 * memused.
278110362Sambrisko		 */
279110362Sambrisko		if (memused > VFS_MOUNTARG_SIZE_MAX ||
280110362Sambrisko		    optlen > VFS_MOUNTARG_SIZE_MAX ||
281110362Sambrisko		    namelen > VFS_MOUNTARG_SIZE_MAX) {
28255992Swpaul			error = EINVAL;
28355992Swpaul			goto bad;
28455992Swpaul		}
28555992Swpaul
28655992Swpaul		if (auio->uio_segflg == UIO_SYSSPACE) {
28755992Swpaul			bcopy(auio->uio_iov[i].iov_base, opt->name, namelen);
288110362Sambrisko		} else {
289110362Sambrisko			error = copyin(auio->uio_iov[i].iov_base, opt->name,
29055992Swpaul			    namelen);
29155992Swpaul			if (error)
29255992Swpaul				goto bad;
29355992Swpaul		}
29455992Swpaul		/* Ensure names are null-terminated strings. */
29555992Swpaul		if (opt->name[namelen - 1] != '\0') {
29655992Swpaul			error = EINVAL;
29755992Swpaul			goto bad;
29855992Swpaul		}
29955992Swpaul		if (optlen != 0) {
30055992Swpaul			opt->len = optlen;
30155992Swpaul			opt->value = malloc(optlen, M_MOUNT, M_WAITOK);
302			if (auio->uio_segflg == UIO_SYSSPACE) {
303				bcopy(auio->uio_iov[i + 1].iov_base, opt->value,
304				    optlen);
305			} else {
306				error = copyin(auio->uio_iov[i + 1].iov_base,
307				    opt->value, optlen);
308				if (error)
309					goto bad;
310			}
311		}
312	}
313	vfs_sanitizeopts(opts);
314	*options = opts;
315	return (0);
316bad:
317	vfs_freeopts(opts);
318	return (error);
319}
320
321/*
322 * Merge the old mount options with the new ones passed
323 * in the MNT_UPDATE case.
324 *
325 * XXX This function will keep a "nofoo" option in the
326 *     new options if there is no matching "foo" option
327 *     to be cancelled in the old options.  This is a bug
328 *     if the option's canonical name is "foo".  E.g., "noro"
329 *     shouldn't end up in the mount point's active options,
330 *     but it can.
331 */
332static void
333vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *opts)
334{
335	struct vfsopt *opt, *opt2, *new;
336
337	TAILQ_FOREACH(opt, opts, link) {
338		/*
339		 * Check that this option hasn't been redefined
340		 * nor cancelled with a "no" mount option.
341		 */
342		opt2 = TAILQ_FIRST(toopts);
343		while (opt2 != NULL) {
344			if (strcmp(opt2->name, opt->name) == 0)
345				goto next;
346			if (strncmp(opt2->name, "no", 2) == 0 &&
347			    strcmp(opt2->name + 2, opt->name) == 0) {
348				vfs_freeopt(toopts, opt2);
349				goto next;
350			}
351			opt2 = TAILQ_NEXT(opt2, link);
352		}
353		/* We want this option, duplicate it. */
354		new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
355		new->name = malloc(strlen(opt->name) + 1, M_MOUNT, M_WAITOK);
356		strcpy(new->name, opt->name);
357		if (opt->len != 0) {
358			new->value = malloc(opt->len, M_MOUNT, M_WAITOK);
359			bcopy(opt->value, new->value, opt->len);
360		} else {
361			new->value = NULL;
362		}
363		new->len = opt->len;
364		TAILQ_INSERT_TAIL(toopts, new, link);
365next:
366		continue;
367	}
368}
369
370/*
371 * Mount a filesystem.
372 */
373int
374nmount(td, uap)
375	struct thread *td;
376	struct nmount_args /* {
377		struct iovec *iovp;
378		unsigned int iovcnt;
379		int flags;
380	} */ *uap;
381{
382	struct uio *auio;
383	struct iovec *iov;
384	unsigned int i;
385	int error;
386	u_int iovcnt;
387
388	AUDIT_ARG(fflags, uap->flags);
389
390	/*
391	 * Filter out MNT_ROOTFS.  We do not want clients of nmount() in
392	 * userspace to set this flag, but we must filter it out if we want
393	 * MNT_UPDATE on the root file system to work.
394	 * MNT_ROOTFS should only be set in the kernel in vfs_mountroot_try().
395	 */
396	uap->flags &= ~MNT_ROOTFS;
397
398	iovcnt = uap->iovcnt;
399	/*
400	 * Check that we have an even number of iovec's
401	 * and that we have at least two options.
402	 */
403	if ((iovcnt & 1) || (iovcnt < 4))
404		return (EINVAL);
405
406	error = copyinuio(uap->iovp, iovcnt, &auio);
407	if (error)
408		return (error);
409	iov = auio->uio_iov;
410	for (i = 0; i < iovcnt; i++) {
411		if (iov->iov_len > MMAXOPTIONLEN) {
412			free(auio, M_IOV);
413			return (EINVAL);
414		}
415		iov++;
416	}
417	error = vfs_donmount(td, uap->flags, auio);
418
419	free(auio, M_IOV);
420	return (error);
421}
422
423/*
424 * ---------------------------------------------------------------------
425 * Various utility functions
426 */
427
/*
 * Apply MNT_REF to the mount point while holding its interlock.
 * NOTE(review): MNT_REF is defined outside this file; presumably it
 * increments mp->mnt_ref, which vfs_mount_destroy() waits on -- confirm.
 */
void
vfs_ref(struct mount *mp)
{

	MNT_ILOCK(mp);
	MNT_REF(mp);
	MNT_IUNLOCK(mp);
}
436
/*
 * Apply MNT_REL to the mount point while holding its interlock.
 * NOTE(review): MNT_REL is defined outside this file; presumably it
 * drops the reference taken by MNT_REF -- confirm.
 */
void
vfs_rel(struct mount *mp)
{

	MNT_ILOCK(mp);
	MNT_REL(mp);
	MNT_IUNLOCK(mp);
}
445
446static int
447mount_init(void *mem, int size, int flags)
448{
449	struct mount *mp;
450
451	mp = (struct mount *)mem;
452	mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
453	lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
454	return (0);
455}
456
457static void
458mount_fini(void *mem, int size)
459{
460	struct mount *mp;
461
462	mp = (struct mount *)mem;
463	lockdestroy(&mp->mnt_explock);
464	mtx_destroy(&mp->mnt_mtx);
465}
466
467/*
468 * Allocate and initialize the mount point struct.
469 */
470struct mount *
471vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath,
472    struct ucred *cred)
473{
474	struct mount *mp;
475
476	mp = uma_zalloc(mount_zone, M_WAITOK);
477	bzero(&mp->mnt_startzero,
478	    __rangeof(struct mount, mnt_startzero, mnt_endzero));
479	TAILQ_INIT(&mp->mnt_nvnodelist);
480	mp->mnt_nvnodelistsize = 0;
481	mp->mnt_ref = 0;
482	(void) vfs_busy(mp, MBF_NOWAIT);
483	mp->mnt_op = vfsp->vfc_vfsops;
484	mp->mnt_vfc = vfsp;
485	vfsp->vfc_refcount++;	/* XXX Unlocked */
486	mp->mnt_stat.f_type = vfsp->vfc_typenum;
487	mp->mnt_gen++;
488	strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
489	mp->mnt_vnodecovered = vp;
490	mp->mnt_cred = crdup(cred);
491	mp->mnt_stat.f_owner = cred->cr_uid;
492	strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN);
493	mp->mnt_iosize_max = DFLTPHYS;
494#ifdef MAC
495	mac_mount_init(mp);
496	mac_mount_create(cred, mp);
497#endif
498	arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
499	return (mp);
500}
501
/*
 * Destroy the mount struct previously allocated by vfs_mount_alloc().
 *
 * The function sleeps until the mount is no longer referenced, held,
 * or written to, checks that no vnodes remain on it, and then releases
 * the option list, the credentials and the structure itself.
 */
void
vfs_mount_destroy(struct mount *mp)
{

	MNT_ILOCK(mp);
	/* Sleep on the mount until its reference count reaches zero. */
	while (mp->mnt_ref)
		msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0);
	if (mp->mnt_holdcnt != 0) {
		printf("Waiting for mount point to be unheld\n");
		while (mp->mnt_holdcnt != 0) {
			/* Count ourselves as a waiter for the duration. */
			mp->mnt_holdcntwaiters++;
			msleep(&mp->mnt_holdcnt, MNT_MTX(mp),
			       PZERO, "mntdestroy", 0);
			mp->mnt_holdcntwaiters--;
		}
		printf("mount point unheld\n");
	}
	if (mp->mnt_writeopcount > 0) {
		printf("Waiting for mount point write ops\n");
		while (mp->mnt_writeopcount > 0) {
			/* MNTK_SUSPEND is (re)asserted before each sleep. */
			mp->mnt_kern_flag |= MNTK_SUSPEND;
			msleep(&mp->mnt_writeopcount,
			       MNT_MTX(mp),
			       PZERO, "mntdestroy2", 0);
		}
		printf("mount point write ops completed\n");
	}
	if (mp->mnt_secondary_writes > 0) {
		printf("Waiting for mount point secondary write ops\n");
		while (mp->mnt_secondary_writes > 0) {
			mp->mnt_kern_flag |= MNTK_SUSPEND;
			msleep(&mp->mnt_secondary_writes,
			       MNT_MTX(mp),
			       PZERO, "mntdestroy3", 0);
		}
		printf("mount point secondary write ops completed\n");
	}
	MNT_IUNLOCK(mp);
	/* Drop the filesystem-type reference taken in vfs_mount_alloc(). */
	mp->mnt_vfc->vfc_refcount--;
	if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) {
		struct vnode *vp;

		/* Print every leftover vnode before panicking. */
		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes)
			vprint("", vp);
		panic("unmount: dangling vnode");
	}
	MNT_ILOCK(mp);
	if (mp->mnt_kern_flag & MNTK_MWAIT)
		wakeup(mp);
	/* Final consistency checks under the interlock. */
	if (mp->mnt_writeopcount != 0)
		panic("vfs_mount_destroy: nonzero writeopcount");
	if (mp->mnt_secondary_writes != 0)
		panic("vfs_mount_destroy: nonzero secondary_writes");
	if (mp->mnt_nvnodelistsize != 0)
		panic("vfs_mount_destroy: nonzero nvnodelistsize");
	/*
	 * NOTE(review): the -1000 values look like poison markers meant to
	 * make any later use of a destroyed mount stand out -- confirm.
	 */
	mp->mnt_writeopcount = -1000;
	mp->mnt_nvnodelistsize = -1000;
	mp->mnt_secondary_writes = -1000;
	MNT_IUNLOCK(mp);
#ifdef MAC
	mac_mount_destroy(mp);
#endif
	if (mp->mnt_opt != NULL)
		vfs_freeopts(mp->mnt_opt);
	crfree(mp->mnt_cred);
	uma_zfree(mount_zone, mp);
}
572
573int
574vfs_donmount(struct thread *td, int fsflags, struct uio *fsoptions)
575{
576	struct vfsoptlist *optlist;
577	struct vfsopt *opt, *noro_opt, *tmp_opt;
578	char *fstype, *fspath, *errmsg;
579	int error, fstypelen, fspathlen, errmsg_len, errmsg_pos;
580	int has_rw, has_noro;
581
582	errmsg = fspath = NULL;
583	errmsg_len = has_noro = has_rw = fspathlen = 0;
584	errmsg_pos = -1;
585
586	error = vfs_buildopts(fsoptions, &optlist);
587	if (error)
588		return (error);
589
590	if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0)
591		errmsg_pos = vfs_getopt_pos(optlist, "errmsg");
592
593	/*
594	 * We need these two options before the others,
595	 * and they are mandatory for any filesystem.
596	 * Ensure they are NUL terminated as well.
597	 */
598	fstypelen = 0;
599	error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen);
600	if (error || fstype[fstypelen - 1] != '\0') {
601		error = EINVAL;
602		if (errmsg != NULL)
603			strncpy(errmsg, "Invalid fstype", errmsg_len);
604		goto bail;
605	}
606	fspathlen = 0;
607	error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen);
608	if (error || fspath[fspathlen - 1] != '\0') {
609		error = EINVAL;
610		if (errmsg != NULL)
611			strncpy(errmsg, "Invalid fspath", errmsg_len);
612		goto bail;
613	}
614
615	/*
616	 * We need to see if we have the "update" option
617	 * before we call vfs_domount(), since vfs_domount() has special
618	 * logic based on MNT_UPDATE.  This is very important
619	 * when we want to update the root filesystem.
620	 */
621	TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) {
622		if (strcmp(opt->name, "update") == 0) {
623			fsflags |= MNT_UPDATE;
624			vfs_freeopt(optlist, opt);
625		}
626		else if (strcmp(opt->name, "async") == 0)
627			fsflags |= MNT_ASYNC;
628		else if (strcmp(opt->name, "force") == 0) {
629			fsflags |= MNT_FORCE;
630			vfs_freeopt(optlist, opt);
631		}
632		else if (strcmp(opt->name, "reload") == 0) {
633			fsflags |= MNT_RELOAD;
634			vfs_freeopt(optlist, opt);
635		}
636		else if (strcmp(opt->name, "multilabel") == 0)
637			fsflags |= MNT_MULTILABEL;
638		else if (strcmp(opt->name, "noasync") == 0)
639			fsflags &= ~MNT_ASYNC;
640		else if (strcmp(opt->name, "noatime") == 0)
641			fsflags |= MNT_NOATIME;
642		else if (strcmp(opt->name, "atime") == 0) {
643			free(opt->name, M_MOUNT);
644			opt->name = strdup("nonoatime", M_MOUNT);
645		}
646		else if (strcmp(opt->name, "noclusterr") == 0)
647			fsflags |= MNT_NOCLUSTERR;
648		else if (strcmp(opt->name, "clusterr") == 0) {
649			free(opt->name, M_MOUNT);
650			opt->name = strdup("nonoclusterr", M_MOUNT);
651		}
652		else if (strcmp(opt->name, "noclusterw") == 0)
653			fsflags |= MNT_NOCLUSTERW;
654		else if (strcmp(opt->name, "clusterw") == 0) {
655			free(opt->name, M_MOUNT);
656			opt->name = strdup("nonoclusterw", M_MOUNT);
657		}
658		else if (strcmp(opt->name, "noexec") == 0)
659			fsflags |= MNT_NOEXEC;
660		else if (strcmp(opt->name, "exec") == 0) {
661			free(opt->name, M_MOUNT);
662			opt->name = strdup("nonoexec", M_MOUNT);
663		}
664		else if (strcmp(opt->name, "nosuid") == 0)
665			fsflags |= MNT_NOSUID;
666		else if (strcmp(opt->name, "suid") == 0) {
667			free(opt->name, M_MOUNT);
668			opt->name = strdup("nonosuid", M_MOUNT);
669		}
670		else if (strcmp(opt->name, "nosymfollow") == 0)
671			fsflags |= MNT_NOSYMFOLLOW;
672		else if (strcmp(opt->name, "symfollow") == 0) {
673			free(opt->name, M_MOUNT);
674			opt->name = strdup("nonosymfollow", M_MOUNT);
675		}
676		else if (strcmp(opt->name, "noro") == 0) {
677			fsflags &= ~MNT_RDONLY;
678			has_noro = 1;
679		}
680		else if (strcmp(opt->name, "rw") == 0) {
681			fsflags &= ~MNT_RDONLY;
682			has_rw = 1;
683		}
684		else if (strcmp(opt->name, "ro") == 0)
685			fsflags |= MNT_RDONLY;
686		else if (strcmp(opt->name, "rdonly") == 0) {
687			free(opt->name, M_MOUNT);
688			opt->name = strdup("ro", M_MOUNT);
689			fsflags |= MNT_RDONLY;
690		}
691		else if (strcmp(opt->name, "suiddir") == 0)
692			fsflags |= MNT_SUIDDIR;
693		else if (strcmp(opt->name, "sync") == 0)
694			fsflags |= MNT_SYNCHRONOUS;
695		else if (strcmp(opt->name, "union") == 0)
696			fsflags |= MNT_UNION;
697	}
698
699	/*
700	 * If "rw" was specified as a mount option, and we
701	 * are trying to update a mount-point from "ro" to "rw",
702	 * we need a mount option "noro", since in vfs_mergeopts(),
703	 * "noro" will cancel "ro", but "rw" will not do anything.
704	 */
705	if (has_rw && !has_noro) {
706		noro_opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
707		noro_opt->name = strdup("noro", M_MOUNT);
708		noro_opt->value = NULL;
709		noro_opt->len = 0;
710		TAILQ_INSERT_TAIL(optlist, noro_opt, link);
711	}
712
713	/*
714	 * Be ultra-paranoid about making sure the type and fspath
715	 * variables will fit in our mp buffers, including the
716	 * terminating NUL.
717	 */
718	if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) {
719		error = ENAMETOOLONG;
720		goto bail;
721	}
722
723	mtx_lock(&Giant);
724	error = vfs_domount(td, fstype, fspath, fsflags, optlist);
725	mtx_unlock(&Giant);
726bail:
727	/* copyout the errmsg */
728	if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt)
729	    && errmsg_len > 0 && errmsg != NULL) {
730		if (fsoptions->uio_segflg == UIO_SYSSPACE) {
731			bcopy(errmsg,
732			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
733			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
734		} else {
735			copyout(errmsg,
736			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
737			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
738		}
739	}
740
741	if (error != 0)
742		vfs_freeopts(optlist);
743	return (error);
744}
745
746/*
747 * Old mount API.
748 */
749#ifndef _SYS_SYSPROTO_H_
750struct mount_args {
751	char	*type;
752	char	*path;
753	int	flags;
754	caddr_t	data;
755};
756#endif
757/* ARGSUSED */
758int
759mount(td, uap)
760	struct thread *td;
761	struct mount_args /* {
762		char *type;
763		char *path;
764		int flags;
765		caddr_t data;
766	} */ *uap;
767{
768	char *fstype;
769	struct vfsconf *vfsp = NULL;
770	struct mntarg *ma = NULL;
771	int error;
772
773	AUDIT_ARG(fflags, uap->flags);
774
775	/*
776	 * Filter out MNT_ROOTFS.  We do not want clients of mount() in
777	 * userspace to set this flag, but we must filter it out if we want
778	 * MNT_UPDATE on the root file system to work.
779	 * MNT_ROOTFS should only be set in the kernel in vfs_mountroot_try().
780	 */
781	uap->flags &= ~MNT_ROOTFS;
782
783	fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK);
784	error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL);
785	if (error) {
786		free(fstype, M_TEMP);
787		return (error);
788	}
789
790	AUDIT_ARG(text, fstype);
791	mtx_lock(&Giant);
792	vfsp = vfs_byname_kld(fstype, td, &error);
793	free(fstype, M_TEMP);
794	if (vfsp == NULL) {
795		mtx_unlock(&Giant);
796		return (ENOENT);
797	}
798	if (vfsp->vfc_vfsops->vfs_cmount == NULL) {
799		mtx_unlock(&Giant);
800		return (EOPNOTSUPP);
801	}
802
803	ma = mount_argsu(ma, "fstype", uap->type, MNAMELEN);
804	ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN);
805	ma = mount_argb(ma, uap->flags & MNT_RDONLY, "noro");
806	ma = mount_argb(ma, !(uap->flags & MNT_NOSUID), "nosuid");
807	ma = mount_argb(ma, !(uap->flags & MNT_NOEXEC), "noexec");
808
809	error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, uap->flags, td);
810	mtx_unlock(&Giant);
811	return (error);
812}
813
814
815/*
816 * vfs_domount(): actually attempt a filesystem mount.
817 */
818static int
819vfs_domount(
820	struct thread *td,	/* Calling thread. */
821	const char *fstype,	/* Filesystem type. */
822	char *fspath,		/* Mount path. */
823	int fsflags,		/* Flags common to all filesystems. */
824	void *fsdata		/* Options local to the filesystem. */
825	)
826{
827	struct vnode *vp;
828	struct mount *mp;
829	struct vfsconf *vfsp;
830	struct oexport_args oexport;
831	struct export_args export;
832	int error, flag = 0;
833	struct vattr va;
834	struct nameidata nd;
835
836	mtx_assert(&Giant, MA_OWNED);
837	/*
838	 * Be ultra-paranoid about making sure the type and fspath
839	 * variables will fit in our mp buffers, including the
840	 * terminating NUL.
841	 */
842	if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
843		return (ENAMETOOLONG);
844
845	if (jailed(td->td_ucred) || usermount == 0) {
846		if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0)
847			return (error);
848	}
849
850	/*
851	 * Do not allow NFS export or MNT_SUIDDIR by unprivileged users.
852	 */
853	if (fsflags & MNT_EXPORTED) {
854		error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED);
855		if (error)
856			return (error);
857	}
858	if (fsflags & MNT_SUIDDIR) {
859		error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR);
860		if (error)
861			return (error);
862	}
863	/*
864	 * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users.
865	 */
866	if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) {
867		if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0)
868			fsflags |= MNT_NOSUID | MNT_USER;
869	}
870
871	/* Load KLDs before we lock the covered vnode to avoid reversals. */
872	vfsp = NULL;
873	if ((fsflags & MNT_UPDATE) == 0) {
874		/* Don't try to load KLDs if we're mounting the root. */
875		if (fsflags & MNT_ROOTFS)
876			vfsp = vfs_byname(fstype);
877		else
878			vfsp = vfs_byname_kld(fstype, td, &error);
879		if (vfsp == NULL)
880			return (ENODEV);
881		if (jailed(td->td_ucred) && !(vfsp->vfc_flags & VFCF_JAIL))
882			return (EPERM);
883	}
884	/*
885	 * Get vnode to be covered
886	 */
887	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE,
888	    fspath, td);
889	if ((error = namei(&nd)) != 0)
890		return (error);
891	NDFREE(&nd, NDF_ONLY_PNBUF);
892	vp = nd.ni_vp;
893	if (fsflags & MNT_UPDATE) {
894		if ((vp->v_vflag & VV_ROOT) == 0) {
895			vput(vp);
896			return (EINVAL);
897		}
898		mp = vp->v_mount;
899		MNT_ILOCK(mp);
900		flag = mp->mnt_flag;
901		/*
902		 * We only allow the filesystem to be reloaded if it
903		 * is currently mounted read-only.
904		 */
905		if ((fsflags & MNT_RELOAD) &&
906		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
907			MNT_IUNLOCK(mp);
908			vput(vp);
909			return (EOPNOTSUPP);	/* Needs translation */
910		}
911		MNT_IUNLOCK(mp);
912		/*
913		 * Only privileged root, or (if MNT_USER is set) the user that
914		 * did the original mount is permitted to update it.
915		 */
916		error = vfs_suser(mp, td);
917		if (error) {
918			vput(vp);
919			return (error);
920		}
921		if (vfs_busy(mp, MBF_NOWAIT)) {
922			vput(vp);
923			return (EBUSY);
924		}
925		VI_LOCK(vp);
926		if ((vp->v_iflag & VI_MOUNT) != 0 ||
927		    vp->v_mountedhere != NULL) {
928			VI_UNLOCK(vp);
929			vfs_unbusy(mp);
930			vput(vp);
931			return (EBUSY);
932		}
933		vp->v_iflag |= VI_MOUNT;
934		VI_UNLOCK(vp);
935		MNT_ILOCK(mp);
936		mp->mnt_flag |= fsflags &
937		    (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT | MNT_ROOTFS);
938		MNT_IUNLOCK(mp);
939		VOP_UNLOCK(vp, 0);
940		mp->mnt_optnew = fsdata;
941		vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
942	} else {
943		/*
944		 * If the user is not root, ensure that they own the directory
945		 * onto which we are attempting to mount.
946		 */
947		error = VOP_GETATTR(vp, &va, td->td_ucred);
948		if (error) {
949			vput(vp);
950			return (error);
951		}
952		if (va.va_uid != td->td_ucred->cr_uid) {
953			error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN,
954			    0);
955			if (error) {
956				vput(vp);
957				return (error);
958			}
959		}
960		error = vinvalbuf(vp, V_SAVE, 0, 0);
961		if (error != 0) {
962			vput(vp);
963			return (error);
964		}
965		if (vp->v_type != VDIR) {
966			vput(vp);
967			return (ENOTDIR);
968		}
969		VI_LOCK(vp);
970		if ((vp->v_iflag & VI_MOUNT) != 0 ||
971		    vp->v_mountedhere != NULL) {
972			VI_UNLOCK(vp);
973			vput(vp);
974			return (EBUSY);
975		}
976		vp->v_iflag |= VI_MOUNT;
977		VI_UNLOCK(vp);
978
979		/*
980		 * Allocate and initialize the filesystem.
981		 */
982		mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred);
983		VOP_UNLOCK(vp, 0);
984
985		/* XXXMAC: pass to vfs_mount_alloc? */
986		mp->mnt_optnew = fsdata;
987	}
988
989	/*
990	 * Set the mount level flags.
991	 */
992	MNT_ILOCK(mp);
993	mp->mnt_flag = (mp->mnt_flag & ~MNT_UPDATEMASK) |
994		(fsflags & (MNT_UPDATEMASK | MNT_FORCE | MNT_ROOTFS |
995			    MNT_RDONLY));
996	if ((mp->mnt_flag & MNT_ASYNC) == 0)
997		mp->mnt_kern_flag &= ~MNTK_ASYNC;
998	MNT_IUNLOCK(mp);
999	/*
1000	 * Mount the filesystem.
1001	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
1002	 * get.  No freeing of cn_pnbuf.
1003	 */
1004        error = VFS_MOUNT(mp, td);
1005
1006	/*
1007	 * Process the export option only if we are
1008	 * updating mount options.
1009	 */
1010	if (!error && (fsflags & MNT_UPDATE)) {
1011		if (vfs_copyopt(mp->mnt_optnew, "export", &export,
1012		    sizeof(export)) == 0)
1013			error = vfs_export(mp, &export);
1014		else if (vfs_copyopt(mp->mnt_optnew, "export", &oexport,
1015			sizeof(oexport)) == 0) {
1016			export.ex_flags = oexport.ex_flags;
1017			export.ex_root = oexport.ex_root;
1018			export.ex_anon = oexport.ex_anon;
1019			export.ex_addr = oexport.ex_addr;
1020			export.ex_addrlen = oexport.ex_addrlen;
1021			export.ex_mask = oexport.ex_mask;
1022			export.ex_masklen = oexport.ex_masklen;
1023			export.ex_indexfile = oexport.ex_indexfile;
1024			export.ex_numsecflavors = 0;
1025			error = vfs_export(mp, &export);
1026		}
1027	}
1028
1029	if (!error) {
1030		if (mp->mnt_opt != NULL)
1031			vfs_freeopts(mp->mnt_opt);
1032		mp->mnt_opt = mp->mnt_optnew;
1033		(void)VFS_STATFS(mp, &mp->mnt_stat, td);
1034	}
1035	/*
1036	 * Prevent external consumers of mount options from reading
1037	 * mnt_optnew.
1038	*/
1039	mp->mnt_optnew = NULL;
1040	if (mp->mnt_flag & MNT_UPDATE) {
1041		MNT_ILOCK(mp);
1042		if (error)
1043			mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) |
1044				(flag & ~MNT_QUOTA);
1045		else
1046			mp->mnt_flag &=	~(MNT_UPDATE | MNT_RELOAD |
1047					  MNT_FORCE | MNT_SNAPSHOT);
1048		if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
1049			mp->mnt_kern_flag |= MNTK_ASYNC;
1050		else
1051			mp->mnt_kern_flag &= ~MNTK_ASYNC;
1052		MNT_IUNLOCK(mp);
1053		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1054			if (mp->mnt_syncer == NULL)
1055				error = vfs_allocate_syncvnode(mp);
1056		} else {
1057			if (mp->mnt_syncer != NULL)
1058				vrele(mp->mnt_syncer);
1059			mp->mnt_syncer = NULL;
1060		}
1061		vfs_unbusy(mp);
1062		VI_LOCK(vp);
1063		vp->v_iflag &= ~VI_MOUNT;
1064		VI_UNLOCK(vp);
1065		vrele(vp);
1066		return (error);
1067	}
1068	MNT_ILOCK(mp);
1069	if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
1070		mp->mnt_kern_flag |= MNTK_ASYNC;
1071	else
1072		mp->mnt_kern_flag &= ~MNTK_ASYNC;
1073	MNT_IUNLOCK(mp);
1074	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1075	/*
1076	 * Put the new filesystem on the mount list after root.
1077	 */
1078	cache_purge(vp);
1079	if (!error) {
1080		struct vnode *newdp;
1081
1082		VI_LOCK(vp);
1083		vp->v_iflag &= ~VI_MOUNT;
1084		VI_UNLOCK(vp);
1085		vp->v_mountedhere = mp;
1086		mtx_lock(&mountlist_mtx);
1087		TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
1088		mtx_unlock(&mountlist_mtx);
1089		vfs_event_signal(NULL, VQ_MOUNT, 0);
1090		if (VFS_ROOT(mp, LK_EXCLUSIVE, &newdp, td))
1091			panic("mount: lost mount");
1092		mountcheckdirs(vp, newdp);
1093		vput(newdp);
1094		VOP_UNLOCK(vp, 0);
1095		if ((mp->mnt_flag & MNT_RDONLY) == 0)
1096			error = vfs_allocate_syncvnode(mp);
1097		vfs_unbusy(mp);
1098		if (error)
1099			vrele(vp);
1100	} else {
1101		VI_LOCK(vp);
1102		vp->v_iflag &= ~VI_MOUNT;
1103		VI_UNLOCK(vp);
1104		vfs_unbusy(mp);
1105		vfs_mount_destroy(mp);
1106		vput(vp);
1107	}
1108	return (error);
1109}
1110
1111/*
1112 * Unmount a filesystem.
1113 *
1114 * Note: unmount takes a path to the vnode mounted on as argument, not
1115 * special file (as before).
1116 */
1117#ifndef _SYS_SYSPROTO_H_
1118struct unmount_args {
1119	char	*path;
1120	int	flags;
1121};
1122#endif
1123/* ARGSUSED */
1124int
1125unmount(td, uap)
1126	struct thread *td;
1127	register struct unmount_args /* {
1128		char *path;
1129		int flags;
1130	} */ *uap;
1131{
1132	struct mount *mp;
1133	char *pathbuf;
1134	int error, id0, id1;
1135
1136	if (jailed(td->td_ucred) || usermount == 0) {
1137		error = priv_check(td, PRIV_VFS_UNMOUNT);
1138		if (error)
1139			return (error);
1140	}
1141
1142	pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
1143	error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL);
1144	if (error) {
1145		free(pathbuf, M_TEMP);
1146		return (error);
1147	}
1148	AUDIT_ARG(upath, td, pathbuf, ARG_UPATH1);
1149	mtx_lock(&Giant);
1150	if (uap->flags & MNT_BYFSID) {
1151		/* Decode the filesystem ID. */
1152		if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) {
1153			mtx_unlock(&Giant);
1154			free(pathbuf, M_TEMP);
1155			return (EINVAL);
1156		}
1157
1158		mtx_lock(&mountlist_mtx);
1159		TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
1160			if (mp->mnt_stat.f_fsid.val[0] == id0 &&
1161			    mp->mnt_stat.f_fsid.val[1] == id1)
1162				break;
1163		}
1164		mtx_unlock(&mountlist_mtx);
1165	} else {
1166		mtx_lock(&mountlist_mtx);
1167		TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
1168			if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0)
1169				break;
1170		}
1171		mtx_unlock(&mountlist_mtx);
1172	}
1173	free(pathbuf, M_TEMP);
1174	if (mp == NULL) {
1175		/*
1176		 * Previously we returned ENOENT for a nonexistent path and
1177		 * EINVAL for a non-mountpoint.  We cannot tell these apart
1178		 * now, so in the !MNT_BYFSID case return the more likely
1179		 * EINVAL for compatibility.
1180		 */
1181		mtx_unlock(&Giant);
1182		return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL);
1183	}
1184
1185	/*
1186	 * Don't allow unmounting the root filesystem.
1187	 */
1188	if (mp->mnt_flag & MNT_ROOTFS) {
1189		mtx_unlock(&Giant);
1190		return (EINVAL);
1191	}
1192	error = dounmount(mp, uap->flags, td);
1193	mtx_unlock(&Giant);
1194	return (error);
1195}
1196
1197/*
1198 * Do the actual filesystem unmount.
1199 */
1200int
1201dounmount(mp, flags, td)
1202	struct mount *mp;
1203	int flags;
1204	struct thread *td;
1205{
1206	struct vnode *coveredvp, *fsrootvp;
1207	int error;
1208	int async_flag;
1209	int mnt_gen_r;
1210
1211	mtx_assert(&Giant, MA_OWNED);
1212
1213	if ((coveredvp = mp->mnt_vnodecovered) != NULL) {
1214		mnt_gen_r = mp->mnt_gen;
1215		VI_LOCK(coveredvp);
1216		vholdl(coveredvp);
1217		vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY);
1218		vdrop(coveredvp);
1219		/*
1220		 * Check for mp being unmounted while waiting for the
1221		 * covered vnode lock.
1222		 */
1223		if (coveredvp->v_mountedhere != mp ||
1224		    coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) {
1225			VOP_UNLOCK(coveredvp, 0);
1226			return (EBUSY);
1227		}
1228	}
1229	/*
1230	 * Only privileged root, or (if MNT_USER is set) the user that did the
1231	 * original mount is permitted to unmount this filesystem.
1232	 */
1233	error = vfs_suser(mp, td);
1234	if (error) {
1235		if (coveredvp)
1236			VOP_UNLOCK(coveredvp, 0);
1237		return (error);
1238	}
1239
1240	MNT_ILOCK(mp);
1241	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
1242		MNT_IUNLOCK(mp);
1243		if (coveredvp)
1244			VOP_UNLOCK(coveredvp, 0);
1245		return (EBUSY);
1246	}
1247	mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_NOINSMNTQ;
1248	/* Allow filesystems to detect that a forced unmount is in progress. */
1249	if (flags & MNT_FORCE)
1250		mp->mnt_kern_flag |= MNTK_UNMOUNTF;
1251	error = 0;
1252	if (mp->mnt_lockref) {
1253		if (flags & MNT_FORCE) {
1254			mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_NOINSMNTQ |
1255			    MNTK_UNMOUNTF);
1256			if (mp->mnt_kern_flag & MNTK_MWAIT) {
1257				mp->mnt_kern_flag &= ~MNTK_MWAIT;
1258				wakeup(mp);
1259			}
1260			MNT_IUNLOCK(mp);
1261			if (coveredvp)
1262				VOP_UNLOCK(coveredvp, 0);
1263			return (EBUSY);
1264		}
1265		mp->mnt_kern_flag |= MNTK_DRAINING;
1266		error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS,
1267		    "mount drain", 0);
1268	}
1269	MNT_IUNLOCK(mp);
1270	KASSERT(mp->mnt_lockref == 0,
1271	    ("%s: invalid lock refcount in the drain path @ %s:%d",
1272	    __func__, __FILE__, __LINE__));
1273	KASSERT(error == 0,
1274	    ("%s: invalid return value for msleep in the drain path @ %s:%d",
1275	    __func__, __FILE__, __LINE__));
1276	vn_start_write(NULL, &mp, V_WAIT);
1277
1278	if (mp->mnt_flag & MNT_EXPUBLIC)
1279		vfs_setpublicfs(NULL, NULL, NULL);
1280
1281	vfs_msync(mp, MNT_WAIT);
1282	MNT_ILOCK(mp);
1283	async_flag = mp->mnt_flag & MNT_ASYNC;
1284	mp->mnt_flag &= ~MNT_ASYNC;
1285	mp->mnt_kern_flag &= ~MNTK_ASYNC;
1286	MNT_IUNLOCK(mp);
1287	cache_purgevfs(mp);	/* remove cache entries for this file sys */
1288	if (mp->mnt_syncer != NULL)
1289		vrele(mp->mnt_syncer);
1290	/*
1291	 * For forced unmounts, move process cdir/rdir refs on the fs root
1292	 * vnode to the covered vnode.  For non-forced unmounts we want
1293	 * such references to cause an EBUSY error.
1294	 */
1295	if ((flags & MNT_FORCE) &&
1296	    VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp, td) == 0) {
1297		if (mp->mnt_vnodecovered != NULL)
1298			mountcheckdirs(fsrootvp, mp->mnt_vnodecovered);
1299		if (fsrootvp == rootvnode) {
1300			vrele(rootvnode);
1301			rootvnode = NULL;
1302		}
1303		vput(fsrootvp);
1304	}
1305	if (((mp->mnt_flag & MNT_RDONLY) ||
1306	     (error = VFS_SYNC(mp, MNT_WAIT, td)) == 0) ||
1307	    (flags & MNT_FORCE)) {
1308		error = VFS_UNMOUNT(mp, flags, td);
1309	}
1310	vn_finished_write(mp);
1311	/*
1312	 * If we failed to flush the dirty blocks for this mount point,
1313	 * undo all the cdir/rdir and rootvnode changes we made above.
1314	 * Unless we failed to do so because the device is reporting that
1315	 * it doesn't exist anymore.
1316	 */
1317	if (error && error != ENXIO) {
1318		if ((flags & MNT_FORCE) &&
1319		    VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp, td) == 0) {
1320			if (mp->mnt_vnodecovered != NULL)
1321				mountcheckdirs(mp->mnt_vnodecovered, fsrootvp);
1322			if (rootvnode == NULL) {
1323				rootvnode = fsrootvp;
1324				vref(rootvnode);
1325			}
1326			vput(fsrootvp);
1327		}
1328		MNT_ILOCK(mp);
1329		mp->mnt_kern_flag &= ~MNTK_NOINSMNTQ;
1330		if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) {
1331			MNT_IUNLOCK(mp);
1332			(void) vfs_allocate_syncvnode(mp);
1333			MNT_ILOCK(mp);
1334		}
1335		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
1336		mp->mnt_flag |= async_flag;
1337		if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
1338			mp->mnt_kern_flag |= MNTK_ASYNC;
1339		if (mp->mnt_kern_flag & MNTK_MWAIT) {
1340			mp->mnt_kern_flag &= ~MNTK_MWAIT;
1341			wakeup(mp);
1342		}
1343		MNT_IUNLOCK(mp);
1344		if (coveredvp)
1345			VOP_UNLOCK(coveredvp, 0);
1346		return (error);
1347	}
1348	mtx_lock(&mountlist_mtx);
1349	TAILQ_REMOVE(&mountlist, mp, mnt_list);
1350	mtx_unlock(&mountlist_mtx);
1351	if (coveredvp != NULL) {
1352		coveredvp->v_mountedhere = NULL;
1353		vput(coveredvp);
1354	}
1355	vfs_event_signal(NULL, VQ_UNMOUNT, 0);
1356	vfs_mount_destroy(mp);
1357	return (0);
1358}
1359
1360/*
1361 * ---------------------------------------------------------------------
1362 * Mounting of root filesystem
1363 *
1364 */
1365
/*
 * One outstanding request to delay mounting of the root filesystem.
 * Tokens are created by root_mount_hold() and destroyed by
 * root_mount_rel().
 */
1366struct root_hold_token {
	/* Identifier supplied to root_mount_hold(); printed while waiting. */
1367	const char			*who;
	/* Linkage on root_holds. */
1368	LIST_ENTRY(root_hold_token)	list;
1369};
1370
/* All outstanding holds; modified and inspected with mountlist_mtx held. */
1371static LIST_HEAD(, root_hold_token)	root_holds =
1372    LIST_HEAD_INITIALIZER(&root_holds);
1373
/* Set to 1 by root_mount_done(); also the wait channel for root_mount_wait(). */
1374static int root_mount_complete;
1375
1376/*
1377 * Hold root mount.
1378 */
1379struct root_hold_token *
1380root_mount_hold(const char *identifier)
1381{
1382	struct root_hold_token *h;
1383
1384	h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK);
1385	h->who = identifier;
1386	mtx_lock(&mountlist_mtx);
1387	LIST_INSERT_HEAD(&root_holds, h, list);
1388	mtx_unlock(&mountlist_mtx);
1389	return (h);
1390}
1391
1392/*
1393 * Release root mount.
1394 */
1395void
1396root_mount_rel(struct root_hold_token *h)
1397{
1398
1399	mtx_lock(&mountlist_mtx);
1400	LIST_REMOVE(h, list);
1401	wakeup(&root_holds);
1402	mtx_unlock(&mountlist_mtx);
1403	free(h, M_DEVBUF);
1404}
1405
1406/*
1407 * Wait for all subsystems to release root mount.
1408 */
1409static void
1410root_mount_prepare(void)
1411{
1412	struct root_hold_token *h;
1413
1414	for (;;) {
1415		DROP_GIANT();
1416		g_waitidle();
1417		PICKUP_GIANT();
1418		mtx_lock(&mountlist_mtx);
1419		if (LIST_EMPTY(&root_holds)) {
1420			mtx_unlock(&mountlist_mtx);
1421			break;
1422		}
1423		printf("Root mount waiting for:");
1424		LIST_FOREACH(h, &root_holds, list)
1425			printf(" %s", h->who);
1426		printf("\n");
1427		msleep(&root_holds, &mountlist_mtx, PZERO | PDROP, "roothold",
1428		    hz);
1429	}
1430}
1431
1432/*
1433 * Root was mounted, share the good news.
1434 */
1435static void
1436root_mount_done(void)
1437{
1438
1439	/*
1440	 * Use a mutex to prevent the wakeup being missed and waiting for
1441	 * an extra 1 second sleep.
1442	 */
1443	mtx_lock(&mountlist_mtx);
1444	root_mount_complete = 1;
1445	wakeup(&root_mount_complete);
1446	mtx_unlock(&mountlist_mtx);
1447}
1448
1449/*
1450 * Return true if root is already mounted.
1451 */
1452int
1453root_mounted(void)
1454{
1455
1456	/* No mutex is acquired here because int stores are atomic. */
1457	return (root_mount_complete);
1458}
1459
1460/*
1461 * Wait until root is mounted.
1462 */
1463void
1464root_mount_wait(void)
1465{
1466
1467	/*
1468	 * Panic on an obvious deadlock - the function can't be called from
1469	 * a thread which is doing the whole SYSINIT stuff.
1470	 */
1471	KASSERT(curthread->td_proc->p_pid != 0,
1472	    ("root_mount_wait: cannot be called from the swapper thread"));
1473	mtx_lock(&mountlist_mtx);
1474	while (!root_mount_complete) {
1475		msleep(&root_mount_complete, &mountlist_mtx, PZERO, "rootwait",
1476		    hz);
1477	}
1478	mtx_unlock(&mountlist_mtx);
1479}
1480
1481static void
1482set_rootvnode(struct thread *td)
1483{
1484	struct proc *p;
1485
1486	if (VFS_ROOT(TAILQ_FIRST(&mountlist), LK_EXCLUSIVE, &rootvnode, td))
1487		panic("Cannot find root vnode");
1488
1489	p = td->td_proc;
1490	FILEDESC_XLOCK(p->p_fd);
1491
1492	if (p->p_fd->fd_cdir != NULL)
1493		vrele(p->p_fd->fd_cdir);
1494	p->p_fd->fd_cdir = rootvnode;
1495	VREF(rootvnode);
1496
1497	if (p->p_fd->fd_rdir != NULL)
1498		vrele(p->p_fd->fd_rdir);
1499	p->p_fd->fd_rdir = rootvnode;
1500	VREF(rootvnode);
1501
1502	FILEDESC_XUNLOCK(p->p_fd);
1503
1504	VOP_UNLOCK(rootvnode, 0);
1505
1506	EVENTHANDLER_INVOKE(mountroot);
1507}
1508
1509/*
1510 * Mount /devfs as our root filesystem, but do not put it on the mountlist
1511 * yet.  Create a /dev -> / symlink so that absolute pathnames will lookup.
1512 */
1513
1514static void
1515devfs_first(void)
1516{
1517	struct thread *td = curthread;
1518	struct vfsoptlist *opts;
1519	struct vfsconf *vfsp;
1520	struct mount *mp = NULL;
1521	int error;
1522
1523	vfsp = vfs_byname("devfs");
1524	KASSERT(vfsp != NULL, ("Could not find devfs by name"));
1525	if (vfsp == NULL)
1526		return;
1527
1528	mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td->td_ucred);
1529
1530	error = VFS_MOUNT(mp, td);
1531	KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error));
1532	if (error)
1533		return;
1534
1535	opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
1536	TAILQ_INIT(opts);
1537	mp->mnt_opt = opts;
1538
1539	mtx_lock(&mountlist_mtx);
1540	TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list);
1541	mtx_unlock(&mountlist_mtx);
1542
1543	set_rootvnode(td);
1544
1545	error = kern_symlink(td, "/", "dev", UIO_SYSSPACE);
1546	if (error)
1547		printf("kern_symlink /dev -> / returns %d\n", error);
1548}
1549
1550/*
1551 * Surgically move our devfs to be mounted on /dev.
1552 */
1553
1554static void
1555devfs_fixup(struct thread *td)
1556{
1557	struct nameidata nd;
1558	int error;
1559	struct vnode *vp, *dvp;
1560	struct mount *mp;
1561
1562	/* Remove our devfs mount from the mountlist and purge the cache */
1563	mtx_lock(&mountlist_mtx);
1564	mp = TAILQ_FIRST(&mountlist);
1565	TAILQ_REMOVE(&mountlist, mp, mnt_list);
1566	mtx_unlock(&mountlist_mtx);
1567	cache_purgevfs(mp);
1568
1569	VFS_ROOT(mp, LK_EXCLUSIVE, &dvp, td);
1570	VI_LOCK(dvp);
1571	dvp->v_iflag &= ~VI_MOUNT;
1572	VI_UNLOCK(dvp);
1573	dvp->v_mountedhere = NULL;
1574
1575	/* Set up the real rootvnode, and purge the cache */
1576	TAILQ_FIRST(&mountlist)->mnt_vnodecovered = NULL;
1577	set_rootvnode(td);
1578	cache_purgevfs(rootvnode->v_mount);
1579
1580	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, "/dev", td);
1581	error = namei(&nd);
1582	if (error) {
1583		printf("Lookup of /dev for devfs, error: %d\n", error);
1584		return;
1585	}
1586	NDFREE(&nd, NDF_ONLY_PNBUF);
1587	vp = nd.ni_vp;
1588	if (vp->v_type != VDIR) {
1589		vput(vp);
1590	}
1591	error = vinvalbuf(vp, V_SAVE, 0, 0);
1592	if (error) {
1593		vput(vp);
1594	}
1595	cache_purge(vp);
1596	mp->mnt_vnodecovered = vp;
1597	vp->v_mountedhere = mp;
1598	mtx_lock(&mountlist_mtx);
1599	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
1600	mtx_unlock(&mountlist_mtx);
1601	VOP_UNLOCK(vp, 0);
1602	vput(dvp);
1603	vfs_unbusy(mp);
1604
1605	/* Unlink the no longer needed /dev/dev -> / symlink */
1606	kern_unlink(td, "/dev/dev", UIO_SYSSPACE);
1607}
1608
1609/*
1610 * Report errors during filesystem mounting.
1611 */
1612void
1613vfs_mount_error(struct mount *mp, const char *fmt, ...)
1614{
1615	struct vfsoptlist *moptlist = mp->mnt_optnew;
1616	va_list ap;
1617	int error, len;
1618	char *errmsg;
1619
1620	error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len);
1621	if (error || errmsg == NULL || len <= 0)
1622		return;
1623
1624	va_start(ap, fmt);
1625	vsnprintf(errmsg, (size_t)len, fmt, ap);
1626	va_end(ap);
1627}
1628
1629/*
1630 * Find and mount the root filesystem
1631 */
1632void
1633vfs_mountroot(void)
1634{
1635	char *cp;
1636	int error, i, asked = 0;
1637
1638	root_mount_prepare();
1639
1640	mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount),
1641	    NULL, NULL, mount_init, mount_fini,
1642	    UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
1643	devfs_first();
1644
1645	/*
1646	 * We are booted with instructions to prompt for the root filesystem.
1647	 */
1648	if (boothowto & RB_ASKNAME) {
1649		if (!vfs_mountroot_ask())
1650			goto mounted;
1651		asked = 1;
1652	}
1653
1654	/*
1655	 * The root filesystem information is compiled in, and we are
1656	 * booted with instructions to use it.
1657	 */
1658	if (ctrootdevname != NULL && (boothowto & RB_DFLTROOT)) {
1659		if (!vfs_mountroot_try(ctrootdevname))
1660			goto mounted;
1661		ctrootdevname = NULL;
1662	}
1663
1664	/*
1665	 * We've been given the generic "use CDROM as root" flag.  This is
1666	 * necessary because one media may be used in many different
1667	 * devices, so we need to search for them.
1668	 */
1669	if (boothowto & RB_CDROM) {
1670		for (i = 0; cdrom_rootdevnames[i] != NULL; i++) {
1671			if (!vfs_mountroot_try(cdrom_rootdevnames[i]))
1672				goto mounted;
1673		}
1674	}
1675
1676	/*
1677	 * Try to use the value read by the loader from /etc/fstab, or
1678	 * supplied via some other means.  This is the preferred
1679	 * mechanism.
1680	 */
1681	cp = getenv("vfs.root.mountfrom");
1682	if (cp != NULL) {
1683		error = vfs_mountroot_try(cp);
1684		freeenv(cp);
1685		if (!error)
1686			goto mounted;
1687	}
1688
1689	/*
1690	 * Try values that may have been computed by code during boot
1691	 */
1692	if (!vfs_mountroot_try(rootdevnames[0]))
1693		goto mounted;
1694	if (!vfs_mountroot_try(rootdevnames[1]))
1695		goto mounted;
1696
1697	/*
1698	 * If we (still) have a compiled-in default, try it.
1699	 */
1700	if (ctrootdevname != NULL)
1701		if (!vfs_mountroot_try(ctrootdevname))
1702			goto mounted;
1703	/*
1704	 * Everything so far has failed, prompt on the console if we haven't
1705	 * already tried that.
1706	 */
1707	if (!asked)
1708		if (!vfs_mountroot_ask())
1709			goto mounted;
1710
1711	panic("Root mount failed, startup aborted.");
1712
1713mounted:
1714	root_mount_done();
1715}
1716
1717/*
1718 * Mount (mountfrom) as the root filesystem.
1719 */
1720static int
1721vfs_mountroot_try(const char *mountfrom)
1722{
1723	struct mount	*mp;
1724	char		*vfsname, *path;
1725	time_t		timebase;
1726	int		error;
1727	char		patt[32];
1728
1729	vfsname = NULL;
1730	path    = NULL;
1731	mp      = NULL;
1732	error   = EINVAL;
1733
1734	if (mountfrom == NULL)
1735		return (error);		/* don't complain */
1736	printf("Trying to mount root from %s\n", mountfrom);
1737
1738	/* parse vfs name and path */
1739	vfsname = malloc(MFSNAMELEN, M_MOUNT, M_WAITOK);
1740	path = malloc(MNAMELEN, M_MOUNT, M_WAITOK);
1741	vfsname[0] = path[0] = 0;
1742	sprintf(patt, "%%%d[a-z0-9]:%%%ds", MFSNAMELEN, MNAMELEN);
1743	if (sscanf(mountfrom, patt, vfsname, path) < 1)
1744		goto out;
1745
1746	if (path[0] == '\0')
1747		strcpy(path, ROOTNAME);
1748
1749	error = kernel_vmount(
1750	    MNT_RDONLY | MNT_ROOTFS,
1751	    "fstype", vfsname,
1752	    "fspath", "/",
1753	    "from", path,
1754	    NULL);
1755	if (error == 0) {
1756		/*
1757		 * We mount devfs prior to mounting the / FS, so the first
1758		 * entry will typically be devfs.
1759		 */
1760		mp = TAILQ_FIRST(&mountlist);
1761		KASSERT(mp != NULL, ("%s: mountlist is empty", __func__));
1762
1763		/*
1764		 * Iterate over all currently mounted file systems and use
1765		 * the time stamp found to check and/or initialize the RTC.
1766		 * Typically devfs has no time stamp and the only other FS
1767		 * is the actual / FS.
1768		 * Call inittodr() only once and pass it the largest of the
1769		 * timestamps we encounter.
1770		 */
1771		timebase = 0;
1772		do {
1773			if (mp->mnt_time > timebase)
1774				timebase = mp->mnt_time;
1775			mp = TAILQ_NEXT(mp, mnt_list);
1776		} while (mp != NULL);
1777		inittodr(timebase);
1778
1779		devfs_fixup(curthread);
1780	}
1781out:
1782	free(path, M_MOUNT);
1783	free(vfsname, M_MOUNT);
1784	return (error);
1785}
1786
1787/*
1788 * ---------------------------------------------------------------------
1789 * Interactive root filesystem selection code.
1790 */
1791
/*
 * Prompt on the console for a root filesystem specification until one
 * mounts or the operator gives up.
 *
 * Returns 0 once vfs_mountroot_try() succeeds on the entered string and 1
 * when an empty line is entered.  Entering "?" lists the GEOM disk
 * devices and prompts again.
 */
static int
vfs_mountroot_ask(void)
{
	char name[128];

	for (;;) {
		printf("\nManual root filesystem specification:\n");
		printf("  <fstype>:<device>  Mount <device> using filesystem <fstype>\n");
#if defined(__amd64__) || defined(__i386__) || defined(__ia64__)
		printf("                       eg. ufs:da0s1a\n");
#else
		printf("                       eg. ufs:/dev/da0a\n");
#endif
		printf("  ?                  List valid disk boot devices\n");
		printf("  <empty line>       Abort manual input\n");
		printf("\nmountroot> ");
		gets(name, sizeof(name), 1);

		switch (name[0]) {
		case '\0':
			return (1);
		case '?':
			printf("\nList of GEOM managed disk devices:\n  ");
			g_dev_print();
			continue;
		default:
			if (vfs_mountroot_try(name) == 0)
				return (0);
			break;
		}
	}
}
1820
1821/*
1822 * ---------------------------------------------------------------------
1823 * Functions for querying mount options/arguments from filesystems.
1824 */
1825
1826/*
1827 * Check that no unknown options are given
1828 */
1829int
1830vfs_filteropt(struct vfsoptlist *opts, const char **legal)
1831{
1832	struct vfsopt *opt;
1833	char errmsg[255];
1834	const char **t, *p, *q;
1835	int ret = 0;
1836
1837	TAILQ_FOREACH(opt, opts, link) {
1838		p = opt->name;
1839		q = NULL;
1840		if (p[0] == 'n' && p[1] == 'o')
1841			q = p + 2;
1842		for(t = global_opts; *t != NULL; t++) {
1843			if (strcmp(*t, p) == 0)
1844				break;
1845			if (q != NULL) {
1846				if (strcmp(*t, q) == 0)
1847					break;
1848			}
1849		}
1850		if (*t != NULL)
1851			continue;
1852		for(t = legal; *t != NULL; t++) {
1853			if (strcmp(*t, p) == 0)
1854				break;
1855			if (q != NULL) {
1856				if (strcmp(*t, q) == 0)
1857					break;
1858			}
1859		}
1860		if (*t != NULL)
1861			continue;
1862		snprintf(errmsg, sizeof(errmsg),
1863		    "mount option <%s> is unknown", p);
1864		printf("%s\n", errmsg);
1865		ret = EINVAL;
1866	}
1867	if (ret != 0) {
1868		TAILQ_FOREACH(opt, opts, link) {
1869			if (strcmp(opt->name, "errmsg") == 0) {
1870				strncpy((char *)opt->value, errmsg, opt->len);
1871			}
1872		}
1873	}
1874	return (ret);
1875}
1876
1877/*
1878 * Get a mount option by its name.
1879 *
1880 * Return 0 if the option was found, ENOENT otherwise.
1881 * If len is non-NULL it will be filled with the length
1882 * of the option. If buf is non-NULL, it will be filled
1883 * with the address of the option.
1884 */
1885int
1886vfs_getopt(opts, name, buf, len)
1887	struct vfsoptlist *opts;
1888	const char *name;
1889	void **buf;
1890	int *len;
1891{
1892	struct vfsopt *opt;
1893
1894	KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
1895
1896	TAILQ_FOREACH(opt, opts, link) {
1897		if (strcmp(name, opt->name) == 0) {
1898			if (len != NULL)
1899				*len = opt->len;
1900			if (buf != NULL)
1901				*buf = opt->value;
1902			return (0);
1903		}
1904	}
1905	return (ENOENT);
1906}
1907
1908static int
1909vfs_getopt_pos(struct vfsoptlist *opts, const char *name)
1910{
1911	struct vfsopt *opt;
1912	int i;
1913
1914	if (opts == NULL)
1915		return (-1);
1916
1917	i = 0;
1918	TAILQ_FOREACH(opt, opts, link) {
1919		if (strcmp(name, opt->name) == 0)
1920			return (i);
1921		++i;
1922	}
1923	return (-1);
1924}
1925
1926char *
1927vfs_getopts(struct vfsoptlist *opts, const char *name, int *error)
1928{
1929	struct vfsopt *opt;
1930
1931	*error = 0;
1932	TAILQ_FOREACH(opt, opts, link) {
1933		if (strcmp(name, opt->name) != 0)
1934			continue;
1935		if (((char *)opt->value)[opt->len - 1] != '\0') {
1936			*error = EINVAL;
1937			return (NULL);
1938		}
1939		return (opt->value);
1940	}
1941	*error = ENOENT;
1942	return (NULL);
1943}
1944
1945int
1946vfs_flagopt(struct vfsoptlist *opts, const char *name, u_int *w, u_int val)
1947{
1948	struct vfsopt *opt;
1949
1950	TAILQ_FOREACH(opt, opts, link) {
1951		if (strcmp(name, opt->name) == 0) {
1952			if (w != NULL)
1953				*w |= val;
1954			return (1);
1955		}
1956	}
1957	if (w != NULL)
1958		*w &= ~val;
1959	return (0);
1960}
1961
1962int
1963vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...)
1964{
1965	va_list ap;
1966	struct vfsopt *opt;
1967	int ret;
1968
1969	KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
1970
1971	TAILQ_FOREACH(opt, opts, link) {
1972		if (strcmp(name, opt->name) != 0)
1973			continue;
1974		if (opt->len == 0 || opt->value == NULL)
1975			return (0);
1976		if (((char *)opt->value)[opt->len - 1] != '\0')
1977			return (0);
1978		va_start(ap, fmt);
1979		ret = vsscanf(opt->value, fmt, ap);
1980		va_end(ap);
1981		return (ret);
1982	}
1983	return (0);
1984}
1985
1986/*
1987 * Find and copy a mount option.
1988 *
1989 * The size of the buffer has to be specified
1990 * in len, if it is not the same length as the
1991 * mount option, EINVAL is returned.
1992 * Returns ENOENT if the option is not found.
1993 */
1994int
1995vfs_copyopt(opts, name, dest, len)
1996	struct vfsoptlist *opts;
1997	const char *name;
1998	void *dest;
1999	int len;
2000{
2001	struct vfsopt *opt;
2002
2003	KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL"));
2004
2005	TAILQ_FOREACH(opt, opts, link) {
2006		if (strcmp(name, opt->name) == 0) {
2007			if (len != opt->len)
2008				return (EINVAL);
2009			bcopy(opt->value, dest, opt->len);
2010			return (0);
2011		}
2012	}
2013	return (ENOENT);
2014}
2015
2016/*
2017 * This is a helper function for filesystems to traverse their
2018 * vnodes.  See MNT_VNODE_FOREACH() in sys/mount.h
2019 */
2020
2021struct vnode *
2022__mnt_vnode_next(struct vnode **mvp, struct mount *mp)
2023{
2024	struct vnode *vp;
2025
2026	mtx_assert(MNT_MTX(mp), MA_OWNED);
2027
2028	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
2029	if ((*mvp)->v_yield++ == 500) {
2030		MNT_IUNLOCK(mp);
2031		(*mvp)->v_yield = 0;
2032		uio_yield();
2033		MNT_ILOCK(mp);
2034	}
2035	vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
2036	while (vp != NULL && vp->v_type == VMARKER)
2037		vp = TAILQ_NEXT(vp, v_nmntvnodes);
2038
2039	/* Check if we are done */
2040	if (vp == NULL) {
2041		__mnt_vnode_markerfree(mvp, mp);
2042		return (NULL);
2043	}
2044	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
2045	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
2046	return (vp);
2047}
2048
2049struct vnode *
2050__mnt_vnode_first(struct vnode **mvp, struct mount *mp)
2051{
2052	struct vnode *vp;
2053
2054	mtx_assert(MNT_MTX(mp), MA_OWNED);
2055
2056	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
2057	while (vp != NULL && vp->v_type == VMARKER)
2058		vp = TAILQ_NEXT(vp, v_nmntvnodes);
2059
2060	/* Check if we are done */
2061	if (vp == NULL) {
2062		*mvp = NULL;
2063		return (NULL);
2064	}
2065	mp->mnt_holdcnt++;
2066	MNT_IUNLOCK(mp);
2067	*mvp = (struct vnode *) malloc(sizeof(struct vnode),
2068				       M_VNODE_MARKER,
2069				       M_WAITOK | M_ZERO);
2070	MNT_ILOCK(mp);
2071	(*mvp)->v_type = VMARKER;
2072
2073	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
2074	while (vp != NULL && vp->v_type == VMARKER)
2075		vp = TAILQ_NEXT(vp, v_nmntvnodes);
2076
2077	/* Check if we are done */
2078	if (vp == NULL) {
2079		MNT_IUNLOCK(mp);
2080		free(*mvp, M_VNODE_MARKER);
2081		MNT_ILOCK(mp);
2082		*mvp = NULL;
2083		mp->mnt_holdcnt--;
2084		if (mp->mnt_holdcnt == 0 && mp->mnt_holdcntwaiters != 0)
2085			wakeup(&mp->mnt_holdcnt);
2086		return (NULL);
2087	}
2088	(*mvp)->v_mount = mp;
2089	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
2090	return (vp);
2091}
2092
2093
2094void
2095__mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp)
2096{
2097
2098	if (*mvp == NULL)
2099		return;
2100
2101	mtx_assert(MNT_MTX(mp), MA_OWNED);
2102
2103	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
2104	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
2105	MNT_IUNLOCK(mp);
2106	free(*mvp, M_VNODE_MARKER);
2107	MNT_ILOCK(mp);
2108	*mvp = NULL;
2109
2110	mp->mnt_holdcnt--;
2111	if (mp->mnt_holdcnt == 0 && mp->mnt_holdcntwaiters != 0)
2112		wakeup(&mp->mnt_holdcnt);
2113}
2114
2115
2116int
2117__vfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td)
2118{
2119	int error;
2120
2121	error = mp->mnt_op->vfs_statfs(mp, &mp->mnt_stat, td);
2122	if (sbp != &mp->mnt_stat)
2123		*sbp = mp->mnt_stat;
2124	return (error);
2125}
2126
2127void
2128vfs_mountedfrom(struct mount *mp, const char *from)
2129{
2130
2131	bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname);
2132	strlcpy(mp->mnt_stat.f_mntfromname, from,
2133	    sizeof mp->mnt_stat.f_mntfromname);
2134}
2135
2136/*
2137 * ---------------------------------------------------------------------
2138 * This is the api for building mount args and mounting filesystems from
2139 * inside the kernel.
2140 *
2141 * The API works by accumulation of individual args.  First error is
2142 * latched.
2143 *
2144 * XXX: should be documented in new manpage kernel_mount(9)
2145 */
2146
/*
 * mount_argf() and mount_argsu() allocate one of these headers with the
 * argument's value bytes placed directly after it (addressed as maa + 1).
 */
2147/* A memory allocation which must be freed when we are done */
2148struct mntaarg {
	/* Linkage on the owning mntarg's list. */
2149	SLIST_ENTRY(mntaarg)	next;
2150};
2151
2152/* The header for the mount arguments */
2153struct mntarg {
	/* iovec array; each argument adds a name entry followed by a value entry. */
2154	struct iovec *v;
	/* Number of iovec entries currently in use in v. */
2155	int len;
	/* Latched error; once non-zero, later mount_arg*() calls return early. */
2156	int error;
	/* The mntaarg allocations made on behalf of this argument set. */
2157	SLIST_HEAD(, mntaarg)	list;
2158};
2159
2160/*
2161 * Add a boolean argument.
2162 *
2163 * flag is the boolean value.
2164 * name must start with "no".
2165 */
2166struct mntarg *
2167mount_argb(struct mntarg *ma, int flag, const char *name)
2168{
2169
2170	KASSERT(name[0] == 'n' && name[1] == 'o',
2171	    ("mount_argb(...,%s): name must start with 'no'", name));
2172
2173	return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0));
2174}
2175
2176/*
2177 * Add an argument printf style
2178 */
2179struct mntarg *
2180mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...)
2181{
2182	va_list ap;
2183	struct mntaarg *maa;
2184	struct sbuf *sb;
2185	int len;
2186
2187	if (ma == NULL) {
2188		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
2189		SLIST_INIT(&ma->list);
2190	}
2191	if (ma->error)
2192		return (ma);
2193
2194	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
2195	    M_MOUNT, M_WAITOK);
2196	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
2197	ma->v[ma->len].iov_len = strlen(name) + 1;
2198	ma->len++;
2199
2200	sb = sbuf_new_auto();
2201	va_start(ap, fmt);
2202	sbuf_vprintf(sb, fmt, ap);
2203	va_end(ap);
2204	sbuf_finish(sb);
2205	len = sbuf_len(sb) + 1;
2206	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
2207	SLIST_INSERT_HEAD(&ma->list, maa, next);
2208	bcopy(sbuf_data(sb), maa + 1, len);
2209	sbuf_delete(sb);
2210
2211	ma->v[ma->len].iov_base = maa + 1;
2212	ma->v[ma->len].iov_len = len;
2213	ma->len++;
2214
2215	return (ma);
2216}
2217
2218/*
2219 * Add an argument which is a userland string.
2220 */
2221struct mntarg *
2222mount_argsu(struct mntarg *ma, const char *name, const void *val, int len)
2223{
2224	struct mntaarg *maa;
2225	char *tbuf;
2226
2227	if (val == NULL)
2228		return (ma);
2229	if (ma == NULL) {
2230		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
2231		SLIST_INIT(&ma->list);
2232	}
2233	if (ma->error)
2234		return (ma);
2235	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
2236	SLIST_INSERT_HEAD(&ma->list, maa, next);
2237	tbuf = (void *)(maa + 1);
2238	ma->error = copyinstr(val, tbuf, len, NULL);
2239	return (mount_arg(ma, name, tbuf, -1));
2240}
2241
2242/*
2243 * Plain argument.
2244 *
2245 * If length is -1, treat value as a C string.
2246 */
2247struct mntarg *
2248mount_arg(struct mntarg *ma, const char *name, const void *val, int len)
2249{
2250
2251	if (ma == NULL) {
2252		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
2253		SLIST_INIT(&ma->list);
2254	}
2255	if (ma->error)
2256		return (ma);
2257
2258	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
2259	    M_MOUNT, M_WAITOK);
2260	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
2261	ma->v[ma->len].iov_len = strlen(name) + 1;
2262	ma->len++;
2263
2264	ma->v[ma->len].iov_base = (void *)(uintptr_t)val;
2265	if (len < 0)
2266		ma->v[ma->len].iov_len = strlen(val) + 1;
2267	else
2268		ma->v[ma->len].iov_len = len;
2269	ma->len++;
2270	return (ma);
2271}
2272
2273/*
2274 * Free a mntarg structure
2275 */
2276static void
2277free_mntarg(struct mntarg *ma)
2278{
2279	struct mntaarg *maa;
2280
2281	while (!SLIST_EMPTY(&ma->list)) {
2282		maa = SLIST_FIRST(&ma->list);
2283		SLIST_REMOVE_HEAD(&ma->list, next);
2284		free(maa, M_MOUNT);
2285	}
2286	free(ma->v, M_MOUNT);
2287	free(ma, M_MOUNT);
2288}
2289
2290/*
2291 * Mount a filesystem
2292 */
2293int
2294kernel_mount(struct mntarg *ma, int flags)
2295{
2296	struct uio auio;
2297	int error;
2298
2299	KASSERT(ma != NULL, ("kernel_mount NULL ma"));
2300	KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v"));
2301	KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len));
2302
2303	auio.uio_iov = ma->v;
2304	auio.uio_iovcnt = ma->len;
2305	auio.uio_segflg = UIO_SYSSPACE;
2306
2307	error = ma->error;
2308	if (!error)
2309		error = vfs_donmount(curthread, flags, &auio);
2310	free_mntarg(ma);
2311	return (error);
2312}
2313
/*
 * Mount a filesystem from a variadic list of options.
 *
 * After flags, the arguments are (const char *name, const void *value)
 * pairs terminated by a NULL name.  A non-NULL value is treated as a C
 * string; a NULL value is added with length 0.  If no pair is given,
 * a NULL argument list is passed on to kernel_mount().
 */
int
kernel_vmount(int flags, ...)
{
	struct mntarg *args;
	const char *key;
	const void *value;
	va_list ap;

	args = NULL;
	va_start(ap, flags);
	while ((key = va_arg(ap, const char *)) != NULL) {
		value = va_arg(ap, const void *);
		if (value != NULL)
			args = mount_arg(args, key, value, -1);
		else
			args = mount_arg(args, key, value, 0);
	}
	va_end(ap);

	return (kernel_mount(args, flags));
}
2339