vfs_mount.c revision 217792
1/*-
2 * Copyright (c) 1999-2004 Poul-Henning Kamp
3 * Copyright (c) 1999 Michael Smith
4 * Copyright (c) 1989, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 * (c) UNIX System Laboratories, Inc.
7 * All or some portions of this file are derived from material licensed
8 * to the University of California by American Telephone and Telegraph
9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 * the permission of UNIX System Laboratories, Inc.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 *    notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 *    notice, this list of conditions and the following disclaimer in the
19 *    documentation and/or other materials provided with the distribution.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: head/sys/kern/vfs_mount.c 217792 2011-01-24 17:08:26Z jh $");
39
40#include <sys/param.h>
41#include <sys/conf.h>
42#include <sys/fcntl.h>
43#include <sys/jail.h>
44#include <sys/kernel.h>
45#include <sys/libkern.h>
46#include <sys/malloc.h>
47#include <sys/mount.h>
48#include <sys/mutex.h>
49#include <sys/namei.h>
50#include <sys/priv.h>
51#include <sys/proc.h>
52#include <sys/filedesc.h>
53#include <sys/reboot.h>
54#include <sys/syscallsubr.h>
55#include <sys/sysproto.h>
56#include <sys/sx.h>
57#include <sys/sysctl.h>
58#include <sys/sysent.h>
59#include <sys/systm.h>
60#include <sys/vnode.h>
61#include <vm/uma.h>
62
63#include <geom/geom.h>
64
65#include <machine/stdarg.h>
66
67#include <security/audit/audit.h>
68#include <security/mac/mac_framework.h>
69
/*
 * Upper bound used by vfs_buildopts(): applied both to the running total
 * of memory accounted for one option list and to each individual option
 * name/value length.
 */
#define	VFS_MOUNTARG_SIZE_MAX	(1024 * 64)

/* Forward declarations for file-local helpers. */
static int	vfs_domount(struct thread *td, const char *fstype,
		    char *fspath, int fsflags, void *fsdata);
static void	free_mntarg(struct mntarg *ma);

/*
 * vfs.usermount sysctl.  When zero, vfs_domount() and unmount() demand
 * PRIV_VFS_MOUNT / PRIV_VFS_UNMOUNT from the caller.
 */
static int	usermount = 0;
SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
    "Unprivileged users may mount and unmount file systems");

/* malloc(9) types; M_MOUNT backs the option lists built in this file. */
MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
/* UMA zone for struct mount, created in vfs_mount_init(). */
static uma_zone_t mount_zone;

/* List of mounted filesystems. */
struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);

/* For any iteration/modification of mountlist */
struct mtx mountlist_mtx;
MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF);
90
/*
 * Global opts, taken by all filesystems.
 * The table is NULL-terminated.
 */
static const char *global_opts[] = {
	"errmsg",
	"fstype",
	"fspath",
	"ro",
	"rw",
	"nosuid",
	"noexec",
	NULL
};
104
105static int
106mount_init(void *mem, int size, int flags)
107{
108	struct mount *mp;
109
110	mp = (struct mount *)mem;
111	mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
112	lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
113	return (0);
114}
115
116static void
117mount_fini(void *mem, int size)
118{
119	struct mount *mp;
120
121	mp = (struct mount *)mem;
122	lockdestroy(&mp->mnt_explock);
123	mtx_destroy(&mp->mnt_mtx);
124}
125
126static void
127vfs_mount_init(void *dummy __unused)
128{
129
130	mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL,
131	    NULL, mount_init, mount_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
132}
133SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL);
134
135/*
136 * ---------------------------------------------------------------------
137 * Functions for building and sanitizing the mount options
138 */
139
140/* Remove one mount option. */
141static void
142vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt)
143{
144
145	TAILQ_REMOVE(opts, opt, link);
146	free(opt->name, M_MOUNT);
147	if (opt->value != NULL)
148		free(opt->value, M_MOUNT);
149	free(opt, M_MOUNT);
150}
151
152/* Release all resources related to the mount options. */
153void
154vfs_freeopts(struct vfsoptlist *opts)
155{
156	struct vfsopt *opt;
157
158	while (!TAILQ_EMPTY(opts)) {
159		opt = TAILQ_FIRST(opts);
160		vfs_freeopt(opts, opt);
161	}
162	free(opts, M_MOUNT);
163}
164
165void
166vfs_deleteopt(struct vfsoptlist *opts, const char *name)
167{
168	struct vfsopt *opt, *temp;
169
170	if (opts == NULL)
171		return;
172	TAILQ_FOREACH_SAFE(opt, opts, link, temp)  {
173		if (strcmp(opt->name, name) == 0)
174			vfs_freeopt(opts, opt);
175	}
176}
177
/*
 * Return 1 when one of the two names is exactly the other one with a
 * leading "no", 0 otherwise.
 */
static int
vfs_opt_negates(const char *a, const char *b)
{

	/* "noopt" vs. "opt" */
	if (strncmp(a, "no", 2) == 0 && strcmp(a + 2, b) == 0)
		return (1);
	/* "opt" vs. "noopt" */
	if (strncmp(b, "no", 2) == 0 && strcmp(a, b + 2) == 0)
		return (1);
	return (0);
}

/*
 * Check if options are equal (with or without the "no" prefix).
 *
 * Two names match when they are identical, or when they differ only by
 * a "no" prefix.  The prefix may also sit right after a shared
 * dot-terminated prefix, e.g. "foo.noopt" matches "foo.opt".
 * Returns 1 on a match and 0 otherwise.
 */
static int
vfs_equalopts(const char *opt1, const char *opt2)
{
	const char *dot;
	size_t seglen;

	/* "opt" vs. "opt" or "noopt" vs. "noopt" */
	if (strcmp(opt1, opt2) == 0)
		return (1);
	if (vfs_opt_negates(opt1, opt2))
		return (1);

	/*
	 * Strip identical "component." prefixes one at a time and retry
	 * the "no" comparison on what remains.
	 */
	for (;;) {
		dot = strchr(opt1, '.');
		if (dot == NULL)
			break;
		seglen = (size_t)(dot - opt1) + 1;	/* include the '.' */
		if (strncmp(opt1, opt2, seglen) != 0)
			break;
		opt1 += seglen;
		opt2 += seglen;
		/* "foo.noopt" vs. "foo.opt" and the reverse */
		if (vfs_opt_negates(opt1, opt2))
			return (1);
	}
	return (0);
}
208
209/*
210 * If a mount option is specified several times,
211 * (with or without the "no" prefix) only keep
212 * the last occurence of it.
213 */
214static void
215vfs_sanitizeopts(struct vfsoptlist *opts)
216{
217	struct vfsopt *opt, *opt2, *tmp;
218
219	TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) {
220		opt2 = TAILQ_PREV(opt, vfsoptlist, link);
221		while (opt2 != NULL) {
222			if (vfs_equalopts(opt->name, opt2->name)) {
223				tmp = TAILQ_PREV(opt2, vfsoptlist, link);
224				vfs_freeopt(opts, opt2);
225				opt2 = tmp;
226			} else {
227				opt2 = TAILQ_PREV(opt2, vfsoptlist, link);
228			}
229		}
230	}
231}
232
233/*
234 * Build a linked list of mount options from a struct uio.
235 */
236int
237vfs_buildopts(struct uio *auio, struct vfsoptlist **options)
238{
239	struct vfsoptlist *opts;
240	struct vfsopt *opt;
241	size_t memused, namelen, optlen;
242	unsigned int i, iovcnt;
243	int error;
244
245	opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
246	TAILQ_INIT(opts);
247	memused = 0;
248	iovcnt = auio->uio_iovcnt;
249	for (i = 0; i < iovcnt; i += 2) {
250		namelen = auio->uio_iov[i].iov_len;
251		optlen = auio->uio_iov[i + 1].iov_len;
252		memused += sizeof(struct vfsopt) + optlen + namelen;
253		/*
254		 * Avoid consuming too much memory, and attempts to overflow
255		 * memused.
256		 */
257		if (memused > VFS_MOUNTARG_SIZE_MAX ||
258		    optlen > VFS_MOUNTARG_SIZE_MAX ||
259		    namelen > VFS_MOUNTARG_SIZE_MAX) {
260			error = EINVAL;
261			goto bad;
262		}
263
264		opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
265		opt->name = malloc(namelen, M_MOUNT, M_WAITOK);
266		opt->value = NULL;
267		opt->len = 0;
268		opt->pos = i / 2;
269		opt->seen = 0;
270
271		/*
272		 * Do this early, so jumps to "bad" will free the current
273		 * option.
274		 */
275		TAILQ_INSERT_TAIL(opts, opt, link);
276
277		if (auio->uio_segflg == UIO_SYSSPACE) {
278			bcopy(auio->uio_iov[i].iov_base, opt->name, namelen);
279		} else {
280			error = copyin(auio->uio_iov[i].iov_base, opt->name,
281			    namelen);
282			if (error)
283				goto bad;
284		}
285		/* Ensure names are null-terminated strings. */
286		if (namelen == 0 || opt->name[namelen - 1] != '\0') {
287			error = EINVAL;
288			goto bad;
289		}
290		if (optlen != 0) {
291			opt->len = optlen;
292			opt->value = malloc(optlen, M_MOUNT, M_WAITOK);
293			if (auio->uio_segflg == UIO_SYSSPACE) {
294				bcopy(auio->uio_iov[i + 1].iov_base, opt->value,
295				    optlen);
296			} else {
297				error = copyin(auio->uio_iov[i + 1].iov_base,
298				    opt->value, optlen);
299				if (error)
300					goto bad;
301			}
302		}
303	}
304	vfs_sanitizeopts(opts);
305	*options = opts;
306	return (0);
307bad:
308	vfs_freeopts(opts);
309	return (error);
310}
311
312/*
313 * Merge the old mount options with the new ones passed
314 * in the MNT_UPDATE case.
315 *
316 * XXX This function will keep a "nofoo" option in the
317 *     new options if there is no matching "foo" option
318 *     to be cancelled in the old options.  This is a bug
319 *     if the option's canonical name is "foo".  E.g., "noro"
320 *     shouldn't end up in the mount point's active options,
321 *     but it can.
322 */
323static void
324vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *opts)
325{
326	struct vfsopt *opt, *opt2, *new;
327
328	TAILQ_FOREACH(opt, opts, link) {
329		/*
330		 * Check that this option hasn't been redefined
331		 * nor cancelled with a "no" mount option.
332		 */
333		opt2 = TAILQ_FIRST(toopts);
334		while (opt2 != NULL) {
335			if (strcmp(opt2->name, opt->name) == 0)
336				goto next;
337			if (strncmp(opt2->name, "no", 2) == 0 &&
338			    strcmp(opt2->name + 2, opt->name) == 0) {
339				vfs_freeopt(toopts, opt2);
340				goto next;
341			}
342			opt2 = TAILQ_NEXT(opt2, link);
343		}
344		/* We want this option, duplicate it. */
345		new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
346		new->name = malloc(strlen(opt->name) + 1, M_MOUNT, M_WAITOK);
347		strcpy(new->name, opt->name);
348		if (opt->len != 0) {
349			new->value = malloc(opt->len, M_MOUNT, M_WAITOK);
350			bcopy(opt->value, new->value, opt->len);
351		} else {
352			new->value = NULL;
353		}
354		new->len = opt->len;
355		new->seen = opt->seen;
356		TAILQ_INSERT_TAIL(toopts, new, link);
357next:
358		continue;
359	}
360}
361
362/*
363 * Mount a filesystem.
364 */
365int
366nmount(td, uap)
367	struct thread *td;
368	struct nmount_args /* {
369		struct iovec *iovp;
370		unsigned int iovcnt;
371		int flags;
372	} */ *uap;
373{
374	struct uio *auio;
375	int error;
376	u_int iovcnt;
377
378	AUDIT_ARG_FFLAGS(uap->flags);
379	CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__,
380	    uap->iovp, uap->iovcnt, uap->flags);
381
382	/*
383	 * Filter out MNT_ROOTFS.  We do not want clients of nmount() in
384	 * userspace to set this flag, but we must filter it out if we want
385	 * MNT_UPDATE on the root file system to work.
386	 * MNT_ROOTFS should only be set by the kernel when mounting its
387	 * root file system.
388	 */
389	uap->flags &= ~MNT_ROOTFS;
390
391	iovcnt = uap->iovcnt;
392	/*
393	 * Check that we have an even number of iovec's
394	 * and that we have at least two options.
395	 */
396	if ((iovcnt & 1) || (iovcnt < 4)) {
397		CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__,
398		    uap->iovcnt);
399		return (EINVAL);
400	}
401
402	error = copyinuio(uap->iovp, iovcnt, &auio);
403	if (error) {
404		CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno",
405		    __func__, error);
406		return (error);
407	}
408	error = vfs_donmount(td, uap->flags, auio);
409
410	free(auio, M_IOV);
411	return (error);
412}
413
414/*
415 * ---------------------------------------------------------------------
416 * Various utility functions
417 */
418
/*
 * Take a reference on the mount point: MNT_REF() is applied to 'mp'
 * while holding the mount interlock (MNT_ILOCK/MNT_IUNLOCK).
 */
void
vfs_ref(struct mount *mp)
{

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
	MNT_ILOCK(mp);
	MNT_REF(mp);
	MNT_IUNLOCK(mp);
}
428
/*
 * Drop a reference on the mount point: MNT_REL() is applied to 'mp'
 * while holding the mount interlock (MNT_ILOCK/MNT_IUNLOCK).
 */
void
vfs_rel(struct mount *mp)
{

	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
	MNT_ILOCK(mp);
	MNT_REL(mp);
	MNT_IUNLOCK(mp);
}
438
439/*
440 * Allocate and initialize the mount point struct.
441 */
442struct mount *
443vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath,
444    struct ucred *cred)
445{
446	struct mount *mp;
447
448	mp = uma_zalloc(mount_zone, M_WAITOK);
449	bzero(&mp->mnt_startzero,
450	    __rangeof(struct mount, mnt_startzero, mnt_endzero));
451	TAILQ_INIT(&mp->mnt_nvnodelist);
452	mp->mnt_nvnodelistsize = 0;
453	mp->mnt_ref = 0;
454	(void) vfs_busy(mp, MBF_NOWAIT);
455	mp->mnt_op = vfsp->vfc_vfsops;
456	mp->mnt_vfc = vfsp;
457	vfsp->vfc_refcount++;	/* XXX Unlocked */
458	mp->mnt_stat.f_type = vfsp->vfc_typenum;
459	mp->mnt_gen++;
460	strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
461	mp->mnt_vnodecovered = vp;
462	mp->mnt_cred = crdup(cred);
463	mp->mnt_stat.f_owner = cred->cr_uid;
464	strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN);
465	mp->mnt_iosize_max = DFLTPHYS;
466#ifdef MAC
467	mac_mount_init(mp);
468	mac_mount_create(cred, mp);
469#endif
470	arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
471	return (mp);
472}
473
474/*
475 * Destroy the mount struct previously allocated by vfs_mount_alloc().
476 */
477void
478vfs_mount_destroy(struct mount *mp)
479{
480
481	MNT_ILOCK(mp);
482	mp->mnt_kern_flag |= MNTK_REFEXPIRE;
483	if (mp->mnt_kern_flag & MNTK_MWAIT) {
484		mp->mnt_kern_flag &= ~MNTK_MWAIT;
485		wakeup(mp);
486	}
487	while (mp->mnt_ref)
488		msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0);
489	KASSERT(mp->mnt_ref == 0,
490	    ("%s: invalid refcount in the drain path @ %s:%d", __func__,
491	    __FILE__, __LINE__));
492	if (mp->mnt_writeopcount != 0)
493		panic("vfs_mount_destroy: nonzero writeopcount");
494	if (mp->mnt_secondary_writes != 0)
495		panic("vfs_mount_destroy: nonzero secondary_writes");
496	mp->mnt_vfc->vfc_refcount--;
497	if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) {
498		struct vnode *vp;
499
500		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes)
501			vprint("", vp);
502		panic("unmount: dangling vnode");
503	}
504	if (mp->mnt_nvnodelistsize != 0)
505		panic("vfs_mount_destroy: nonzero nvnodelistsize");
506	if (mp->mnt_lockref != 0)
507		panic("vfs_mount_destroy: nonzero lock refcount");
508	MNT_IUNLOCK(mp);
509#ifdef MAC
510	mac_mount_destroy(mp);
511#endif
512	if (mp->mnt_opt != NULL)
513		vfs_freeopts(mp->mnt_opt);
514	crfree(mp->mnt_cred);
515	uma_zfree(mount_zone, mp);
516}
517
518int
519vfs_donmount(struct thread *td, int fsflags, struct uio *fsoptions)
520{
521	struct vfsoptlist *optlist;
522	struct vfsopt *opt, *noro_opt, *tmp_opt;
523	char *fstype, *fspath, *errmsg;
524	int error, fstypelen, fspathlen, errmsg_len, errmsg_pos;
525	int has_rw, has_noro;
526
527	errmsg = fspath = NULL;
528	errmsg_len = has_noro = has_rw = fspathlen = 0;
529	errmsg_pos = -1;
530
531	error = vfs_buildopts(fsoptions, &optlist);
532	if (error)
533		return (error);
534
535	if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0)
536		errmsg_pos = vfs_getopt_pos(optlist, "errmsg");
537
538	/*
539	 * We need these two options before the others,
540	 * and they are mandatory for any filesystem.
541	 * Ensure they are NUL terminated as well.
542	 */
543	fstypelen = 0;
544	error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen);
545	if (error || fstype[fstypelen - 1] != '\0') {
546		error = EINVAL;
547		if (errmsg != NULL)
548			strncpy(errmsg, "Invalid fstype", errmsg_len);
549		goto bail;
550	}
551	fspathlen = 0;
552	error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen);
553	if (error || fspath[fspathlen - 1] != '\0') {
554		error = EINVAL;
555		if (errmsg != NULL)
556			strncpy(errmsg, "Invalid fspath", errmsg_len);
557		goto bail;
558	}
559
560	/*
561	 * We need to see if we have the "update" option
562	 * before we call vfs_domount(), since vfs_domount() has special
563	 * logic based on MNT_UPDATE.  This is very important
564	 * when we want to update the root filesystem.
565	 */
566	TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) {
567		if (strcmp(opt->name, "update") == 0) {
568			fsflags |= MNT_UPDATE;
569			vfs_freeopt(optlist, opt);
570		}
571		else if (strcmp(opt->name, "async") == 0)
572			fsflags |= MNT_ASYNC;
573		else if (strcmp(opt->name, "force") == 0) {
574			fsflags |= MNT_FORCE;
575			vfs_freeopt(optlist, opt);
576		}
577		else if (strcmp(opt->name, "reload") == 0) {
578			fsflags |= MNT_RELOAD;
579			vfs_freeopt(optlist, opt);
580		}
581		else if (strcmp(opt->name, "multilabel") == 0)
582			fsflags |= MNT_MULTILABEL;
583		else if (strcmp(opt->name, "noasync") == 0)
584			fsflags &= ~MNT_ASYNC;
585		else if (strcmp(opt->name, "noatime") == 0)
586			fsflags |= MNT_NOATIME;
587		else if (strcmp(opt->name, "atime") == 0) {
588			free(opt->name, M_MOUNT);
589			opt->name = strdup("nonoatime", M_MOUNT);
590		}
591		else if (strcmp(opt->name, "noclusterr") == 0)
592			fsflags |= MNT_NOCLUSTERR;
593		else if (strcmp(opt->name, "clusterr") == 0) {
594			free(opt->name, M_MOUNT);
595			opt->name = strdup("nonoclusterr", M_MOUNT);
596		}
597		else if (strcmp(opt->name, "noclusterw") == 0)
598			fsflags |= MNT_NOCLUSTERW;
599		else if (strcmp(opt->name, "clusterw") == 0) {
600			free(opt->name, M_MOUNT);
601			opt->name = strdup("nonoclusterw", M_MOUNT);
602		}
603		else if (strcmp(opt->name, "noexec") == 0)
604			fsflags |= MNT_NOEXEC;
605		else if (strcmp(opt->name, "exec") == 0) {
606			free(opt->name, M_MOUNT);
607			opt->name = strdup("nonoexec", M_MOUNT);
608		}
609		else if (strcmp(opt->name, "nosuid") == 0)
610			fsflags |= MNT_NOSUID;
611		else if (strcmp(opt->name, "suid") == 0) {
612			free(opt->name, M_MOUNT);
613			opt->name = strdup("nonosuid", M_MOUNT);
614		}
615		else if (strcmp(opt->name, "nosymfollow") == 0)
616			fsflags |= MNT_NOSYMFOLLOW;
617		else if (strcmp(opt->name, "symfollow") == 0) {
618			free(opt->name, M_MOUNT);
619			opt->name = strdup("nonosymfollow", M_MOUNT);
620		}
621		else if (strcmp(opt->name, "noro") == 0) {
622			fsflags &= ~MNT_RDONLY;
623			has_noro = 1;
624		}
625		else if (strcmp(opt->name, "rw") == 0) {
626			fsflags &= ~MNT_RDONLY;
627			has_rw = 1;
628		}
629		else if (strcmp(opt->name, "ro") == 0)
630			fsflags |= MNT_RDONLY;
631		else if (strcmp(opt->name, "rdonly") == 0) {
632			free(opt->name, M_MOUNT);
633			opt->name = strdup("ro", M_MOUNT);
634			fsflags |= MNT_RDONLY;
635		}
636		else if (strcmp(opt->name, "suiddir") == 0)
637			fsflags |= MNT_SUIDDIR;
638		else if (strcmp(opt->name, "sync") == 0)
639			fsflags |= MNT_SYNCHRONOUS;
640		else if (strcmp(opt->name, "union") == 0)
641			fsflags |= MNT_UNION;
642	}
643
644	/*
645	 * If "rw" was specified as a mount option, and we
646	 * are trying to update a mount-point from "ro" to "rw",
647	 * we need a mount option "noro", since in vfs_mergeopts(),
648	 * "noro" will cancel "ro", but "rw" will not do anything.
649	 */
650	if (has_rw && !has_noro) {
651		noro_opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
652		noro_opt->name = strdup("noro", M_MOUNT);
653		noro_opt->value = NULL;
654		noro_opt->len = 0;
655		noro_opt->pos = -1;
656		noro_opt->seen = 1;
657		TAILQ_INSERT_TAIL(optlist, noro_opt, link);
658	}
659
660	/*
661	 * Be ultra-paranoid about making sure the type and fspath
662	 * variables will fit in our mp buffers, including the
663	 * terminating NUL.
664	 */
665	if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) {
666		error = ENAMETOOLONG;
667		goto bail;
668	}
669
670	error = vfs_domount(td, fstype, fspath, fsflags, optlist);
671bail:
672	/* copyout the errmsg */
673	if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt)
674	    && errmsg_len > 0 && errmsg != NULL) {
675		if (fsoptions->uio_segflg == UIO_SYSSPACE) {
676			bcopy(errmsg,
677			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
678			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
679		} else {
680			copyout(errmsg,
681			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
682			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
683		}
684	}
685
686	if (error != 0)
687		vfs_freeopts(optlist);
688	return (error);
689}
690
691/*
692 * Old mount API.
693 */
694#ifndef _SYS_SYSPROTO_H_
695struct mount_args {
696	char	*type;
697	char	*path;
698	int	flags;
699	caddr_t	data;
700};
701#endif
702/* ARGSUSED */
703int
704mount(td, uap)
705	struct thread *td;
706	struct mount_args /* {
707		char *type;
708		char *path;
709		int flags;
710		caddr_t data;
711	} */ *uap;
712{
713	char *fstype;
714	struct vfsconf *vfsp = NULL;
715	struct mntarg *ma = NULL;
716	int error;
717
718	AUDIT_ARG_FFLAGS(uap->flags);
719
720	/*
721	 * Filter out MNT_ROOTFS.  We do not want clients of mount() in
722	 * userspace to set this flag, but we must filter it out if we want
723	 * MNT_UPDATE on the root file system to work.
724	 * MNT_ROOTFS should only be set by the kernel when mounting its
725	 * root file system.
726	 */
727	uap->flags &= ~MNT_ROOTFS;
728
729	fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK);
730	error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL);
731	if (error) {
732		free(fstype, M_TEMP);
733		return (error);
734	}
735
736	AUDIT_ARG_TEXT(fstype);
737	mtx_lock(&Giant);
738	vfsp = vfs_byname_kld(fstype, td, &error);
739	free(fstype, M_TEMP);
740	if (vfsp == NULL) {
741		mtx_unlock(&Giant);
742		return (ENOENT);
743	}
744	if (vfsp->vfc_vfsops->vfs_cmount == NULL) {
745		mtx_unlock(&Giant);
746		return (EOPNOTSUPP);
747	}
748
749	ma = mount_argsu(ma, "fstype", uap->type, MNAMELEN);
750	ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN);
751	ma = mount_argb(ma, uap->flags & MNT_RDONLY, "noro");
752	ma = mount_argb(ma, !(uap->flags & MNT_NOSUID), "nosuid");
753	ma = mount_argb(ma, !(uap->flags & MNT_NOEXEC), "noexec");
754
755	error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, uap->flags);
756	mtx_unlock(&Giant);
757	return (error);
758}
759
760/*
761 * vfs_domount_first(): first file system mount (not update)
762 */
763static int
764vfs_domount_first(
765	struct thread *td,	/* Calling thread. */
766	struct vfsconf *vfsp,	/* File system type. */
767	char *fspath,		/* Mount path. */
768	struct vnode *vp,	/* Vnode to be covered. */
769	int fsflags,		/* Flags common to all filesystems. */
770	void *fsdata		/* Options local to the filesystem. */
771	)
772{
773	struct vattr va;
774	struct mount *mp;
775	struct vnode *newdp;
776	int error;
777
778	mtx_assert(&Giant, MA_OWNED);
779	ASSERT_VOP_ELOCKED(vp, __func__);
780	KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here"));
781
782	/*
783	 * If the user is not root, ensure that they own the directory
784	 * onto which we are attempting to mount.
785	 */
786	error = VOP_GETATTR(vp, &va, td->td_ucred);
787	if (error == 0 && va.va_uid != td->td_ucred->cr_uid)
788		error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN, 0);
789	if (error == 0)
790		error = vinvalbuf(vp, V_SAVE, 0, 0);
791	if (error == 0 && vp->v_type != VDIR)
792		error = ENOTDIR;
793	if (error == 0) {
794		VI_LOCK(vp);
795		if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL)
796			vp->v_iflag |= VI_MOUNT;
797		else
798			error = EBUSY;
799		VI_UNLOCK(vp);
800	}
801	if (error != 0) {
802		vput(vp);
803		return (error);
804	}
805	VOP_UNLOCK(vp, 0);
806
807	/* Allocate and initialize the filesystem. */
808	mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred);
809	/* XXXMAC: pass to vfs_mount_alloc? */
810	mp->mnt_optnew = fsdata;
811	/* Set the mount level flags. */
812	mp->mnt_flag = (fsflags & (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY));
813
814	/*
815	 * Mount the filesystem.
816	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
817	 * get.  No freeing of cn_pnbuf.
818	 */
819	error = VFS_MOUNT(mp);
820	if (error != 0) {
821		vfs_unbusy(mp);
822		vfs_mount_destroy(mp);
823		VI_LOCK(vp);
824		vp->v_iflag &= ~VI_MOUNT;
825		VI_UNLOCK(vp);
826		vrele(vp);
827		return (error);
828	}
829
830	if (mp->mnt_opt != NULL)
831		vfs_freeopts(mp->mnt_opt);
832	mp->mnt_opt = mp->mnt_optnew;
833	(void)VFS_STATFS(mp, &mp->mnt_stat);
834
835	/*
836	 * Prevent external consumers of mount options from reading mnt_optnew.
837	 */
838	mp->mnt_optnew = NULL;
839
840	MNT_ILOCK(mp);
841	if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
842		mp->mnt_kern_flag |= MNTK_ASYNC;
843	else
844		mp->mnt_kern_flag &= ~MNTK_ASYNC;
845	MNT_IUNLOCK(mp);
846
847	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
848	cache_purge(vp);
849	VI_LOCK(vp);
850	vp->v_iflag &= ~VI_MOUNT;
851	VI_UNLOCK(vp);
852	vp->v_mountedhere = mp;
853	/* Place the new filesystem at the end of the mount list. */
854	mtx_lock(&mountlist_mtx);
855	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
856	mtx_unlock(&mountlist_mtx);
857	vfs_event_signal(NULL, VQ_MOUNT, 0);
858	if (VFS_ROOT(mp, LK_EXCLUSIVE, &newdp))
859		panic("mount: lost mount");
860	VOP_UNLOCK(newdp, 0);
861	VOP_UNLOCK(vp, 0);
862	mountcheckdirs(vp, newdp);
863	vrele(newdp);
864	if ((mp->mnt_flag & MNT_RDONLY) == 0)
865		vfs_allocate_syncvnode(mp);
866	vfs_unbusy(mp);
867	return (0);
868}
869
870/*
871 * vfs_domount_update(): update of mounted file system
872 */
873static int
874vfs_domount_update(
875	struct thread *td,	/* Calling thread. */
876	struct vnode *vp,	/* Mount point vnode. */
877	int fsflags,		/* Flags common to all filesystems. */
878	void *fsdata		/* Options local to the filesystem. */
879	)
880{
881	struct oexport_args oexport;
882	struct export_args export;
883	struct mount *mp;
884	int error, flag;
885
886	mtx_assert(&Giant, MA_OWNED);
887	ASSERT_VOP_ELOCKED(vp, __func__);
888	KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here"));
889
890	if ((vp->v_vflag & VV_ROOT) == 0) {
891		vput(vp);
892		return (EINVAL);
893	}
894	mp = vp->v_mount;
895	/*
896	 * We only allow the filesystem to be reloaded if it
897	 * is currently mounted read-only.
898	 */
899	flag = mp->mnt_flag;
900	if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) {
901		vput(vp);
902		return (EOPNOTSUPP);	/* Needs translation */
903	}
904	/*
905	 * Only privileged root, or (if MNT_USER is set) the user that
906	 * did the original mount is permitted to update it.
907	 */
908	error = vfs_suser(mp, td);
909	if (error != 0) {
910		vput(vp);
911		return (error);
912	}
913	if (vfs_busy(mp, MBF_NOWAIT)) {
914		vput(vp);
915		return (EBUSY);
916	}
917	VI_LOCK(vp);
918	if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) {
919		VI_UNLOCK(vp);
920		vfs_unbusy(mp);
921		vput(vp);
922		return (EBUSY);
923	}
924	vp->v_iflag |= VI_MOUNT;
925	VI_UNLOCK(vp);
926	VOP_UNLOCK(vp, 0);
927
928	MNT_ILOCK(mp);
929	mp->mnt_flag &= ~MNT_UPDATEMASK;
930	mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE |
931	    MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY);
932	if ((mp->mnt_flag & MNT_ASYNC) == 0)
933		mp->mnt_kern_flag &= ~MNTK_ASYNC;
934	MNT_IUNLOCK(mp);
935	mp->mnt_optnew = fsdata;
936	vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
937
938	/*
939	 * Mount the filesystem.
940	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
941	 * get.  No freeing of cn_pnbuf.
942	 */
943	error = VFS_MOUNT(mp);
944
945	if (error == 0) {
946		/* Process the export option. */
947		if (vfs_copyopt(mp->mnt_optnew, "export", &export,
948		    sizeof(export)) == 0) {
949			error = vfs_export(mp, &export);
950		} else if (vfs_copyopt(mp->mnt_optnew, "export", &oexport,
951		    sizeof(oexport)) == 0) {
952			export.ex_flags = oexport.ex_flags;
953			export.ex_root = oexport.ex_root;
954			export.ex_anon = oexport.ex_anon;
955			export.ex_addr = oexport.ex_addr;
956			export.ex_addrlen = oexport.ex_addrlen;
957			export.ex_mask = oexport.ex_mask;
958			export.ex_masklen = oexport.ex_masklen;
959			export.ex_indexfile = oexport.ex_indexfile;
960			export.ex_numsecflavors = 0;
961			error = vfs_export(mp, &export);
962		}
963	}
964
965	MNT_ILOCK(mp);
966	if (error == 0) {
967		mp->mnt_flag &=	~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE |
968		    MNT_SNAPSHOT);
969	} else {
970		/*
971		 * If we fail, restore old mount flags. MNT_QUOTA is special,
972		 * because it is not part of MNT_UPDATEMASK, but it could have
973		 * changed in the meantime if quotactl(2) was called.
974		 * All in all we want current value of MNT_QUOTA, not the old
975		 * one.
976		 */
977		mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
978	}
979	if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
980		mp->mnt_kern_flag |= MNTK_ASYNC;
981	else
982		mp->mnt_kern_flag &= ~MNTK_ASYNC;
983	MNT_IUNLOCK(mp);
984
985	if (error != 0)
986		goto end;
987
988	if (mp->mnt_opt != NULL)
989		vfs_freeopts(mp->mnt_opt);
990	mp->mnt_opt = mp->mnt_optnew;
991	(void)VFS_STATFS(mp, &mp->mnt_stat);
992	/*
993	 * Prevent external consumers of mount options from reading
994	 * mnt_optnew.
995	 */
996	mp->mnt_optnew = NULL;
997
998	if ((mp->mnt_flag & MNT_RDONLY) == 0)
999		vfs_allocate_syncvnode(mp);
1000	else
1001		vfs_deallocate_syncvnode(mp);
1002end:
1003	vfs_unbusy(mp);
1004	VI_LOCK(vp);
1005	vp->v_iflag &= ~VI_MOUNT;
1006	VI_UNLOCK(vp);
1007	vrele(vp);
1008	return (error);
1009}
1010
1011/*
1012 * vfs_domount(): actually attempt a filesystem mount.
1013 */
1014static int
1015vfs_domount(
1016	struct thread *td,	/* Calling thread. */
1017	const char *fstype,	/* Filesystem type. */
1018	char *fspath,		/* Mount path. */
1019	int fsflags,		/* Flags common to all filesystems. */
1020	void *fsdata		/* Options local to the filesystem. */
1021	)
1022{
1023	struct vfsconf *vfsp;
1024	struct nameidata nd;
1025	struct vnode *vp;
1026	int error;
1027
1028	/*
1029	 * Be ultra-paranoid about making sure the type and fspath
1030	 * variables will fit in our mp buffers, including the
1031	 * terminating NUL.
1032	 */
1033	if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
1034		return (ENAMETOOLONG);
1035
1036	if (jailed(td->td_ucred) || usermount == 0) {
1037		if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0)
1038			return (error);
1039	}
1040
1041	/*
1042	 * Do not allow NFS export or MNT_SUIDDIR by unprivileged users.
1043	 */
1044	if (fsflags & MNT_EXPORTED) {
1045		error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED);
1046		if (error)
1047			return (error);
1048	}
1049	if (fsflags & MNT_SUIDDIR) {
1050		error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR);
1051		if (error)
1052			return (error);
1053	}
1054	/*
1055	 * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users.
1056	 */
1057	if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) {
1058		if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0)
1059			fsflags |= MNT_NOSUID | MNT_USER;
1060	}
1061
1062	/* Load KLDs before we lock the covered vnode to avoid reversals. */
1063	vfsp = NULL;
1064	if ((fsflags & MNT_UPDATE) == 0) {
1065		/* Don't try to load KLDs if we're mounting the root. */
1066		if (fsflags & MNT_ROOTFS)
1067			vfsp = vfs_byname(fstype);
1068		else
1069			vfsp = vfs_byname_kld(fstype, td, &error);
1070		if (vfsp == NULL)
1071			return (ENODEV);
1072		if (jailed(td->td_ucred) && !(vfsp->vfc_flags & VFCF_JAIL))
1073			return (EPERM);
1074	}
1075
1076	/*
1077	 * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE.
1078	 */
1079	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
1080	    UIO_SYSSPACE, fspath, td);
1081	error = namei(&nd);
1082	if (error != 0)
1083		return (error);
1084	if (!NDHASGIANT(&nd))
1085		mtx_lock(&Giant);
1086	NDFREE(&nd, NDF_ONLY_PNBUF);
1087	vp = nd.ni_vp;
1088	if ((fsflags & MNT_UPDATE) == 0) {
1089		error = vfs_domount_first(td, vfsp, fspath, vp, fsflags,
1090		    fsdata);
1091	} else {
1092		error = vfs_domount_update(td, vp, fsflags, fsdata);
1093	}
1094	mtx_unlock(&Giant);
1095
1096	ASSERT_VI_UNLOCKED(vp, __func__);
1097	ASSERT_VOP_UNLOCKED(vp, __func__);
1098
1099	return (error);
1100}
1101
1102/*
1103 * Unmount a filesystem.
1104 *
1105 * Note: unmount takes a path to the vnode mounted on as argument, not
1106 * special file (as before).
1107 */
1108#ifndef _SYS_SYSPROTO_H_
1109struct unmount_args {
1110	char	*path;
1111	int	flags;
1112};
1113#endif
1114/* ARGSUSED */
1115int
1116unmount(td, uap)
1117	struct thread *td;
1118	register struct unmount_args /* {
1119		char *path;
1120		int flags;
1121	} */ *uap;
1122{
1123	struct mount *mp;
1124	char *pathbuf;
1125	int error, id0, id1;
1126
1127	AUDIT_ARG_VALUE(uap->flags);
1128	if (jailed(td->td_ucred) || usermount == 0) {
1129		error = priv_check(td, PRIV_VFS_UNMOUNT);
1130		if (error)
1131			return (error);
1132	}
1133
1134	pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
1135	error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL);
1136	if (error) {
1137		free(pathbuf, M_TEMP);
1138		return (error);
1139	}
1140	mtx_lock(&Giant);
1141	if (uap->flags & MNT_BYFSID) {
1142		AUDIT_ARG_TEXT(pathbuf);
1143		/* Decode the filesystem ID. */
1144		if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) {
1145			mtx_unlock(&Giant);
1146			free(pathbuf, M_TEMP);
1147			return (EINVAL);
1148		}
1149
1150		mtx_lock(&mountlist_mtx);
1151		TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
1152			if (mp->mnt_stat.f_fsid.val[0] == id0 &&
1153			    mp->mnt_stat.f_fsid.val[1] == id1)
1154				break;
1155		}
1156		mtx_unlock(&mountlist_mtx);
1157	} else {
1158		AUDIT_ARG_UPATH1(td, pathbuf);
1159		mtx_lock(&mountlist_mtx);
1160		TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
1161			if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0)
1162				break;
1163		}
1164		mtx_unlock(&mountlist_mtx);
1165	}
1166	free(pathbuf, M_TEMP);
1167	if (mp == NULL) {
1168		/*
1169		 * Previously we returned ENOENT for a nonexistent path and
1170		 * EINVAL for a non-mountpoint.  We cannot tell these apart
1171		 * now, so in the !MNT_BYFSID case return the more likely
1172		 * EINVAL for compatibility.
1173		 */
1174		mtx_unlock(&Giant);
1175		return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL);
1176	}
1177
1178	/*
1179	 * Don't allow unmounting the root filesystem.
1180	 */
1181	if (mp->mnt_flag & MNT_ROOTFS) {
1182		mtx_unlock(&Giant);
1183		return (EINVAL);
1184	}
1185	error = dounmount(mp, uap->flags, td);
1186	mtx_unlock(&Giant);
1187	return (error);
1188}
1189
/*
 * Do the actual filesystem unmount.
 *
 * The caller must hold Giant (asserted below).
 *
 * Returns 0 once the mount has been removed from the mount list and
 * destroyed.  Returns EBUSY if the mount changed while waiting for the
 * covered vnode lock, if another unmount is already in progress, or if
 * the mount has outstanding lock references and MNT_FORCE is not set.
 * Returns the vfs_suser() error if the caller is not permitted to unmount,
 * or the VFS_SYNC()/VFS_UNMOUNT() error (other than ENXIO) after undoing
 * the changes made here.  An ENXIO result does not abort the unmount.
 */
int
dounmount(mp, flags, td)
	struct mount *mp;
	int flags;
	struct thread *td;
{
	struct vnode *coveredvp, *fsrootvp;
	int error;
	int async_flag;
	int mnt_gen_r;

	mtx_assert(&Giant, MA_OWNED);

	/*
	 * Lock the covered vnode, if any.  The mount generation is sampled
	 * first and a hold is kept on the vnode while acquiring its lock.
	 */
	if ((coveredvp = mp->mnt_vnodecovered) != NULL) {
		mnt_gen_r = mp->mnt_gen;
		VI_LOCK(coveredvp);
		vholdl(coveredvp);
		vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY);
		vdrop(coveredvp);
		/*
		 * Check for mp being unmounted while waiting for the
		 * covered vnode lock.
		 */
		if (coveredvp->v_mountedhere != mp ||
		    coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) {
			VOP_UNLOCK(coveredvp, 0);
			return (EBUSY);
		}
	}
	/*
	 * Only privileged root, or (if MNT_USER is set) the user that did the
	 * original mount is permitted to unmount this filesystem.
	 */
	error = vfs_suser(mp, td);
	if (error) {
		if (coveredvp)
			VOP_UNLOCK(coveredvp, 0);
		return (error);
	}

	MNT_ILOCK(mp);
	/* Refuse if an unmount of this mount is already in progress. */
	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
		MNT_IUNLOCK(mp);
		if (coveredvp)
			VOP_UNLOCK(coveredvp, 0);
		return (EBUSY);
	}
	mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_NOINSMNTQ;
	/* Allow filesystems to detect that a forced unmount is in progress. */
	if (flags & MNT_FORCE)
		mp->mnt_kern_flag |= MNTK_UNMOUNTF;
	error = 0;
	if (mp->mnt_lockref) {
		/*
		 * The mount still has lock references.  A non-forced unmount
		 * backs out the flags set above, wakes any MNTK_MWAIT
		 * sleepers and fails with EBUSY.
		 */
		if ((flags & MNT_FORCE) == 0) {
			mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_NOINSMNTQ |
			    MNTK_UNMOUNTF);
			if (mp->mnt_kern_flag & MNTK_MWAIT) {
				mp->mnt_kern_flag &= ~MNTK_MWAIT;
				wakeup(mp);
			}
			MNT_IUNLOCK(mp);
			if (coveredvp)
				VOP_UNLOCK(coveredvp, 0);
			return (EBUSY);
		}
		/* Forced unmount: sleep until woken on &mp->mnt_lockref. */
		mp->mnt_kern_flag |= MNTK_DRAINING;
		error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS,
		    "mount drain", 0);
	}
	MNT_IUNLOCK(mp);
	KASSERT(mp->mnt_lockref == 0,
	    ("%s: invalid lock refcount in the drain path @ %s:%d",
	    __func__, __FILE__, __LINE__));
	KASSERT(error == 0,
	    ("%s: invalid return value for msleep in the drain path @ %s:%d",
	    __func__, __FILE__, __LINE__));
	vn_start_write(NULL, &mp, V_WAIT);

	if (mp->mnt_flag & MNT_EXPUBLIC)
		vfs_setpublicfs(NULL, NULL, NULL);

	vfs_msync(mp, MNT_WAIT);
	/*
	 * Remember whether the mount was async so it can be restored if the
	 * unmount fails, then turn async operation off.
	 */
	MNT_ILOCK(mp);
	async_flag = mp->mnt_flag & MNT_ASYNC;
	mp->mnt_flag &= ~MNT_ASYNC;
	mp->mnt_kern_flag &= ~MNTK_ASYNC;
	MNT_IUNLOCK(mp);
	cache_purgevfs(mp);	/* remove cache entries for this file sys */
	vfs_deallocate_syncvnode(mp);
	/*
	 * For forced unmounts, move process cdir/rdir refs on the fs root
	 * vnode to the covered vnode.  For non-forced unmounts we want
	 * such references to cause an EBUSY error.
	 */
	if ((flags & MNT_FORCE) &&
	    VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) {
		if (mp->mnt_vnodecovered != NULL)
			mountcheckdirs(fsrootvp, mp->mnt_vnodecovered);
		if (fsrootvp == rootvnode) {
			vrele(rootvnode);
			rootvnode = NULL;
		}
		vput(fsrootvp);
	}
	/*
	 * Call VFS_UNMOUNT() if the filesystem is read-only, if VFS_SYNC()
	 * succeeded, or if the unmount is forced.  Otherwise 'error' keeps
	 * the VFS_SYNC() failure.
	 */
	if (((mp->mnt_flag & MNT_RDONLY) ||
	     (error = VFS_SYNC(mp, MNT_WAIT)) == 0) || (flags & MNT_FORCE) != 0)
		error = VFS_UNMOUNT(mp, flags);
	vn_finished_write(mp);
	/*
	 * If we failed to flush the dirty blocks for this mount point,
	 * undo all the cdir/rdir and rootvnode changes we made above.
	 * Unless we failed to do so because the device is reporting that
	 * it doesn't exist anymore.
	 */
	if (error && error != ENXIO) {
		if ((flags & MNT_FORCE) &&
		    VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) {
			if (mp->mnt_vnodecovered != NULL)
				mountcheckdirs(mp->mnt_vnodecovered, fsrootvp);
			if (rootvnode == NULL) {
				rootvnode = fsrootvp;
				vref(rootvnode);
			}
			vput(fsrootvp);
		}
		MNT_ILOCK(mp);
		mp->mnt_kern_flag &= ~MNTK_NOINSMNTQ;
		/*
		 * The interlock is dropped around vfs_allocate_syncvnode()
		 * and retaken afterwards.
		 */
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			MNT_IUNLOCK(mp);
			vfs_allocate_syncvnode(mp);
			MNT_ILOCK(mp);
		}
		/* Restore the flags changed above and wake any waiters. */
		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
		mp->mnt_flag |= async_flag;
		if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
			mp->mnt_kern_flag |= MNTK_ASYNC;
		if (mp->mnt_kern_flag & MNTK_MWAIT) {
			mp->mnt_kern_flag &= ~MNTK_MWAIT;
			wakeup(mp);
		}
		MNT_IUNLOCK(mp);
		if (coveredvp)
			VOP_UNLOCK(coveredvp, 0);
		return (error);
	}
	/*
	 * Success (or ENXIO): take the mount off the global list, detach it
	 * from the covered vnode, announce the unmount and destroy the mount.
	 */
	mtx_lock(&mountlist_mtx);
	TAILQ_REMOVE(&mountlist, mp, mnt_list);
	mtx_unlock(&mountlist_mtx);
	if (coveredvp != NULL) {
		coveredvp->v_mountedhere = NULL;
		vput(coveredvp);
	}
	vfs_event_signal(NULL, VQ_UNMOUNT, 0);
	vfs_mount_destroy(mp);
	return (0);
}
1349
1350/*
1351 * Report errors during filesystem mounting.
1352 */
1353void
1354vfs_mount_error(struct mount *mp, const char *fmt, ...)
1355{
1356	struct vfsoptlist *moptlist = mp->mnt_optnew;
1357	va_list ap;
1358	int error, len;
1359	char *errmsg;
1360
1361	error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len);
1362	if (error || errmsg == NULL || len <= 0)
1363		return;
1364
1365	va_start(ap, fmt);
1366	vsnprintf(errmsg, (size_t)len, fmt, ap);
1367	va_end(ap);
1368}
1369
/*
 * Format a printf-style message into the buffer of the "errmsg" option
 * of 'opts'.  Nothing is done if that option is absent or has no usable
 * buffer.
 */
void
vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...)
{
	va_list args;
	char *msgbuf;
	int buflen;

	if (vfs_getopt(opts, "errmsg", (void **)&msgbuf, &buflen) != 0)
		return;
	if (msgbuf == NULL || buflen <= 0)
		return;

	va_start(args, fmt);
	vsnprintf(msgbuf, (size_t)buflen, fmt, args);
	va_end(args);
}
1385
1386/*
1387 * ---------------------------------------------------------------------
1388 * Functions for querying mount options/arguments from filesystems.
1389 */
1390
1391/*
1392 * Check that no unknown options are given
1393 */
1394int
1395vfs_filteropt(struct vfsoptlist *opts, const char **legal)
1396{
1397	struct vfsopt *opt;
1398	char errmsg[255];
1399	const char **t, *p, *q;
1400	int ret = 0;
1401
1402	TAILQ_FOREACH(opt, opts, link) {
1403		p = opt->name;
1404		q = NULL;
1405		if (p[0] == 'n' && p[1] == 'o')
1406			q = p + 2;
1407		for(t = global_opts; *t != NULL; t++) {
1408			if (strcmp(*t, p) == 0)
1409				break;
1410			if (q != NULL) {
1411				if (strcmp(*t, q) == 0)
1412					break;
1413			}
1414		}
1415		if (*t != NULL)
1416			continue;
1417		for(t = legal; *t != NULL; t++) {
1418			if (strcmp(*t, p) == 0)
1419				break;
1420			if (q != NULL) {
1421				if (strcmp(*t, q) == 0)
1422					break;
1423			}
1424		}
1425		if (*t != NULL)
1426			continue;
1427		snprintf(errmsg, sizeof(errmsg),
1428		    "mount option <%s> is unknown", p);
1429		ret = EINVAL;
1430	}
1431	if (ret != 0) {
1432		TAILQ_FOREACH(opt, opts, link) {
1433			if (strcmp(opt->name, "errmsg") == 0) {
1434				strncpy((char *)opt->value, errmsg, opt->len);
1435				break;
1436			}
1437		}
1438		if (opt == NULL)
1439			printf("%s\n", errmsg);
1440	}
1441	return (ret);
1442}
1443
1444/*
1445 * Get a mount option by its name.
1446 *
1447 * Return 0 if the option was found, ENOENT otherwise.
1448 * If len is non-NULL it will be filled with the length
1449 * of the option. If buf is non-NULL, it will be filled
1450 * with the address of the option.
1451 */
1452int
1453vfs_getopt(opts, name, buf, len)
1454	struct vfsoptlist *opts;
1455	const char *name;
1456	void **buf;
1457	int *len;
1458{
1459	struct vfsopt *opt;
1460
1461	KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
1462
1463	TAILQ_FOREACH(opt, opts, link) {
1464		if (strcmp(name, opt->name) == 0) {
1465			opt->seen = 1;
1466			if (len != NULL)
1467				*len = opt->len;
1468			if (buf != NULL)
1469				*buf = opt->value;
1470			return (0);
1471		}
1472	}
1473	return (ENOENT);
1474}
1475
1476int
1477vfs_getopt_pos(struct vfsoptlist *opts, const char *name)
1478{
1479	struct vfsopt *opt;
1480
1481	if (opts == NULL)
1482		return (-1);
1483
1484	TAILQ_FOREACH(opt, opts, link) {
1485		if (strcmp(name, opt->name) == 0) {
1486			opt->seen = 1;
1487			return (opt->pos);
1488		}
1489	}
1490	return (-1);
1491}
1492
1493char *
1494vfs_getopts(struct vfsoptlist *opts, const char *name, int *error)
1495{
1496	struct vfsopt *opt;
1497
1498	*error = 0;
1499	TAILQ_FOREACH(opt, opts, link) {
1500		if (strcmp(name, opt->name) != 0)
1501			continue;
1502		opt->seen = 1;
1503		if (opt->len == 0 ||
1504		    ((char *)opt->value)[opt->len - 1] != '\0') {
1505			*error = EINVAL;
1506			return (NULL);
1507		}
1508		return (opt->value);
1509	}
1510	*error = ENOENT;
1511	return (NULL);
1512}
1513
1514int
1515vfs_flagopt(struct vfsoptlist *opts, const char *name, u_int *w, u_int val)
1516{
1517	struct vfsopt *opt;
1518
1519	TAILQ_FOREACH(opt, opts, link) {
1520		if (strcmp(name, opt->name) == 0) {
1521			opt->seen = 1;
1522			if (w != NULL)
1523				*w |= val;
1524			return (1);
1525		}
1526	}
1527	if (w != NULL)
1528		*w &= ~val;
1529	return (0);
1530}
1531
1532int
1533vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...)
1534{
1535	va_list ap;
1536	struct vfsopt *opt;
1537	int ret;
1538
1539	KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
1540
1541	TAILQ_FOREACH(opt, opts, link) {
1542		if (strcmp(name, opt->name) != 0)
1543			continue;
1544		opt->seen = 1;
1545		if (opt->len == 0 || opt->value == NULL)
1546			return (0);
1547		if (((char *)opt->value)[opt->len - 1] != '\0')
1548			return (0);
1549		va_start(ap, fmt);
1550		ret = vsscanf(opt->value, fmt, ap);
1551		va_end(ap);
1552		return (ret);
1553	}
1554	return (0);
1555}
1556
1557int
1558vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len)
1559{
1560	struct vfsopt *opt;
1561
1562	TAILQ_FOREACH(opt, opts, link) {
1563		if (strcmp(name, opt->name) != 0)
1564			continue;
1565		opt->seen = 1;
1566		if (opt->value == NULL)
1567			opt->len = len;
1568		else {
1569			if (opt->len != len)
1570				return (EINVAL);
1571			bcopy(value, opt->value, len);
1572		}
1573		return (0);
1574	}
1575	return (ENOENT);
1576}
1577
1578int
1579vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len)
1580{
1581	struct vfsopt *opt;
1582
1583	TAILQ_FOREACH(opt, opts, link) {
1584		if (strcmp(name, opt->name) != 0)
1585			continue;
1586		opt->seen = 1;
1587		if (opt->value == NULL)
1588			opt->len = len;
1589		else {
1590			if (opt->len < len)
1591				return (EINVAL);
1592			opt->len = len;
1593			bcopy(value, opt->value, len);
1594		}
1595		return (0);
1596	}
1597	return (ENOENT);
1598}
1599
1600int
1601vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value)
1602{
1603	struct vfsopt *opt;
1604
1605	TAILQ_FOREACH(opt, opts, link) {
1606		if (strcmp(name, opt->name) != 0)
1607			continue;
1608		opt->seen = 1;
1609		if (opt->value == NULL)
1610			opt->len = strlen(value) + 1;
1611		else if (strlcpy(opt->value, value, opt->len) >= opt->len)
1612			return (EINVAL);
1613		return (0);
1614	}
1615	return (ENOENT);
1616}
1617
1618/*
1619 * Find and copy a mount option.
1620 *
1621 * The size of the buffer has to be specified
1622 * in len, if it is not the same length as the
1623 * mount option, EINVAL is returned.
1624 * Returns ENOENT if the option is not found.
1625 */
1626int
1627vfs_copyopt(opts, name, dest, len)
1628	struct vfsoptlist *opts;
1629	const char *name;
1630	void *dest;
1631	int len;
1632{
1633	struct vfsopt *opt;
1634
1635	KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL"));
1636
1637	TAILQ_FOREACH(opt, opts, link) {
1638		if (strcmp(name, opt->name) == 0) {
1639			opt->seen = 1;
1640			if (len != opt->len)
1641				return (EINVAL);
1642			bcopy(opt->value, dest, opt->len);
1643			return (0);
1644		}
1645	}
1646	return (ENOENT);
1647}
1648
1649/*
1650 * This is a helper function for filesystems to traverse their
1651 * vnodes.  See MNT_VNODE_FOREACH() in sys/mount.h
1652 */
1653
1654struct vnode *
1655__mnt_vnode_next(struct vnode **mvp, struct mount *mp)
1656{
1657	struct vnode *vp;
1658
1659	mtx_assert(MNT_MTX(mp), MA_OWNED);
1660
1661	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
1662	if ((*mvp)->v_yield++ == 500) {
1663		MNT_IUNLOCK(mp);
1664		(*mvp)->v_yield = 0;
1665		uio_yield();
1666		MNT_ILOCK(mp);
1667	}
1668	vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
1669	while (vp != NULL && vp->v_type == VMARKER)
1670		vp = TAILQ_NEXT(vp, v_nmntvnodes);
1671
1672	/* Check if we are done */
1673	if (vp == NULL) {
1674		__mnt_vnode_markerfree(mvp, mp);
1675		return (NULL);
1676	}
1677	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
1678	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
1679	return (vp);
1680}
1681
1682struct vnode *
1683__mnt_vnode_first(struct vnode **mvp, struct mount *mp)
1684{
1685	struct vnode *vp;
1686
1687	mtx_assert(MNT_MTX(mp), MA_OWNED);
1688
1689	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
1690	while (vp != NULL && vp->v_type == VMARKER)
1691		vp = TAILQ_NEXT(vp, v_nmntvnodes);
1692
1693	/* Check if we are done */
1694	if (vp == NULL) {
1695		*mvp = NULL;
1696		return (NULL);
1697	}
1698	MNT_REF(mp);
1699	MNT_IUNLOCK(mp);
1700	*mvp = (struct vnode *) malloc(sizeof(struct vnode),
1701				       M_VNODE_MARKER,
1702				       M_WAITOK | M_ZERO);
1703	MNT_ILOCK(mp);
1704	(*mvp)->v_type = VMARKER;
1705
1706	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
1707	while (vp != NULL && vp->v_type == VMARKER)
1708		vp = TAILQ_NEXT(vp, v_nmntvnodes);
1709
1710	/* Check if we are done */
1711	if (vp == NULL) {
1712		MNT_IUNLOCK(mp);
1713		free(*mvp, M_VNODE_MARKER);
1714		MNT_ILOCK(mp);
1715		*mvp = NULL;
1716		MNT_REL(mp);
1717		return (NULL);
1718	}
1719	(*mvp)->v_mount = mp;
1720	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
1721	return (vp);
1722}
1723
1724
1725void
1726__mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp)
1727{
1728
1729	if (*mvp == NULL)
1730		return;
1731
1732	mtx_assert(MNT_MTX(mp), MA_OWNED);
1733
1734	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
1735	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
1736	MNT_IUNLOCK(mp);
1737	free(*mvp, M_VNODE_MARKER);
1738	MNT_ILOCK(mp);
1739	*mvp = NULL;
1740	MNT_REL(mp);
1741}
1742
1743
1744int
1745__vfs_statfs(struct mount *mp, struct statfs *sbp)
1746{
1747	int error;
1748
1749	error = mp->mnt_op->vfs_statfs(mp, &mp->mnt_stat);
1750	if (sbp != &mp->mnt_stat)
1751		*sbp = mp->mnt_stat;
1752	return (error);
1753}
1754
1755void
1756vfs_mountedfrom(struct mount *mp, const char *from)
1757{
1758
1759	bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname);
1760	strlcpy(mp->mnt_stat.f_mntfromname, from,
1761	    sizeof mp->mnt_stat.f_mntfromname);
1762}
1763
/*
 * ---------------------------------------------------------------------
 * This is the API for building mount args and mounting filesystems from
 * inside the kernel.
 *
 * The API works by accumulating individual args.  The first error is
 * latched.
 *
 * XXX: should be documented in a new manpage, kernel_mount(9).
 */
1774
/*
 * A memory allocation which must be freed when we are done.
 *
 * The structure holds only the list linkage.  mount_argf() and
 * mount_argsu() allocate extra bytes after it and use the space starting
 * at (maa + 1) for the argument data.
 */
struct mntaarg {
	SLIST_ENTRY(mntaarg)	next;	/* linkage on mntarg.list */
};
1779
/* The header for the mount arguments */
struct mntarg {
	struct iovec *v;	/* name/value iovec array, grown with realloc() */
	int len;		/* number of iovec entries used in v */
	int error;		/* latched error; once set, no more args are added */
	SLIST_HEAD(, mntaarg)	list;	/* allocations freed by free_mntarg() */
};
1787
/*
 * Add a boolean argument.
 *
 * 'name' must be the negative spelling of the option and therefore start
 * with "no".  When 'flag' is true the option is added without the "no"
 * prefix; when it is false the full name is added.  The argument carries
 * no value.
 */
struct mntarg *
mount_argb(struct mntarg *ma, int flag, const char *name)
{
	const char *optname;

	KASSERT(name[0] == 'n' && name[1] == 'o',
	    ("mount_argb(...,%s): name must start with 'no'", name));

	optname = flag ? name + 2 : name;
	return (mount_arg(ma, optname, NULL, 0));
}
1803
1804/*
1805 * Add an argument printf style
1806 */
1807struct mntarg *
1808mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...)
1809{
1810	va_list ap;
1811	struct mntaarg *maa;
1812	struct sbuf *sb;
1813	int len;
1814
1815	if (ma == NULL) {
1816		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
1817		SLIST_INIT(&ma->list);
1818	}
1819	if (ma->error)
1820		return (ma);
1821
1822	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
1823	    M_MOUNT, M_WAITOK);
1824	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
1825	ma->v[ma->len].iov_len = strlen(name) + 1;
1826	ma->len++;
1827
1828	sb = sbuf_new_auto();
1829	va_start(ap, fmt);
1830	sbuf_vprintf(sb, fmt, ap);
1831	va_end(ap);
1832	sbuf_finish(sb);
1833	len = sbuf_len(sb) + 1;
1834	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
1835	SLIST_INSERT_HEAD(&ma->list, maa, next);
1836	bcopy(sbuf_data(sb), maa + 1, len);
1837	sbuf_delete(sb);
1838
1839	ma->v[ma->len].iov_base = maa + 1;
1840	ma->v[ma->len].iov_len = len;
1841	ma->len++;
1842
1843	return (ma);
1844}
1845
1846/*
1847 * Add an argument which is a userland string.
1848 */
1849struct mntarg *
1850mount_argsu(struct mntarg *ma, const char *name, const void *val, int len)
1851{
1852	struct mntaarg *maa;
1853	char *tbuf;
1854
1855	if (val == NULL)
1856		return (ma);
1857	if (ma == NULL) {
1858		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
1859		SLIST_INIT(&ma->list);
1860	}
1861	if (ma->error)
1862		return (ma);
1863	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
1864	SLIST_INSERT_HEAD(&ma->list, maa, next);
1865	tbuf = (void *)(maa + 1);
1866	ma->error = copyinstr(val, tbuf, len, NULL);
1867	return (mount_arg(ma, name, tbuf, -1));
1868}
1869
1870/*
1871 * Plain argument.
1872 *
1873 * If length is -1, treat value as a C string.
1874 */
1875struct mntarg *
1876mount_arg(struct mntarg *ma, const char *name, const void *val, int len)
1877{
1878
1879	if (ma == NULL) {
1880		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
1881		SLIST_INIT(&ma->list);
1882	}
1883	if (ma->error)
1884		return (ma);
1885
1886	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
1887	    M_MOUNT, M_WAITOK);
1888	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
1889	ma->v[ma->len].iov_len = strlen(name) + 1;
1890	ma->len++;
1891
1892	ma->v[ma->len].iov_base = (void *)(uintptr_t)val;
1893	if (len < 0)
1894		ma->v[ma->len].iov_len = strlen(val) + 1;
1895	else
1896		ma->v[ma->len].iov_len = len;
1897	ma->len++;
1898	return (ma);
1899}
1900
1901/*
1902 * Free a mntarg structure
1903 */
1904static void
1905free_mntarg(struct mntarg *ma)
1906{
1907	struct mntaarg *maa;
1908
1909	while (!SLIST_EMPTY(&ma->list)) {
1910		maa = SLIST_FIRST(&ma->list);
1911		SLIST_REMOVE_HEAD(&ma->list, next);
1912		free(maa, M_MOUNT);
1913	}
1914	free(ma->v, M_MOUNT);
1915	free(ma, M_MOUNT);
1916}
1917
1918/*
1919 * Mount a filesystem
1920 */
1921int
1922kernel_mount(struct mntarg *ma, int flags)
1923{
1924	struct uio auio;
1925	int error;
1926
1927	KASSERT(ma != NULL, ("kernel_mount NULL ma"));
1928	KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v"));
1929	KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len));
1930
1931	auio.uio_iov = ma->v;
1932	auio.uio_iovcnt = ma->len;
1933	auio.uio_segflg = UIO_SYSSPACE;
1934
1935	error = ma->error;
1936	if (!error)
1937		error = vfs_donmount(curthread, flags, &auio);
1938	free_mntarg(ma);
1939	return (error);
1940}
1941
/*
 * A varargs front end for mounting a filesystem.
 *
 * After 'flags' the caller passes (const char *name, const void *value)
 * pairs, terminated by a NULL name.  A non-NULL value is treated as a C
 * string; a NULL value adds the name with a zero-length value.
 */
int
kernel_vmount(int flags, ...)
{
	struct mntarg *ma;
	va_list ap;
	const char *optname;
	const void *optval;

	ma = NULL;
	va_start(ap, flags);
	while ((optname = va_arg(ap, const char *)) != NULL) {
		optval = va_arg(ap, const void *);
		ma = mount_arg(ma, optname, optval, (optval != NULL ? -1 : 0));
	}
	va_end(ap);

	return (kernel_mount(ma, flags));
}
1967
1968void
1969vfs_oexport_conv(const struct oexport_args *oexp, struct export_args *exp)
1970{
1971
1972	bcopy(oexp, exp, sizeof(*oexp));
1973	exp->ex_numsecflavors = 0;
1974}
1975