vfs_mount.c revision 218852
1/*-
2 * Copyright (c) 1999-2004 Poul-Henning Kamp
3 * Copyright (c) 1999 Michael Smith
4 * Copyright (c) 1989, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 * (c) UNIX System Laboratories, Inc.
7 * All or some portions of this file are derived from material licensed
8 * to the University of California by American Telephone and Telegraph
9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 * the permission of UNIX System Laboratories, Inc.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 *    notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 *    notice, this list of conditions and the following disclaimer in the
19 *    documentation and/or other materials provided with the distribution.
20 * 4. Neither the name of the University nor the names of its contributors
21 *    may be used to endorse or promote products derived from this software
22 *    without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 */
36
37#include <sys/cdefs.h>
38__FBSDID("$FreeBSD: head/sys/kern/vfs_mount.c 218852 2011-02-19 14:27:14Z jh $");
39
40#include <sys/param.h>
41#include <sys/conf.h>
42#include <sys/fcntl.h>
43#include <sys/jail.h>
44#include <sys/kernel.h>
45#include <sys/libkern.h>
46#include <sys/malloc.h>
47#include <sys/mount.h>
48#include <sys/mutex.h>
49#include <sys/namei.h>
50#include <sys/priv.h>
51#include <sys/proc.h>
52#include <sys/filedesc.h>
53#include <sys/reboot.h>
54#include <sys/syscallsubr.h>
55#include <sys/sysproto.h>
56#include <sys/sx.h>
57#include <sys/sysctl.h>
58#include <sys/sysent.h>
59#include <sys/systm.h>
60#include <sys/vnode.h>
61#include <vm/uma.h>
62
63#include <geom/geom.h>
64
65#include <machine/stdarg.h>
66
67#include <security/audit/audit.h>
68#include <security/mac/mac_framework.h>
69
70#define	VFS_MOUNTARG_SIZE_MAX	(1024 * 64)
71
72static int	vfs_domount(struct thread *td, const char *fstype,
73		    char *fspath, int fsflags, struct vfsoptlist **optlist);
74static void	free_mntarg(struct mntarg *ma);
75
76static int	usermount = 0;
77SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
78    "Unprivileged users may mount and unmount file systems");
79
80MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
81MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
82static uma_zone_t mount_zone;
83
84/* List of mounted filesystems. */
85struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
86
87/* For any iteration/modification of mountlist */
88struct mtx mountlist_mtx;
89MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF);
90
91/*
92 * Global opts, taken by all filesystems
93 */
94static const char *global_opts[] = {
95	"errmsg",
96	"fstype",
97	"fspath",
98	"ro",
99	"rw",
100	"nosuid",
101	"noexec",
102	NULL
103};
104
105static int
106mount_init(void *mem, int size, int flags)
107{
108	struct mount *mp;
109
110	mp = (struct mount *)mem;
111	mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
112	lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
113	return (0);
114}
115
116static void
117mount_fini(void *mem, int size)
118{
119	struct mount *mp;
120
121	mp = (struct mount *)mem;
122	lockdestroy(&mp->mnt_explock);
123	mtx_destroy(&mp->mnt_mtx);
124}
125
126static void
127vfs_mount_init(void *dummy __unused)
128{
129
130	mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL,
131	    NULL, mount_init, mount_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
132}
133SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL);
134
135/*
136 * ---------------------------------------------------------------------
137 * Functions for building and sanitizing the mount options
138 */
139
140/* Remove one mount option. */
141static void
142vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt)
143{
144
145	TAILQ_REMOVE(opts, opt, link);
146	free(opt->name, M_MOUNT);
147	if (opt->value != NULL)
148		free(opt->value, M_MOUNT);
149	free(opt, M_MOUNT);
150}
151
152/* Release all resources related to the mount options. */
153void
154vfs_freeopts(struct vfsoptlist *opts)
155{
156	struct vfsopt *opt;
157
158	while (!TAILQ_EMPTY(opts)) {
159		opt = TAILQ_FIRST(opts);
160		vfs_freeopt(opts, opt);
161	}
162	free(opts, M_MOUNT);
163}
164
165void
166vfs_deleteopt(struct vfsoptlist *opts, const char *name)
167{
168	struct vfsopt *opt, *temp;
169
170	if (opts == NULL)
171		return;
172	TAILQ_FOREACH_SAFE(opt, opts, link, temp)  {
173		if (strcmp(opt->name, name) == 0)
174			vfs_freeopt(opts, opt);
175	}
176}
177
/*
 * Return non-zero when exactly one of 'a' and 'b' is the other one with
 * a leading "no" prepended ("noopt" vs. "opt" or "opt" vs. "noopt").
 */
static int
vfs_opt_negates(const char *a, const char *b)
{

	if (strncmp(a, "no", 2) == 0 && strcmp(a + 2, b) == 0)
		return (1);
	if (strncmp(b, "no", 2) == 0 && strcmp(a, b + 2) == 0)
		return (1);
	return (0);
}

/*
 * Check if options are equal (with or without the "no" prefix).
 *
 * Two names match when they are identical, or when they differ only by
 * a "no" prefix on the whole name or on the part following a shared
 * dotted prefix ("foo.noopt" vs. "foo.opt").  Returns 1 on a match and
 * 0 otherwise.
 */
static int
vfs_equalopts(const char *opt1, const char *opt2)
{
	const char *dot;
	size_t prefixlen;

	/* "opt" vs. "opt" or "noopt" vs. "noopt" */
	if (strcmp(opt1, opt2) == 0)
		return (1);
	/* "noopt" vs. "opt" and "opt" vs. "noopt" */
	if (vfs_opt_negates(opt1, opt2))
		return (1);

	/*
	 * Peel off one "component." at a time for as long as both names
	 * share it, and retry the "no" comparison on the remainders.
	 */
	for (;;) {
		dot = strchr(opt1, '.');
		if (dot == NULL)
			break;
		prefixlen = (size_t)(dot - opt1) + 1;	/* include the '.' */
		if (strncmp(opt1, opt2, prefixlen) != 0)
			break;
		opt1 += prefixlen;
		opt2 += prefixlen;
		/* "foo.noopt" vs. "foo.opt" and "foo.opt" vs. "foo.noopt" */
		if (vfs_opt_negates(opt1, opt2))
			return (1);
	}
	return (0);
}
208
209/*
210 * If a mount option is specified several times,
211 * (with or without the "no" prefix) only keep
212 * the last occurence of it.
213 */
214static void
215vfs_sanitizeopts(struct vfsoptlist *opts)
216{
217	struct vfsopt *opt, *opt2, *tmp;
218
219	TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) {
220		opt2 = TAILQ_PREV(opt, vfsoptlist, link);
221		while (opt2 != NULL) {
222			if (vfs_equalopts(opt->name, opt2->name)) {
223				tmp = TAILQ_PREV(opt2, vfsoptlist, link);
224				vfs_freeopt(opts, opt2);
225				opt2 = tmp;
226			} else {
227				opt2 = TAILQ_PREV(opt2, vfsoptlist, link);
228			}
229		}
230	}
231}
232
233/*
234 * Build a linked list of mount options from a struct uio.
235 */
236int
237vfs_buildopts(struct uio *auio, struct vfsoptlist **options)
238{
239	struct vfsoptlist *opts;
240	struct vfsopt *opt;
241	size_t memused, namelen, optlen;
242	unsigned int i, iovcnt;
243	int error;
244
245	opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
246	TAILQ_INIT(opts);
247	memused = 0;
248	iovcnt = auio->uio_iovcnt;
249	for (i = 0; i < iovcnt; i += 2) {
250		namelen = auio->uio_iov[i].iov_len;
251		optlen = auio->uio_iov[i + 1].iov_len;
252		memused += sizeof(struct vfsopt) + optlen + namelen;
253		/*
254		 * Avoid consuming too much memory, and attempts to overflow
255		 * memused.
256		 */
257		if (memused > VFS_MOUNTARG_SIZE_MAX ||
258		    optlen > VFS_MOUNTARG_SIZE_MAX ||
259		    namelen > VFS_MOUNTARG_SIZE_MAX) {
260			error = EINVAL;
261			goto bad;
262		}
263
264		opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
265		opt->name = malloc(namelen, M_MOUNT, M_WAITOK);
266		opt->value = NULL;
267		opt->len = 0;
268		opt->pos = i / 2;
269		opt->seen = 0;
270
271		/*
272		 * Do this early, so jumps to "bad" will free the current
273		 * option.
274		 */
275		TAILQ_INSERT_TAIL(opts, opt, link);
276
277		if (auio->uio_segflg == UIO_SYSSPACE) {
278			bcopy(auio->uio_iov[i].iov_base, opt->name, namelen);
279		} else {
280			error = copyin(auio->uio_iov[i].iov_base, opt->name,
281			    namelen);
282			if (error)
283				goto bad;
284		}
285		/* Ensure names are null-terminated strings. */
286		if (namelen == 0 || opt->name[namelen - 1] != '\0') {
287			error = EINVAL;
288			goto bad;
289		}
290		if (optlen != 0) {
291			opt->len = optlen;
292			opt->value = malloc(optlen, M_MOUNT, M_WAITOK);
293			if (auio->uio_segflg == UIO_SYSSPACE) {
294				bcopy(auio->uio_iov[i + 1].iov_base, opt->value,
295				    optlen);
296			} else {
297				error = copyin(auio->uio_iov[i + 1].iov_base,
298				    opt->value, optlen);
299				if (error)
300					goto bad;
301			}
302		}
303	}
304	vfs_sanitizeopts(opts);
305	*options = opts;
306	return (0);
307bad:
308	vfs_freeopts(opts);
309	return (error);
310}
311
312/*
313 * Merge the old mount options with the new ones passed
314 * in the MNT_UPDATE case.
315 *
316 * XXX This function will keep a "nofoo" option in the
317 *     new options if there is no matching "foo" option
318 *     to be cancelled in the old options.  This is a bug
319 *     if the option's canonical name is "foo".  E.g., "noro"
320 *     shouldn't end up in the mount point's active options,
321 *     but it can.
322 */
323static void
324vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *opts)
325{
326	struct vfsopt *opt, *opt2, *new;
327
328	TAILQ_FOREACH(opt, opts, link) {
329		/*
330		 * Check that this option hasn't been redefined
331		 * nor cancelled with a "no" mount option.
332		 */
333		opt2 = TAILQ_FIRST(toopts);
334		while (opt2 != NULL) {
335			if (strcmp(opt2->name, opt->name) == 0)
336				goto next;
337			if (strncmp(opt2->name, "no", 2) == 0 &&
338			    strcmp(opt2->name + 2, opt->name) == 0) {
339				vfs_freeopt(toopts, opt2);
340				goto next;
341			}
342			opt2 = TAILQ_NEXT(opt2, link);
343		}
344		/* We want this option, duplicate it. */
345		new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
346		new->name = malloc(strlen(opt->name) + 1, M_MOUNT, M_WAITOK);
347		strcpy(new->name, opt->name);
348		if (opt->len != 0) {
349			new->value = malloc(opt->len, M_MOUNT, M_WAITOK);
350			bcopy(opt->value, new->value, opt->len);
351		} else {
352			new->value = NULL;
353		}
354		new->len = opt->len;
355		new->seen = opt->seen;
356		TAILQ_INSERT_TAIL(toopts, new, link);
357next:
358		continue;
359	}
360}
361
362/*
363 * Mount a filesystem.
364 */
365int
366nmount(td, uap)
367	struct thread *td;
368	struct nmount_args /* {
369		struct iovec *iovp;
370		unsigned int iovcnt;
371		int flags;
372	} */ *uap;
373{
374	struct uio *auio;
375	int error;
376	u_int iovcnt;
377
378	AUDIT_ARG_FFLAGS(uap->flags);
379	CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__,
380	    uap->iovp, uap->iovcnt, uap->flags);
381
382	/*
383	 * Filter out MNT_ROOTFS.  We do not want clients of nmount() in
384	 * userspace to set this flag, but we must filter it out if we want
385	 * MNT_UPDATE on the root file system to work.
386	 * MNT_ROOTFS should only be set by the kernel when mounting its
387	 * root file system.
388	 */
389	uap->flags &= ~MNT_ROOTFS;
390
391	iovcnt = uap->iovcnt;
392	/*
393	 * Check that we have an even number of iovec's
394	 * and that we have at least two options.
395	 */
396	if ((iovcnt & 1) || (iovcnt < 4)) {
397		CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__,
398		    uap->iovcnt);
399		return (EINVAL);
400	}
401
402	error = copyinuio(uap->iovp, iovcnt, &auio);
403	if (error) {
404		CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno",
405		    __func__, error);
406		return (error);
407	}
408	error = vfs_donmount(td, uap->flags, auio);
409
410	free(auio, M_IOV);
411	return (error);
412}
413
414/*
415 * ---------------------------------------------------------------------
416 * Various utility functions
417 */
418
419void
420vfs_ref(struct mount *mp)
421{
422
423	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
424	MNT_ILOCK(mp);
425	MNT_REF(mp);
426	MNT_IUNLOCK(mp);
427}
428
429void
430vfs_rel(struct mount *mp)
431{
432
433	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
434	MNT_ILOCK(mp);
435	MNT_REL(mp);
436	MNT_IUNLOCK(mp);
437}
438
439/*
440 * Allocate and initialize the mount point struct.
441 */
442struct mount *
443vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath,
444    struct ucred *cred)
445{
446	struct mount *mp;
447
448	mp = uma_zalloc(mount_zone, M_WAITOK);
449	bzero(&mp->mnt_startzero,
450	    __rangeof(struct mount, mnt_startzero, mnt_endzero));
451	TAILQ_INIT(&mp->mnt_nvnodelist);
452	mp->mnt_nvnodelistsize = 0;
453	mp->mnt_ref = 0;
454	(void) vfs_busy(mp, MBF_NOWAIT);
455	mp->mnt_op = vfsp->vfc_vfsops;
456	mp->mnt_vfc = vfsp;
457	vfsp->vfc_refcount++;	/* XXX Unlocked */
458	mp->mnt_stat.f_type = vfsp->vfc_typenum;
459	mp->mnt_gen++;
460	strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
461	mp->mnt_vnodecovered = vp;
462	mp->mnt_cred = crdup(cred);
463	mp->mnt_stat.f_owner = cred->cr_uid;
464	strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN);
465	mp->mnt_iosize_max = DFLTPHYS;
466#ifdef MAC
467	mac_mount_init(mp);
468	mac_mount_create(cred, mp);
469#endif
470	arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
471	return (mp);
472}
473
474/*
475 * Destroy the mount struct previously allocated by vfs_mount_alloc().
476 */
477void
478vfs_mount_destroy(struct mount *mp)
479{
480
481	MNT_ILOCK(mp);
482	mp->mnt_kern_flag |= MNTK_REFEXPIRE;
483	if (mp->mnt_kern_flag & MNTK_MWAIT) {
484		mp->mnt_kern_flag &= ~MNTK_MWAIT;
485		wakeup(mp);
486	}
487	while (mp->mnt_ref)
488		msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0);
489	KASSERT(mp->mnt_ref == 0,
490	    ("%s: invalid refcount in the drain path @ %s:%d", __func__,
491	    __FILE__, __LINE__));
492	if (mp->mnt_writeopcount != 0)
493		panic("vfs_mount_destroy: nonzero writeopcount");
494	if (mp->mnt_secondary_writes != 0)
495		panic("vfs_mount_destroy: nonzero secondary_writes");
496	mp->mnt_vfc->vfc_refcount--;
497	if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) {
498		struct vnode *vp;
499
500		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes)
501			vprint("", vp);
502		panic("unmount: dangling vnode");
503	}
504	if (mp->mnt_nvnodelistsize != 0)
505		panic("vfs_mount_destroy: nonzero nvnodelistsize");
506	if (mp->mnt_lockref != 0)
507		panic("vfs_mount_destroy: nonzero lock refcount");
508	MNT_IUNLOCK(mp);
509#ifdef MAC
510	mac_mount_destroy(mp);
511#endif
512	if (mp->mnt_opt != NULL)
513		vfs_freeopts(mp->mnt_opt);
514	crfree(mp->mnt_cred);
515	uma_zfree(mount_zone, mp);
516}
517
518int
519vfs_donmount(struct thread *td, int fsflags, struct uio *fsoptions)
520{
521	struct vfsoptlist *optlist;
522	struct vfsopt *opt, *noro_opt, *tmp_opt;
523	char *fstype, *fspath, *errmsg;
524	int error, fstypelen, fspathlen, errmsg_len, errmsg_pos;
525	int has_rw, has_noro;
526
527	errmsg = fspath = NULL;
528	errmsg_len = has_noro = has_rw = fspathlen = 0;
529	errmsg_pos = -1;
530
531	error = vfs_buildopts(fsoptions, &optlist);
532	if (error)
533		return (error);
534
535	if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0)
536		errmsg_pos = vfs_getopt_pos(optlist, "errmsg");
537
538	/*
539	 * We need these two options before the others,
540	 * and they are mandatory for any filesystem.
541	 * Ensure they are NUL terminated as well.
542	 */
543	fstypelen = 0;
544	error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen);
545	if (error || fstype[fstypelen - 1] != '\0') {
546		error = EINVAL;
547		if (errmsg != NULL)
548			strncpy(errmsg, "Invalid fstype", errmsg_len);
549		goto bail;
550	}
551	fspathlen = 0;
552	error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen);
553	if (error || fspath[fspathlen - 1] != '\0') {
554		error = EINVAL;
555		if (errmsg != NULL)
556			strncpy(errmsg, "Invalid fspath", errmsg_len);
557		goto bail;
558	}
559
560	/*
561	 * We need to see if we have the "update" option
562	 * before we call vfs_domount(), since vfs_domount() has special
563	 * logic based on MNT_UPDATE.  This is very important
564	 * when we want to update the root filesystem.
565	 */
566	TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) {
567		if (strcmp(opt->name, "update") == 0) {
568			fsflags |= MNT_UPDATE;
569			vfs_freeopt(optlist, opt);
570		}
571		else if (strcmp(opt->name, "async") == 0)
572			fsflags |= MNT_ASYNC;
573		else if (strcmp(opt->name, "force") == 0) {
574			fsflags |= MNT_FORCE;
575			vfs_freeopt(optlist, opt);
576		}
577		else if (strcmp(opt->name, "reload") == 0) {
578			fsflags |= MNT_RELOAD;
579			vfs_freeopt(optlist, opt);
580		}
581		else if (strcmp(opt->name, "multilabel") == 0)
582			fsflags |= MNT_MULTILABEL;
583		else if (strcmp(opt->name, "noasync") == 0)
584			fsflags &= ~MNT_ASYNC;
585		else if (strcmp(opt->name, "noatime") == 0)
586			fsflags |= MNT_NOATIME;
587		else if (strcmp(opt->name, "atime") == 0) {
588			free(opt->name, M_MOUNT);
589			opt->name = strdup("nonoatime", M_MOUNT);
590		}
591		else if (strcmp(opt->name, "noclusterr") == 0)
592			fsflags |= MNT_NOCLUSTERR;
593		else if (strcmp(opt->name, "clusterr") == 0) {
594			free(opt->name, M_MOUNT);
595			opt->name = strdup("nonoclusterr", M_MOUNT);
596		}
597		else if (strcmp(opt->name, "noclusterw") == 0)
598			fsflags |= MNT_NOCLUSTERW;
599		else if (strcmp(opt->name, "clusterw") == 0) {
600			free(opt->name, M_MOUNT);
601			opt->name = strdup("nonoclusterw", M_MOUNT);
602		}
603		else if (strcmp(opt->name, "noexec") == 0)
604			fsflags |= MNT_NOEXEC;
605		else if (strcmp(opt->name, "exec") == 0) {
606			free(opt->name, M_MOUNT);
607			opt->name = strdup("nonoexec", M_MOUNT);
608		}
609		else if (strcmp(opt->name, "nosuid") == 0)
610			fsflags |= MNT_NOSUID;
611		else if (strcmp(opt->name, "suid") == 0) {
612			free(opt->name, M_MOUNT);
613			opt->name = strdup("nonosuid", M_MOUNT);
614		}
615		else if (strcmp(opt->name, "nosymfollow") == 0)
616			fsflags |= MNT_NOSYMFOLLOW;
617		else if (strcmp(opt->name, "symfollow") == 0) {
618			free(opt->name, M_MOUNT);
619			opt->name = strdup("nonosymfollow", M_MOUNT);
620		}
621		else if (strcmp(opt->name, "noro") == 0) {
622			fsflags &= ~MNT_RDONLY;
623			has_noro = 1;
624		}
625		else if (strcmp(opt->name, "rw") == 0) {
626			fsflags &= ~MNT_RDONLY;
627			has_rw = 1;
628		}
629		else if (strcmp(opt->name, "ro") == 0)
630			fsflags |= MNT_RDONLY;
631		else if (strcmp(opt->name, "rdonly") == 0) {
632			free(opt->name, M_MOUNT);
633			opt->name = strdup("ro", M_MOUNT);
634			fsflags |= MNT_RDONLY;
635		}
636		else if (strcmp(opt->name, "suiddir") == 0)
637			fsflags |= MNT_SUIDDIR;
638		else if (strcmp(opt->name, "sync") == 0)
639			fsflags |= MNT_SYNCHRONOUS;
640		else if (strcmp(opt->name, "union") == 0)
641			fsflags |= MNT_UNION;
642	}
643
644	/*
645	 * If "rw" was specified as a mount option, and we
646	 * are trying to update a mount-point from "ro" to "rw",
647	 * we need a mount option "noro", since in vfs_mergeopts(),
648	 * "noro" will cancel "ro", but "rw" will not do anything.
649	 */
650	if (has_rw && !has_noro) {
651		noro_opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
652		noro_opt->name = strdup("noro", M_MOUNT);
653		noro_opt->value = NULL;
654		noro_opt->len = 0;
655		noro_opt->pos = -1;
656		noro_opt->seen = 1;
657		TAILQ_INSERT_TAIL(optlist, noro_opt, link);
658	}
659
660	/*
661	 * Be ultra-paranoid about making sure the type and fspath
662	 * variables will fit in our mp buffers, including the
663	 * terminating NUL.
664	 */
665	if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) {
666		error = ENAMETOOLONG;
667		goto bail;
668	}
669
670	error = vfs_domount(td, fstype, fspath, fsflags, &optlist);
671bail:
672	/* copyout the errmsg */
673	if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt)
674	    && errmsg_len > 0 && errmsg != NULL) {
675		if (fsoptions->uio_segflg == UIO_SYSSPACE) {
676			bcopy(errmsg,
677			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
678			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
679		} else {
680			copyout(errmsg,
681			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
682			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
683		}
684	}
685
686	if (optlist != NULL)
687		vfs_freeopts(optlist);
688	return (error);
689}
690
691/*
692 * Old mount API.
693 */
694#ifndef _SYS_SYSPROTO_H_
695struct mount_args {
696	char	*type;
697	char	*path;
698	int	flags;
699	caddr_t	data;
700};
701#endif
702/* ARGSUSED */
703int
704mount(td, uap)
705	struct thread *td;
706	struct mount_args /* {
707		char *type;
708		char *path;
709		int flags;
710		caddr_t data;
711	} */ *uap;
712{
713	char *fstype;
714	struct vfsconf *vfsp = NULL;
715	struct mntarg *ma = NULL;
716	int error;
717
718	AUDIT_ARG_FFLAGS(uap->flags);
719
720	/*
721	 * Filter out MNT_ROOTFS.  We do not want clients of mount() in
722	 * userspace to set this flag, but we must filter it out if we want
723	 * MNT_UPDATE on the root file system to work.
724	 * MNT_ROOTFS should only be set by the kernel when mounting its
725	 * root file system.
726	 */
727	uap->flags &= ~MNT_ROOTFS;
728
729	fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK);
730	error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL);
731	if (error) {
732		free(fstype, M_TEMP);
733		return (error);
734	}
735
736	AUDIT_ARG_TEXT(fstype);
737	mtx_lock(&Giant);
738	vfsp = vfs_byname_kld(fstype, td, &error);
739	free(fstype, M_TEMP);
740	if (vfsp == NULL) {
741		mtx_unlock(&Giant);
742		return (ENOENT);
743	}
744	if (vfsp->vfc_vfsops->vfs_cmount == NULL) {
745		mtx_unlock(&Giant);
746		return (EOPNOTSUPP);
747	}
748
749	ma = mount_argsu(ma, "fstype", uap->type, MNAMELEN);
750	ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN);
751	ma = mount_argb(ma, uap->flags & MNT_RDONLY, "noro");
752	ma = mount_argb(ma, !(uap->flags & MNT_NOSUID), "nosuid");
753	ma = mount_argb(ma, !(uap->flags & MNT_NOEXEC), "noexec");
754
755	error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, uap->flags);
756	mtx_unlock(&Giant);
757	return (error);
758}
759
760/*
761 * vfs_domount_first(): first file system mount (not update)
762 */
763static int
764vfs_domount_first(
765	struct thread *td,		/* Calling thread. */
766	struct vfsconf *vfsp,		/* File system type. */
767	char *fspath,			/* Mount path. */
768	struct vnode *vp,		/* Vnode to be covered. */
769	int fsflags,			/* Flags common to all filesystems. */
770	struct vfsoptlist **optlist	/* Options local to the filesystem. */
771	)
772{
773	struct vattr va;
774	struct mount *mp;
775	struct vnode *newdp;
776	int error;
777
778	mtx_assert(&Giant, MA_OWNED);
779	ASSERT_VOP_ELOCKED(vp, __func__);
780	KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here"));
781
782	/*
783	 * If the user is not root, ensure that they own the directory
784	 * onto which we are attempting to mount.
785	 */
786	error = VOP_GETATTR(vp, &va, td->td_ucred);
787	if (error == 0 && va.va_uid != td->td_ucred->cr_uid)
788		error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN, 0);
789	if (error == 0)
790		error = vinvalbuf(vp, V_SAVE, 0, 0);
791	if (error == 0 && vp->v_type != VDIR)
792		error = ENOTDIR;
793	if (error == 0) {
794		VI_LOCK(vp);
795		if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL)
796			vp->v_iflag |= VI_MOUNT;
797		else
798			error = EBUSY;
799		VI_UNLOCK(vp);
800	}
801	if (error != 0) {
802		vput(vp);
803		return (error);
804	}
805	VOP_UNLOCK(vp, 0);
806
807	/* Allocate and initialize the filesystem. */
808	mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred);
809	/* XXXMAC: pass to vfs_mount_alloc? */
810	mp->mnt_optnew = *optlist;
811	/* Set the mount level flags. */
812	mp->mnt_flag = (fsflags & (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY));
813
814	/*
815	 * Mount the filesystem.
816	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
817	 * get.  No freeing of cn_pnbuf.
818	 */
819	error = VFS_MOUNT(mp);
820	if (error != 0) {
821		vfs_unbusy(mp);
822		vfs_mount_destroy(mp);
823		VI_LOCK(vp);
824		vp->v_iflag &= ~VI_MOUNT;
825		VI_UNLOCK(vp);
826		vrele(vp);
827		return (error);
828	}
829
830	if (mp->mnt_opt != NULL)
831		vfs_freeopts(mp->mnt_opt);
832	mp->mnt_opt = mp->mnt_optnew;
833	*optlist = NULL;
834	(void)VFS_STATFS(mp, &mp->mnt_stat);
835
836	/*
837	 * Prevent external consumers of mount options from reading mnt_optnew.
838	 */
839	mp->mnt_optnew = NULL;
840
841	MNT_ILOCK(mp);
842	if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
843		mp->mnt_kern_flag |= MNTK_ASYNC;
844	else
845		mp->mnt_kern_flag &= ~MNTK_ASYNC;
846	MNT_IUNLOCK(mp);
847
848	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
849	cache_purge(vp);
850	VI_LOCK(vp);
851	vp->v_iflag &= ~VI_MOUNT;
852	VI_UNLOCK(vp);
853	vp->v_mountedhere = mp;
854	/* Place the new filesystem at the end of the mount list. */
855	mtx_lock(&mountlist_mtx);
856	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
857	mtx_unlock(&mountlist_mtx);
858	vfs_event_signal(NULL, VQ_MOUNT, 0);
859	if (VFS_ROOT(mp, LK_EXCLUSIVE, &newdp))
860		panic("mount: lost mount");
861	VOP_UNLOCK(newdp, 0);
862	VOP_UNLOCK(vp, 0);
863	mountcheckdirs(vp, newdp);
864	vrele(newdp);
865	if ((mp->mnt_flag & MNT_RDONLY) == 0)
866		vfs_allocate_syncvnode(mp);
867	vfs_unbusy(mp);
868	return (0);
869}
870
871/*
872 * vfs_domount_update(): update of mounted file system
873 */
874static int
875vfs_domount_update(
876	struct thread *td,		/* Calling thread. */
877	struct vnode *vp,		/* Mount point vnode. */
878	int fsflags,			/* Flags common to all filesystems. */
879	struct vfsoptlist **optlist	/* Options local to the filesystem. */
880	)
881{
882	struct oexport_args oexport;
883	struct export_args export;
884	struct mount *mp;
885	int error, export_error, flag;
886
887	mtx_assert(&Giant, MA_OWNED);
888	ASSERT_VOP_ELOCKED(vp, __func__);
889	KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here"));
890
891	if ((vp->v_vflag & VV_ROOT) == 0) {
892		vput(vp);
893		return (EINVAL);
894	}
895	mp = vp->v_mount;
896	/*
897	 * We only allow the filesystem to be reloaded if it
898	 * is currently mounted read-only.
899	 */
900	flag = mp->mnt_flag;
901	if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) {
902		vput(vp);
903		return (EOPNOTSUPP);	/* Needs translation */
904	}
905	/*
906	 * Only privileged root, or (if MNT_USER is set) the user that
907	 * did the original mount is permitted to update it.
908	 */
909	error = vfs_suser(mp, td);
910	if (error != 0) {
911		vput(vp);
912		return (error);
913	}
914	if (vfs_busy(mp, MBF_NOWAIT)) {
915		vput(vp);
916		return (EBUSY);
917	}
918	VI_LOCK(vp);
919	if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) {
920		VI_UNLOCK(vp);
921		vfs_unbusy(mp);
922		vput(vp);
923		return (EBUSY);
924	}
925	vp->v_iflag |= VI_MOUNT;
926	VI_UNLOCK(vp);
927	VOP_UNLOCK(vp, 0);
928
929	MNT_ILOCK(mp);
930	mp->mnt_flag &= ~MNT_UPDATEMASK;
931	mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE |
932	    MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY);
933	if ((mp->mnt_flag & MNT_ASYNC) == 0)
934		mp->mnt_kern_flag &= ~MNTK_ASYNC;
935	MNT_IUNLOCK(mp);
936	mp->mnt_optnew = *optlist;
937	vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
938
939	/*
940	 * Mount the filesystem.
941	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
942	 * get.  No freeing of cn_pnbuf.
943	 */
944	error = VFS_MOUNT(mp);
945
946	export_error = 0;
947	if (error == 0) {
948		/* Process the export option. */
949		if (vfs_copyopt(mp->mnt_optnew, "export", &export,
950		    sizeof(export)) == 0) {
951			export_error = vfs_export(mp, &export);
952		} else if (vfs_copyopt(mp->mnt_optnew, "export", &oexport,
953		    sizeof(oexport)) == 0) {
954			export.ex_flags = oexport.ex_flags;
955			export.ex_root = oexport.ex_root;
956			export.ex_anon = oexport.ex_anon;
957			export.ex_addr = oexport.ex_addr;
958			export.ex_addrlen = oexport.ex_addrlen;
959			export.ex_mask = oexport.ex_mask;
960			export.ex_masklen = oexport.ex_masklen;
961			export.ex_indexfile = oexport.ex_indexfile;
962			export.ex_numsecflavors = 0;
963			export_error = vfs_export(mp, &export);
964		}
965	}
966
967	MNT_ILOCK(mp);
968	if (error == 0) {
969		mp->mnt_flag &=	~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE |
970		    MNT_SNAPSHOT);
971	} else {
972		/*
973		 * If we fail, restore old mount flags. MNT_QUOTA is special,
974		 * because it is not part of MNT_UPDATEMASK, but it could have
975		 * changed in the meantime if quotactl(2) was called.
976		 * All in all we want current value of MNT_QUOTA, not the old
977		 * one.
978		 */
979		mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
980	}
981	if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
982		mp->mnt_kern_flag |= MNTK_ASYNC;
983	else
984		mp->mnt_kern_flag &= ~MNTK_ASYNC;
985	MNT_IUNLOCK(mp);
986
987	if (error != 0)
988		goto end;
989
990	if (mp->mnt_opt != NULL)
991		vfs_freeopts(mp->mnt_opt);
992	mp->mnt_opt = mp->mnt_optnew;
993	*optlist = NULL;
994	(void)VFS_STATFS(mp, &mp->mnt_stat);
995	/*
996	 * Prevent external consumers of mount options from reading
997	 * mnt_optnew.
998	 */
999	mp->mnt_optnew = NULL;
1000
1001	if ((mp->mnt_flag & MNT_RDONLY) == 0)
1002		vfs_allocate_syncvnode(mp);
1003	else
1004		vfs_deallocate_syncvnode(mp);
1005end:
1006	vfs_unbusy(mp);
1007	VI_LOCK(vp);
1008	vp->v_iflag &= ~VI_MOUNT;
1009	VI_UNLOCK(vp);
1010	vrele(vp);
1011	return (error != 0 ? error : export_error);
1012}
1013
1014/*
1015 * vfs_domount(): actually attempt a filesystem mount.
1016 */
1017static int
1018vfs_domount(
1019	struct thread *td,		/* Calling thread. */
1020	const char *fstype,		/* Filesystem type. */
1021	char *fspath,			/* Mount path. */
1022	int fsflags,			/* Flags common to all filesystems. */
1023	struct vfsoptlist **optlist	/* Options local to the filesystem. */
1024	)
1025{
1026	struct vfsconf *vfsp;
1027	struct nameidata nd;
1028	struct vnode *vp;
1029	int error;
1030
1031	/*
1032	 * Be ultra-paranoid about making sure the type and fspath
1033	 * variables will fit in our mp buffers, including the
1034	 * terminating NUL.
1035	 */
1036	if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
1037		return (ENAMETOOLONG);
1038
1039	if (jailed(td->td_ucred) || usermount == 0) {
1040		if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0)
1041			return (error);
1042	}
1043
1044	/*
1045	 * Do not allow NFS export or MNT_SUIDDIR by unprivileged users.
1046	 */
1047	if (fsflags & MNT_EXPORTED) {
1048		error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED);
1049		if (error)
1050			return (error);
1051	}
1052	if (fsflags & MNT_SUIDDIR) {
1053		error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR);
1054		if (error)
1055			return (error);
1056	}
1057	/*
1058	 * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users.
1059	 */
1060	if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) {
1061		if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0)
1062			fsflags |= MNT_NOSUID | MNT_USER;
1063	}
1064
1065	/* Load KLDs before we lock the covered vnode to avoid reversals. */
1066	vfsp = NULL;
1067	if ((fsflags & MNT_UPDATE) == 0) {
1068		/* Don't try to load KLDs if we're mounting the root. */
1069		if (fsflags & MNT_ROOTFS)
1070			vfsp = vfs_byname(fstype);
1071		else
1072			vfsp = vfs_byname_kld(fstype, td, &error);
1073		if (vfsp == NULL)
1074			return (ENODEV);
1075		if (jailed(td->td_ucred) && !(vfsp->vfc_flags & VFCF_JAIL))
1076			return (EPERM);
1077	}
1078
1079	/*
1080	 * Get vnode to be covered or mount point's vnode in case of MNT_UPDATE.
1081	 */
1082	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
1083	    UIO_SYSSPACE, fspath, td);
1084	error = namei(&nd);
1085	if (error != 0)
1086		return (error);
1087	if (!NDHASGIANT(&nd))
1088		mtx_lock(&Giant);
1089	NDFREE(&nd, NDF_ONLY_PNBUF);
1090	vp = nd.ni_vp;
1091	if ((fsflags & MNT_UPDATE) == 0) {
1092		error = vfs_domount_first(td, vfsp, fspath, vp, fsflags,
1093		    optlist);
1094	} else {
1095		error = vfs_domount_update(td, vp, fsflags, optlist);
1096	}
1097	mtx_unlock(&Giant);
1098
1099	ASSERT_VI_UNLOCKED(vp, __func__);
1100	ASSERT_VOP_UNLOCKED(vp, __func__);
1101
1102	return (error);
1103}
1104
1105/*
1106 * Unmount a filesystem.
1107 *
1108 * Note: unmount takes a path to the vnode mounted on as argument, not
1109 * special file (as before).
1110 */
1111#ifndef _SYS_SYSPROTO_H_
1112struct unmount_args {
1113	char	*path;
1114	int	flags;
1115};
1116#endif
1117/* ARGSUSED */
1118int
1119unmount(td, uap)
1120	struct thread *td;
1121	register struct unmount_args /* {
1122		char *path;
1123		int flags;
1124	} */ *uap;
1125{
1126	struct mount *mp;
1127	char *pathbuf;
1128	int error, id0, id1;
1129
1130	AUDIT_ARG_VALUE(uap->flags);
1131	if (jailed(td->td_ucred) || usermount == 0) {
1132		error = priv_check(td, PRIV_VFS_UNMOUNT);
1133		if (error)
1134			return (error);
1135	}
1136
1137	pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
1138	error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL);
1139	if (error) {
1140		free(pathbuf, M_TEMP);
1141		return (error);
1142	}
1143	mtx_lock(&Giant);
1144	if (uap->flags & MNT_BYFSID) {
1145		AUDIT_ARG_TEXT(pathbuf);
1146		/* Decode the filesystem ID. */
1147		if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) {
1148			mtx_unlock(&Giant);
1149			free(pathbuf, M_TEMP);
1150			return (EINVAL);
1151		}
1152
1153		mtx_lock(&mountlist_mtx);
1154		TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
1155			if (mp->mnt_stat.f_fsid.val[0] == id0 &&
1156			    mp->mnt_stat.f_fsid.val[1] == id1)
1157				break;
1158		}
1159		mtx_unlock(&mountlist_mtx);
1160	} else {
1161		AUDIT_ARG_UPATH1(td, pathbuf);
1162		mtx_lock(&mountlist_mtx);
1163		TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
1164			if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0)
1165				break;
1166		}
1167		mtx_unlock(&mountlist_mtx);
1168	}
1169	free(pathbuf, M_TEMP);
1170	if (mp == NULL) {
1171		/*
1172		 * Previously we returned ENOENT for a nonexistent path and
1173		 * EINVAL for a non-mountpoint.  We cannot tell these apart
1174		 * now, so in the !MNT_BYFSID case return the more likely
1175		 * EINVAL for compatibility.
1176		 */
1177		mtx_unlock(&Giant);
1178		return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL);
1179	}
1180
1181	/*
1182	 * Don't allow unmounting the root filesystem.
1183	 */
1184	if (mp->mnt_flag & MNT_ROOTFS) {
1185		mtx_unlock(&Giant);
1186		return (EINVAL);
1187	}
1188	error = dounmount(mp, uap->flags, td);
1189	mtx_unlock(&Giant);
1190	return (error);
1191}
1192
1193/*
1194 * Do the actual filesystem unmount.
1195 */
1196int
1197dounmount(mp, flags, td)
1198	struct mount *mp;
1199	int flags;
1200	struct thread *td;
1201{
1202	struct vnode *coveredvp, *fsrootvp;
1203	int error;
1204	int async_flag;
1205	int mnt_gen_r;
1206
1207	mtx_assert(&Giant, MA_OWNED);
1208
1209	if ((coveredvp = mp->mnt_vnodecovered) != NULL) {
1210		mnt_gen_r = mp->mnt_gen;
1211		VI_LOCK(coveredvp);
1212		vholdl(coveredvp);
1213		vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY);
1214		vdrop(coveredvp);
1215		/*
1216		 * Check for mp being unmounted while waiting for the
1217		 * covered vnode lock.
1218		 */
1219		if (coveredvp->v_mountedhere != mp ||
1220		    coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) {
1221			VOP_UNLOCK(coveredvp, 0);
1222			return (EBUSY);
1223		}
1224	}
1225	/*
1226	 * Only privileged root, or (if MNT_USER is set) the user that did the
1227	 * original mount is permitted to unmount this filesystem.
1228	 */
1229	error = vfs_suser(mp, td);
1230	if (error) {
1231		if (coveredvp)
1232			VOP_UNLOCK(coveredvp, 0);
1233		return (error);
1234	}
1235
1236	MNT_ILOCK(mp);
1237	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
1238		MNT_IUNLOCK(mp);
1239		if (coveredvp)
1240			VOP_UNLOCK(coveredvp, 0);
1241		return (EBUSY);
1242	}
1243	mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_NOINSMNTQ;
1244	/* Allow filesystems to detect that a forced unmount is in progress. */
1245	if (flags & MNT_FORCE)
1246		mp->mnt_kern_flag |= MNTK_UNMOUNTF;
1247	error = 0;
1248	if (mp->mnt_lockref) {
1249		if ((flags & MNT_FORCE) == 0) {
1250			mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_NOINSMNTQ |
1251			    MNTK_UNMOUNTF);
1252			if (mp->mnt_kern_flag & MNTK_MWAIT) {
1253				mp->mnt_kern_flag &= ~MNTK_MWAIT;
1254				wakeup(mp);
1255			}
1256			MNT_IUNLOCK(mp);
1257			if (coveredvp)
1258				VOP_UNLOCK(coveredvp, 0);
1259			return (EBUSY);
1260		}
1261		mp->mnt_kern_flag |= MNTK_DRAINING;
1262		error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS,
1263		    "mount drain", 0);
1264	}
1265	MNT_IUNLOCK(mp);
1266	KASSERT(mp->mnt_lockref == 0,
1267	    ("%s: invalid lock refcount in the drain path @ %s:%d",
1268	    __func__, __FILE__, __LINE__));
1269	KASSERT(error == 0,
1270	    ("%s: invalid return value for msleep in the drain path @ %s:%d",
1271	    __func__, __FILE__, __LINE__));
1272	vn_start_write(NULL, &mp, V_WAIT);
1273
1274	if (mp->mnt_flag & MNT_EXPUBLIC)
1275		vfs_setpublicfs(NULL, NULL, NULL);
1276
1277	vfs_msync(mp, MNT_WAIT);
1278	MNT_ILOCK(mp);
1279	async_flag = mp->mnt_flag & MNT_ASYNC;
1280	mp->mnt_flag &= ~MNT_ASYNC;
1281	mp->mnt_kern_flag &= ~MNTK_ASYNC;
1282	MNT_IUNLOCK(mp);
1283	cache_purgevfs(mp);	/* remove cache entries for this file sys */
1284	vfs_deallocate_syncvnode(mp);
1285	/*
1286	 * For forced unmounts, move process cdir/rdir refs on the fs root
1287	 * vnode to the covered vnode.  For non-forced unmounts we want
1288	 * such references to cause an EBUSY error.
1289	 */
1290	if ((flags & MNT_FORCE) &&
1291	    VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) {
1292		if (mp->mnt_vnodecovered != NULL)
1293			mountcheckdirs(fsrootvp, mp->mnt_vnodecovered);
1294		if (fsrootvp == rootvnode) {
1295			vrele(rootvnode);
1296			rootvnode = NULL;
1297		}
1298		vput(fsrootvp);
1299	}
1300	if (((mp->mnt_flag & MNT_RDONLY) ||
1301	     (error = VFS_SYNC(mp, MNT_WAIT)) == 0) || (flags & MNT_FORCE) != 0)
1302		error = VFS_UNMOUNT(mp, flags);
1303	vn_finished_write(mp);
1304	/*
1305	 * If we failed to flush the dirty blocks for this mount point,
1306	 * undo all the cdir/rdir and rootvnode changes we made above.
1307	 * Unless we failed to do so because the device is reporting that
1308	 * it doesn't exist anymore.
1309	 */
1310	if (error && error != ENXIO) {
1311		if ((flags & MNT_FORCE) &&
1312		    VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) {
1313			if (mp->mnt_vnodecovered != NULL)
1314				mountcheckdirs(mp->mnt_vnodecovered, fsrootvp);
1315			if (rootvnode == NULL) {
1316				rootvnode = fsrootvp;
1317				vref(rootvnode);
1318			}
1319			vput(fsrootvp);
1320		}
1321		MNT_ILOCK(mp);
1322		mp->mnt_kern_flag &= ~MNTK_NOINSMNTQ;
1323		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
1324			MNT_IUNLOCK(mp);
1325			vfs_allocate_syncvnode(mp);
1326			MNT_ILOCK(mp);
1327		}
1328		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
1329		mp->mnt_flag |= async_flag;
1330		if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
1331			mp->mnt_kern_flag |= MNTK_ASYNC;
1332		if (mp->mnt_kern_flag & MNTK_MWAIT) {
1333			mp->mnt_kern_flag &= ~MNTK_MWAIT;
1334			wakeup(mp);
1335		}
1336		MNT_IUNLOCK(mp);
1337		if (coveredvp)
1338			VOP_UNLOCK(coveredvp, 0);
1339		return (error);
1340	}
1341	mtx_lock(&mountlist_mtx);
1342	TAILQ_REMOVE(&mountlist, mp, mnt_list);
1343	mtx_unlock(&mountlist_mtx);
1344	if (coveredvp != NULL) {
1345		coveredvp->v_mountedhere = NULL;
1346		vput(coveredvp);
1347	}
1348	vfs_event_signal(NULL, VQ_UNMOUNT, 0);
1349	vfs_mount_destroy(mp);
1350	return (0);
1351}
1352
1353/*
1354 * Report errors during filesystem mounting.
1355 */
1356void
1357vfs_mount_error(struct mount *mp, const char *fmt, ...)
1358{
1359	struct vfsoptlist *moptlist = mp->mnt_optnew;
1360	va_list ap;
1361	int error, len;
1362	char *errmsg;
1363
1364	error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len);
1365	if (error || errmsg == NULL || len <= 0)
1366		return;
1367
1368	va_start(ap, fmt);
1369	vsnprintf(errmsg, (size_t)len, fmt, ap);
1370	va_end(ap);
1371}
1372
/*
 * Format a printf-style message into the value buffer of the "errmsg"
 * option in opts.  Nothing is written when that option is absent, has a
 * NULL value, or has a non-positive length.
 */
void
vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...)
{
	va_list args;
	char *msgbuf;
	int msglen;

	if (vfs_getopt(opts, "errmsg", (void **)&msgbuf, &msglen) != 0)
		return;
	if (msgbuf == NULL || msglen <= 0)
		return;

	va_start(args, fmt);
	vsnprintf(msgbuf, (size_t)msglen, fmt, args);
	va_end(args);
}
1388
1389/*
1390 * ---------------------------------------------------------------------
1391 * Functions for querying mount options/arguments from filesystems.
1392 */
1393
1394/*
1395 * Check that no unknown options are given
1396 */
1397int
1398vfs_filteropt(struct vfsoptlist *opts, const char **legal)
1399{
1400	struct vfsopt *opt;
1401	char errmsg[255];
1402	const char **t, *p, *q;
1403	int ret = 0;
1404
1405	TAILQ_FOREACH(opt, opts, link) {
1406		p = opt->name;
1407		q = NULL;
1408		if (p[0] == 'n' && p[1] == 'o')
1409			q = p + 2;
1410		for(t = global_opts; *t != NULL; t++) {
1411			if (strcmp(*t, p) == 0)
1412				break;
1413			if (q != NULL) {
1414				if (strcmp(*t, q) == 0)
1415					break;
1416			}
1417		}
1418		if (*t != NULL)
1419			continue;
1420		for(t = legal; *t != NULL; t++) {
1421			if (strcmp(*t, p) == 0)
1422				break;
1423			if (q != NULL) {
1424				if (strcmp(*t, q) == 0)
1425					break;
1426			}
1427		}
1428		if (*t != NULL)
1429			continue;
1430		snprintf(errmsg, sizeof(errmsg),
1431		    "mount option <%s> is unknown", p);
1432		ret = EINVAL;
1433	}
1434	if (ret != 0) {
1435		TAILQ_FOREACH(opt, opts, link) {
1436			if (strcmp(opt->name, "errmsg") == 0) {
1437				strncpy((char *)opt->value, errmsg, opt->len);
1438				break;
1439			}
1440		}
1441		if (opt == NULL)
1442			printf("%s\n", errmsg);
1443	}
1444	return (ret);
1445}
1446
1447/*
1448 * Get a mount option by its name.
1449 *
1450 * Return 0 if the option was found, ENOENT otherwise.
1451 * If len is non-NULL it will be filled with the length
1452 * of the option. If buf is non-NULL, it will be filled
1453 * with the address of the option.
1454 */
1455int
1456vfs_getopt(opts, name, buf, len)
1457	struct vfsoptlist *opts;
1458	const char *name;
1459	void **buf;
1460	int *len;
1461{
1462	struct vfsopt *opt;
1463
1464	KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
1465
1466	TAILQ_FOREACH(opt, opts, link) {
1467		if (strcmp(name, opt->name) == 0) {
1468			opt->seen = 1;
1469			if (len != NULL)
1470				*len = opt->len;
1471			if (buf != NULL)
1472				*buf = opt->value;
1473			return (0);
1474		}
1475	}
1476	return (ENOENT);
1477}
1478
1479int
1480vfs_getopt_pos(struct vfsoptlist *opts, const char *name)
1481{
1482	struct vfsopt *opt;
1483
1484	if (opts == NULL)
1485		return (-1);
1486
1487	TAILQ_FOREACH(opt, opts, link) {
1488		if (strcmp(name, opt->name) == 0) {
1489			opt->seen = 1;
1490			return (opt->pos);
1491		}
1492	}
1493	return (-1);
1494}
1495
1496char *
1497vfs_getopts(struct vfsoptlist *opts, const char *name, int *error)
1498{
1499	struct vfsopt *opt;
1500
1501	*error = 0;
1502	TAILQ_FOREACH(opt, opts, link) {
1503		if (strcmp(name, opt->name) != 0)
1504			continue;
1505		opt->seen = 1;
1506		if (opt->len == 0 ||
1507		    ((char *)opt->value)[opt->len - 1] != '\0') {
1508			*error = EINVAL;
1509			return (NULL);
1510		}
1511		return (opt->value);
1512	}
1513	*error = ENOENT;
1514	return (NULL);
1515}
1516
1517int
1518vfs_flagopt(struct vfsoptlist *opts, const char *name, u_int *w, u_int val)
1519{
1520	struct vfsopt *opt;
1521
1522	TAILQ_FOREACH(opt, opts, link) {
1523		if (strcmp(name, opt->name) == 0) {
1524			opt->seen = 1;
1525			if (w != NULL)
1526				*w |= val;
1527			return (1);
1528		}
1529	}
1530	if (w != NULL)
1531		*w &= ~val;
1532	return (0);
1533}
1534
1535int
1536vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...)
1537{
1538	va_list ap;
1539	struct vfsopt *opt;
1540	int ret;
1541
1542	KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
1543
1544	TAILQ_FOREACH(opt, opts, link) {
1545		if (strcmp(name, opt->name) != 0)
1546			continue;
1547		opt->seen = 1;
1548		if (opt->len == 0 || opt->value == NULL)
1549			return (0);
1550		if (((char *)opt->value)[opt->len - 1] != '\0')
1551			return (0);
1552		va_start(ap, fmt);
1553		ret = vsscanf(opt->value, fmt, ap);
1554		va_end(ap);
1555		return (ret);
1556	}
1557	return (0);
1558}
1559
1560int
1561vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len)
1562{
1563	struct vfsopt *opt;
1564
1565	TAILQ_FOREACH(opt, opts, link) {
1566		if (strcmp(name, opt->name) != 0)
1567			continue;
1568		opt->seen = 1;
1569		if (opt->value == NULL)
1570			opt->len = len;
1571		else {
1572			if (opt->len != len)
1573				return (EINVAL);
1574			bcopy(value, opt->value, len);
1575		}
1576		return (0);
1577	}
1578	return (ENOENT);
1579}
1580
1581int
1582vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len)
1583{
1584	struct vfsopt *opt;
1585
1586	TAILQ_FOREACH(opt, opts, link) {
1587		if (strcmp(name, opt->name) != 0)
1588			continue;
1589		opt->seen = 1;
1590		if (opt->value == NULL)
1591			opt->len = len;
1592		else {
1593			if (opt->len < len)
1594				return (EINVAL);
1595			opt->len = len;
1596			bcopy(value, opt->value, len);
1597		}
1598		return (0);
1599	}
1600	return (ENOENT);
1601}
1602
1603int
1604vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value)
1605{
1606	struct vfsopt *opt;
1607
1608	TAILQ_FOREACH(opt, opts, link) {
1609		if (strcmp(name, opt->name) != 0)
1610			continue;
1611		opt->seen = 1;
1612		if (opt->value == NULL)
1613			opt->len = strlen(value) + 1;
1614		else if (strlcpy(opt->value, value, opt->len) >= opt->len)
1615			return (EINVAL);
1616		return (0);
1617	}
1618	return (ENOENT);
1619}
1620
1621/*
1622 * Find and copy a mount option.
1623 *
1624 * The size of the buffer has to be specified
1625 * in len, if it is not the same length as the
1626 * mount option, EINVAL is returned.
1627 * Returns ENOENT if the option is not found.
1628 */
1629int
1630vfs_copyopt(opts, name, dest, len)
1631	struct vfsoptlist *opts;
1632	const char *name;
1633	void *dest;
1634	int len;
1635{
1636	struct vfsopt *opt;
1637
1638	KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL"));
1639
1640	TAILQ_FOREACH(opt, opts, link) {
1641		if (strcmp(name, opt->name) == 0) {
1642			opt->seen = 1;
1643			if (len != opt->len)
1644				return (EINVAL);
1645			bcopy(opt->value, dest, opt->len);
1646			return (0);
1647		}
1648	}
1649	return (ENOENT);
1650}
1651
1652/*
1653 * This is a helper function for filesystems to traverse their
1654 * vnodes.  See MNT_VNODE_FOREACH() in sys/mount.h
1655 */
1656
1657struct vnode *
1658__mnt_vnode_next(struct vnode **mvp, struct mount *mp)
1659{
1660	struct vnode *vp;
1661
1662	mtx_assert(MNT_MTX(mp), MA_OWNED);
1663
1664	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
1665	if (should_yield()) {
1666		MNT_IUNLOCK(mp);
1667		kern_yield(-1);
1668		MNT_ILOCK(mp);
1669	}
1670	vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
1671	while (vp != NULL && vp->v_type == VMARKER)
1672		vp = TAILQ_NEXT(vp, v_nmntvnodes);
1673
1674	/* Check if we are done */
1675	if (vp == NULL) {
1676		__mnt_vnode_markerfree(mvp, mp);
1677		return (NULL);
1678	}
1679	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
1680	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
1681	return (vp);
1682}
1683
1684struct vnode *
1685__mnt_vnode_first(struct vnode **mvp, struct mount *mp)
1686{
1687	struct vnode *vp;
1688
1689	mtx_assert(MNT_MTX(mp), MA_OWNED);
1690
1691	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
1692	while (vp != NULL && vp->v_type == VMARKER)
1693		vp = TAILQ_NEXT(vp, v_nmntvnodes);
1694
1695	/* Check if we are done */
1696	if (vp == NULL) {
1697		*mvp = NULL;
1698		return (NULL);
1699	}
1700	MNT_REF(mp);
1701	MNT_IUNLOCK(mp);
1702	*mvp = (struct vnode *) malloc(sizeof(struct vnode),
1703				       M_VNODE_MARKER,
1704				       M_WAITOK | M_ZERO);
1705	MNT_ILOCK(mp);
1706	(*mvp)->v_type = VMARKER;
1707
1708	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
1709	while (vp != NULL && vp->v_type == VMARKER)
1710		vp = TAILQ_NEXT(vp, v_nmntvnodes);
1711
1712	/* Check if we are done */
1713	if (vp == NULL) {
1714		MNT_IUNLOCK(mp);
1715		free(*mvp, M_VNODE_MARKER);
1716		MNT_ILOCK(mp);
1717		*mvp = NULL;
1718		MNT_REL(mp);
1719		return (NULL);
1720	}
1721	(*mvp)->v_mount = mp;
1722	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
1723	return (vp);
1724}
1725
1726
1727void
1728__mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp)
1729{
1730
1731	if (*mvp == NULL)
1732		return;
1733
1734	mtx_assert(MNT_MTX(mp), MA_OWNED);
1735
1736	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
1737	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
1738	MNT_IUNLOCK(mp);
1739	free(*mvp, M_VNODE_MARKER);
1740	MNT_ILOCK(mp);
1741	*mvp = NULL;
1742	MNT_REL(mp);
1743}
1744
1745
1746int
1747__vfs_statfs(struct mount *mp, struct statfs *sbp)
1748{
1749	int error;
1750
1751	error = mp->mnt_op->vfs_statfs(mp, &mp->mnt_stat);
1752	if (sbp != &mp->mnt_stat)
1753		*sbp = mp->mnt_stat;
1754	return (error);
1755}
1756
1757void
1758vfs_mountedfrom(struct mount *mp, const char *from)
1759{
1760
1761	bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname);
1762	strlcpy(mp->mnt_stat.f_mntfromname, from,
1763	    sizeof mp->mnt_stat.f_mntfromname);
1764}
1765
1766/*
1767 * ---------------------------------------------------------------------
1768 * This is the api for building mount args and mounting filesystems from
1769 * inside the kernel.
1770 *
1771 * The API works by accumulation of individual args.  First error is
1772 * latched.
1773 *
1774 * XXX: should be documented in new manpage kernel_mount(9)
1775 */
1776
/*
 * A memory allocation which must be freed when we are done.
 *
 * Only the list linkage is declared here.  The mount_arg*() helpers
 * allocate this header with extra space behind it and use the bytes
 * starting at (maa + 1) as the payload.
 */
struct mntaarg {
	SLIST_ENTRY(mntaarg)	next;
};
1781
/*
 * The header for the mount arguments.
 *
 * v holds the accumulated name/value iovec entries and len counts them
 * (two per argument).  error latches the first failure; once it is
 * non-zero the mount_arg*() helpers stop adding entries.  list tracks the
 * auxiliary allocations to release in free_mntarg().
 */
struct mntarg {
	struct iovec *v;
	int len;
	int error;
	SLIST_HEAD(, mntaarg)	list;
};
1789
/*
 * Add a boolean argument.
 *
 * flag is the boolean value.
 * name must start with "no".
 *
 * When flag is non-zero the leading "no" is skipped, so the option is
 * added under the remainder of name; when flag is zero the full name is
 * used.  The option is added without a value.
 */
struct mntarg *
mount_argb(struct mntarg *ma, int flag, const char *name)
{
	const char *optname;

	KASSERT(name[0] == 'n' && name[1] == 'o',
	    ("mount_argb(...,%s): name must start with 'no'", name));

	optname = name;
	if (flag)
		optname += 2;
	return (mount_arg(ma, optname, NULL, 0));
}
1805
1806/*
1807 * Add an argument printf style
1808 */
1809struct mntarg *
1810mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...)
1811{
1812	va_list ap;
1813	struct mntaarg *maa;
1814	struct sbuf *sb;
1815	int len;
1816
1817	if (ma == NULL) {
1818		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
1819		SLIST_INIT(&ma->list);
1820	}
1821	if (ma->error)
1822		return (ma);
1823
1824	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
1825	    M_MOUNT, M_WAITOK);
1826	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
1827	ma->v[ma->len].iov_len = strlen(name) + 1;
1828	ma->len++;
1829
1830	sb = sbuf_new_auto();
1831	va_start(ap, fmt);
1832	sbuf_vprintf(sb, fmt, ap);
1833	va_end(ap);
1834	sbuf_finish(sb);
1835	len = sbuf_len(sb) + 1;
1836	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
1837	SLIST_INSERT_HEAD(&ma->list, maa, next);
1838	bcopy(sbuf_data(sb), maa + 1, len);
1839	sbuf_delete(sb);
1840
1841	ma->v[ma->len].iov_base = maa + 1;
1842	ma->v[ma->len].iov_len = len;
1843	ma->len++;
1844
1845	return (ma);
1846}
1847
1848/*
1849 * Add an argument which is a userland string.
1850 */
1851struct mntarg *
1852mount_argsu(struct mntarg *ma, const char *name, const void *val, int len)
1853{
1854	struct mntaarg *maa;
1855	char *tbuf;
1856
1857	if (val == NULL)
1858		return (ma);
1859	if (ma == NULL) {
1860		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
1861		SLIST_INIT(&ma->list);
1862	}
1863	if (ma->error)
1864		return (ma);
1865	maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
1866	SLIST_INSERT_HEAD(&ma->list, maa, next);
1867	tbuf = (void *)(maa + 1);
1868	ma->error = copyinstr(val, tbuf, len, NULL);
1869	return (mount_arg(ma, name, tbuf, -1));
1870}
1871
1872/*
1873 * Plain argument.
1874 *
1875 * If length is -1, treat value as a C string.
1876 */
1877struct mntarg *
1878mount_arg(struct mntarg *ma, const char *name, const void *val, int len)
1879{
1880
1881	if (ma == NULL) {
1882		ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
1883		SLIST_INIT(&ma->list);
1884	}
1885	if (ma->error)
1886		return (ma);
1887
1888	ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
1889	    M_MOUNT, M_WAITOK);
1890	ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
1891	ma->v[ma->len].iov_len = strlen(name) + 1;
1892	ma->len++;
1893
1894	ma->v[ma->len].iov_base = (void *)(uintptr_t)val;
1895	if (len < 0)
1896		ma->v[ma->len].iov_len = strlen(val) + 1;
1897	else
1898		ma->v[ma->len].iov_len = len;
1899	ma->len++;
1900	return (ma);
1901}
1902
1903/*
1904 * Free a mntarg structure
1905 */
1906static void
1907free_mntarg(struct mntarg *ma)
1908{
1909	struct mntaarg *maa;
1910
1911	while (!SLIST_EMPTY(&ma->list)) {
1912		maa = SLIST_FIRST(&ma->list);
1913		SLIST_REMOVE_HEAD(&ma->list, next);
1914		free(maa, M_MOUNT);
1915	}
1916	free(ma->v, M_MOUNT);
1917	free(ma, M_MOUNT);
1918}
1919
1920/*
1921 * Mount a filesystem
1922 */
1923int
1924kernel_mount(struct mntarg *ma, int flags)
1925{
1926	struct uio auio;
1927	int error;
1928
1929	KASSERT(ma != NULL, ("kernel_mount NULL ma"));
1930	KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v"));
1931	KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len));
1932
1933	auio.uio_iov = ma->v;
1934	auio.uio_iovcnt = ma->len;
1935	auio.uio_segflg = UIO_SYSSPACE;
1936
1937	error = ma->error;
1938	if (!error)
1939		error = vfs_donmount(curthread, flags, &auio);
1940	free_mntarg(ma);
1941	return (error);
1942}
1943
/*
 * A printflike function to mount a filesystem.
 *
 * The variable arguments are (const char *name, const void *value) pairs
 * terminated by a NULL name.  A non-NULL value is treated as a C string;
 * a NULL value adds the option with no value.  The strings are referenced,
 * not copied, so they must stay valid for the duration of the call.
 *
 * Returns EINVAL if no option was supplied at all, otherwise the result
 * of kernel_mount().
 */
int
kernel_vmount(int flags, ...)
{
	struct mntarg *ma = NULL;
	va_list ap;
	const char *cp;
	const void *vp;

	va_start(ap, flags);
	while ((cp = va_arg(ap, const char *)) != NULL) {
		vp = va_arg(ap, const void *);
		ma = mount_arg(ma, cp, vp, (vp != NULL ? -1 : 0));
	}
	va_end(ap);

	/*
	 * With an empty argument list nothing was allocated; kernel_mount()
	 * would dereference the NULL pointer, so reject the call here.
	 */
	if (ma == NULL)
		return (EINVAL);

	return (kernel_mount(ma, flags));
}
1969
1970void
1971vfs_oexport_conv(const struct oexport_args *oexp, struct export_args *exp)
1972{
1973
1974	bcopy(oexp, exp, sizeof(*oexp));
1975	exp->ex_numsecflavors = 0;
1976}
1977