/*
 * Copyright (c) 2006-2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/cred.h>
#include <sys/vfs.h>
#include <sys/priv.h>
#include <sys/libkern.h>

#include <sys/mutex.h>
#include <sys/vnode.h>
#include <sys/taskq.h>

#include <sys/ccompat.h>

MALLOC_DECLARE(M_MOUNT);

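/*
 * Append the named mount option (with an optional value) to the mount's
 * option list.  The list is allocated lazily on first use.  The mount
 * interlock is acquired here unless the caller already holds it, in which
 * case it is left held on return.
 */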
void
vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg,
    int flags __unused)
{
	struct vfsopt *opt;
	size_t namesize;
	int locked;

	if (!(locked = mtx_owned(MNT_MTX(vfsp))))
		MNT_ILOCK(vfsp);

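	/*
	 * Lazily allocate the option list: drop the interlock around the
	 * M_WAITOK allocation and recheck afterwards, since another thread
	 * may have installed a list while we slept.
	 */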
	if (vfsp->mnt_opt == NULL) {
		void *opts;

		MNT_IUNLOCK(vfsp);
		opts = malloc(sizeof (*vfsp->mnt_opt), M_MOUNT, M_WAITOK);
		MNT_ILOCK(vfsp);
		if (vfsp->mnt_opt == NULL) {
			vfsp->mnt_opt = opts;
			TAILQ_INIT(vfsp->mnt_opt);
		} else {
			free(opts, M_MOUNT);
		}
	}

	MNT_IUNLOCK(vfsp);

	opt = malloc(sizeof (*opt), M_MOUNT, M_WAITOK);
	namesize = strlen(name) + 1;
	opt->name = malloc(namesize, M_MOUNT, M_WAITOK);
	strlcpy(opt->name, name, namesize);
	opt->pos = -1;
	opt->seen = 1;
	if (arg == NULL) {
		opt->value = NULL;
		opt->len = 0;
	} else {
		opt->len = strlen(arg) + 1;
		opt->value = malloc(opt->len, M_MOUNT, M_WAITOK);
		bcopy(arg, opt->value, opt->len);
	}

	MNT_ILOCK(vfsp);
	TAILQ_INSERT_TAIL(vfsp->mnt_opt, opt, link);
	if (!locked)
		MNT_IUNLOCK(vfsp);
}

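/*
 * Remove the named option from the mount's option list, taking the mount
 * interlock if the caller does not already hold it.
 */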
void
vfs_clearmntopt(vfs_t *vfsp, const char *name)
{
	int locked;

	if (!(locked = mtx_owned(MNT_MTX(vfsp))))
		MNT_ILOCK(vfsp);
	vfs_deleteopt(vfsp->mnt_opt, name);
	if (!locked)
		MNT_IUNLOCK(vfsp);
}

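/*
 * Return non-zero if the named option is present in the mount's pending
 * (mnt_optnew) option list.  If argp is non-NULL, it is set to point at
 * the option's value.
 */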
int
vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp)
{
	struct vfsoptlist *opts = vfsp->mnt_optnew;
	int error;

	if (opts == NULL)
		return (0);
	error = vfs_getopt(opts, opt, (void **)argp, NULL);
	return (error != 0 ? 0 : 1);
}

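/*
 * Mount a snapshot of type 'fstype', backed by 'fspec', on the directory
 * vnode *vpp, which must be exclusively locked on entry.  The mount is
 * forced read-only, nosuid and ignored (hidden from regular mount(8) and
 * df(1) output), and VFS_MOUNT() is performed with kernel credentials.
 * On success *vpp is replaced with the exclusively locked root vnode of
 * the new mount; on failure the covered vnode is released and an error
 * number is returned.  This is how the .zfs/snapshot control directory
 * auto-mounts snapshots on lookup.
 */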
int
mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
    char *fspec, int fsflags)
{
	struct vfsconf *vfsp;
	struct mount *mp;
	vnode_t *vp, *mvp;
	struct ucred *cr;
	int error;

	ASSERT_VOP_ELOCKED(*vpp, "mount_snapshot");

	vp = *vpp;
	*vpp = NULL;
	error = 0;

	/*
	 * Be ultra-paranoid about making sure the type and fspath
	 * variables will fit in our mp buffers, including the
	 * terminating NUL.
	 */
	if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
		error = ENAMETOOLONG;
	if (error == 0 && (vfsp = vfs_byname_kld(fstype, td, &error)) == NULL)
		error = ENODEV;
	if (error == 0 && vp->v_type != VDIR)
		error = ENOTDIR;
	/*
	 * We need the vnode lock to protect v_mountedhere and the vnode
	 * interlock to protect v_iflag.
	 */
	if (error == 0) {
		VI_LOCK(vp);
		if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL)
			vp->v_iflag |= VI_MOUNT;
		else
			error = EBUSY;
		VI_UNLOCK(vp);
	}
	if (error != 0) {
		vput(vp);
		return (error);
	}
	vn_seqc_write_begin(vp);
	VOP_UNLOCK1(vp);

	/*
	 * Allocate and initialize the filesystem.
	 * We don't want the regular user that triggered the snapshot mount
	 * to be able to unmount it, so we pass the credentials of the
	 * parent mount.
	 */
	mp = vfs_mount_alloc(vp, vfsp, fspath, vp->v_mount->mnt_cred);

	mp->mnt_optnew = NULL;
	vfs_setmntopt(mp, "from", fspec, 0);
	mp->mnt_optnew = mp->mnt_opt;
	mp->mnt_opt = NULL;

	/*
	 * Set the mount level flags.
	 */
	mp->mnt_flag = fsflags & MNT_UPDATEMASK;
	/*
	 * Snapshots are always read-only.
	 */
	mp->mnt_flag |= MNT_RDONLY;
	/*
	 * We don't want snapshots to allow access to vulnerable setuid
	 * programs, so we turn off setuid when mounting snapshots.
	 */
	mp->mnt_flag |= MNT_NOSUID;
	/*
	 * We don't want snapshots to be visible in regular
	 * mount(8) and df(1) output.
	 */
	mp->mnt_flag |= MNT_IGNORE;
	/*
	 * XXX: This is evil, but we can't mount a snapshot as a regular user.
	 * XXX: Is it safe when a snapshot is mounted from within a jail?
	 */
	cr = td->td_ucred;
	td->td_ucred = kcred;
	error = VFS_MOUNT(mp);
	td->td_ucred = cr;

	if (error != 0) {
		/*
		 * Clear VI_MOUNT and decrement the use count "atomically",
		 * under the vnode lock.  This is not strictly required,
		 * but makes it easier to reason about the life-cycle and
		 * ownership of the covered vnode.
		 */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		VI_LOCK(vp);
		vp->v_iflag &= ~VI_MOUNT;
		VI_UNLOCK(vp);
		vn_seqc_write_end(vp);
		vput(vp);
		vfs_unbusy(mp);
		vfs_freeopts(mp->mnt_optnew);
		mp->mnt_vnodecovered = NULL;
		vfs_mount_destroy(mp);
		return (error);
	}

	if (mp->mnt_opt != NULL)
		vfs_freeopts(mp->mnt_opt);
	mp->mnt_opt = mp->mnt_optnew;
	(void) VFS_STATFS(mp, &mp->mnt_stat);

	/*
	 * Prevent external consumers of mount options from reading
	 * mnt_optnew.
	 */
	mp->mnt_optnew = NULL;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef FREEBSD_NAMECACHE
	cache_purge(vp);
#endif
	VI_LOCK(vp);
	vp->v_iflag &= ~VI_MOUNT;
#ifdef VIRF_MOUNTPOINT
	vn_irflag_set_locked(vp, VIRF_MOUNTPOINT);
#endif
	vp->v_mountedhere = mp;
	VI_UNLOCK(vp);
	/* Put the new filesystem on the mount list. */
	mtx_lock(&mountlist_mtx);
	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
	mtx_unlock(&mountlist_mtx);
	vfs_event_signal(NULL, VQ_MOUNT, 0);
	if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp))
		panic("mount: lost mount");
	vn_seqc_write_end(vp);
	VOP_UNLOCK1(vp);
#if __FreeBSD_version >= 1300048
	vfs_op_exit(mp);
#endif
	vfs_unbusy(mp);
	*vpp = mvp;
	return (0);
}

/*
 * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
 * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
 * the file system as a result of releasing the vnode. Note that file systems
 * already have to handle the race where the vnode is incremented before the
 * inactive routine is called and does its locking.
 *
 * Warning: Excessive use of this routine can lead to performance problems.
 * This is because taskqs throttle back allocation if too many are created.
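 *
 * A typical consumer is ZFS's deferred zrele path, which hands vnodes whose
 * final release could re-enter the file system to a per-pool taskq rather
 * than calling vrele() from the current context.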
 */
void
vn_rele_async(vnode_t *vp, taskq_t *taskq)
{
	VERIFY(vp->v_usecount > 0);
	if (refcount_release_if_not_last(&vp->v_usecount)) {
#if __FreeBSD_version < 1300045
		vdrop(vp);
#endif
		return;
	}
	VERIFY(taskq_dispatch((taskq_t *)taskq,
	    (task_func_t *)vrele, vp, TQ_SLEEP) != 0);
}
