lofs_vfsops.c revision 3898:c788126f2a20
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <sys/param.h>
29#include <sys/errno.h>
30#include <sys/vfs.h>
31#include <sys/vfs_opreg.h>
32#include <sys/vnode.h>
33#include <sys/uio.h>
34#include <sys/pathname.h>
35#include <sys/kmem.h>
36#include <sys/cred.h>
37#include <sys/statvfs.h>
38#include <sys/fs/lofs_info.h>
39#include <sys/fs/lofs_node.h>
40#include <sys/mount.h>
41#include <sys/mntent.h>
42#include <sys/mkdev.h>
43#include <sys/priv.h>
44#include <sys/sysmacros.h>
45#include <sys/systm.h>
46#include <sys/cmn_err.h>
47#include <sys/policy.h>
48#include <sys/tsol/label.h>
49#include "fs/fs_subr.h"
50
51/*
52 * This is the loadable module wrapper.
53 */
54#include <sys/modctl.h>
55
56static mntopts_t lofs_mntopts;
57
58static int lofsinit(int, char *);
59
60static vfsdef_t vfw = {
61	VFSDEF_VERSION,
62	"lofs",
63	lofsinit,
64	VSW_HASPROTO|VSW_STATS,
65	&lofs_mntopts
66};
67
68/*
69 * LOFS mount options table
70 */
71static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
72static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
73static char *sub_cancel[] = { MNTOPT_LOFS_NOSUB, NULL };
74static char *nosub_cancel[] = { MNTOPT_LOFS_SUB, NULL };
75
76static mntopt_t mntopts[] = {
77/*
78 *	option name		cancel option	default arg	flags
79 *		private data
80 */
81	{ MNTOPT_XATTR,		xattr_cancel,	NULL,		0,
82		(void *)0 },
83	{ MNTOPT_NOXATTR,	noxattr_cancel,	NULL,		0,
84		(void *)0 },
85	{ MNTOPT_LOFS_SUB,	sub_cancel,	NULL,		0,
86		(void *)0 },
87	{ MNTOPT_LOFS_NOSUB,	nosub_cancel,	NULL,		0,
88		(void *)0 },
89};
90
91static mntopts_t lofs_mntopts = {
92	sizeof (mntopts) / sizeof (mntopt_t),
93	mntopts
94};
95
96/*
97 * Module linkage information for the kernel.
98 */
99
100static struct modlfs modlfs = {
101	&mod_fsops, "filesystem for lofs", &vfw
102};
103
104static struct modlinkage modlinkage = {
105	MODREV_1, (void *)&modlfs, NULL
106};
107
108/*
109 * This is the module initialization routine.
110 */
111
112int
113_init(void)
114{
115	int status;
116
117	lofs_subrinit();
118	status = mod_install(&modlinkage);
119	if (status != 0) {
120		/*
121		 * Cleanup previously initialized work.
122		 */
123		lofs_subrfini();
124	}
125
126	return (status);
127}
128
/*
 * Module unload entry point.
 *
 * Unloading lofs is refused for now, because there is a memory leak if
 * the module gets unloaded.  Always returns EBUSY.
 */

int
_fini(void)
{
	return (EBUSY);
}
139
140int
141_info(struct modinfo *modinfop)
142{
143	return (mod_info(&modlinkage, modinfop));
144}
145
146
147static int lofsfstype;
148vfsops_t *lo_vfsops;
149
150/*
151 * lo mount vfsop
152 * Set up mount info record and attach it to vfs struct.
153 */
154/*ARGSUSED*/
155static int
156lo_mount(struct vfs *vfsp,
157	struct vnode *vp,
158	struct mounta *uap,
159	struct cred *cr)
160{
161	int error;
162	struct vnode *srootvp = NULL;	/* the server's root */
163	struct vnode *realrootvp;
164	struct loinfo *li;
165	int nodev;
166
167	nodev = vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL);
168
169	if ((error = secpolicy_fs_mount(cr, vp, vfsp)) != 0)
170		return (EPERM);
171
172	/*
173	 * Loopback devices which get "nodevices" added can be done without
174	 * "nodevices" set because we cannot import devices into a zone
175	 * with loopback.  Note that we have all zone privileges when
176	 * this happens; if not, we'd have gotten "nosuid".
177	 */
178	if (!nodev && vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
179		vfs_setmntopt(vfsp, MNTOPT_DEVICES, NULL, VFS_NODISPLAY);
180
181	mutex_enter(&vp->v_lock);
182	if (!(uap->flags & MS_OVERLAY) &&
183	    (vp->v_count != 1 || (vp->v_flag & VROOT))) {
184		mutex_exit(&vp->v_lock);
185		return (EBUSY);
186	}
187	mutex_exit(&vp->v_lock);
188
189	/*
190	 * Find real root, and make vfs point to real vfs
191	 */
192	if (error = lookupname(uap->spec, (uap->flags & MS_SYSSPACE) ?
193		UIO_SYSSPACE : UIO_USERSPACE, FOLLOW, NULLVPP,
194	    &realrootvp))
195		return (error);
196
197	/*
198	 * Enforce MAC policy if needed.
199	 *
200	 * Loopback mounts must not allow writing up. The dominance test
201	 * is intended to prevent a global zone caller from accidentally
202	 * creating write-up conditions between two labeled zones.
203	 * Local zones can't violate MAC on their own without help from
204	 * the global zone because they can't name a pathname that
205	 * they don't already have.
206	 *
207	 * The special case check for the NET_MAC_AWARE process flag is
208	 * to support the case of the automounter in the global zone. We
209	 * permit automounting of local zone directories such as home
210	 * directories, into the global zone as required by setlabel,
211	 * zonecopy, and saving of desktop sessions. Such mounts are
212	 * trusted not to expose the contents of one zone's directories
213	 * to another by leaking them through the global zone.
214	 */
215	if (is_system_labeled() && crgetzoneid(cr) == GLOBAL_ZONEID) {
216		char	specname[MAXPATHLEN];
217		zone_t	*from_zptr;
218		zone_t	*to_zptr;
219
220		if (vnodetopath(NULL, realrootvp, specname,
221		    sizeof (specname), CRED()) != 0)
222			return (EACCES);
223
224		from_zptr = zone_find_by_path(specname);
225		to_zptr = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
226
227		/*
228		 * Special case for zone devfs: the zone for /dev will
229		 * incorrectly appear as the global zone since it's not
230		 * under the zone rootpath.  So for zone devfs check allow
231		 * read-write mounts.
232		 *
233		 * Second special case for scratch zones used for Live Upgrade:
234		 * this is used to mount the zone's root from /root to /a in
235		 * the scratch zone.  As with the other special case, this
236		 * appears to be outside of the zone because it's not under
237		 * the zone rootpath, which is $ZONEPATH/lu in the scratch
238		 * zone case.
239		 */
240
241		if (from_zptr != to_zptr &&
242		    !(to_zptr->zone_flags & ZF_IS_SCRATCH)) {
243			/*
244			 * We know at this point that the labels aren't equal
245			 * because the zone pointers aren't equal, and zones
246			 * can't share a label.
247			 *
248			 * If the source is the global zone then making
249			 * it available to a local zone must be done in
250			 * read-only mode as the label will become admin_low.
251			 *
252			 * If it is a mount between local zones then if
253			 * the current process is in the global zone and has
254			 * the NET_MAC_AWARE flag, then regular read-write
255			 * access is allowed.  If it's in some other zone, but
256			 * the label on the mount point dominates the original
257			 * source, then allow the mount as read-only
258			 * ("read-down").
259			 */
260			if (from_zptr->zone_id == GLOBAL_ZONEID) {
261				/* make the mount read-only */
262				vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
263			} else { /* cross-zone mount */
264				if (to_zptr->zone_id == GLOBAL_ZONEID &&
265				    /* LINTED: no consequent */
266				    getpflags(NET_MAC_AWARE, cr) != 0) {
267					/* Allow the mount as read-write */
268				} else if (bldominates(
269				    label2bslabel(to_zptr->zone_slabel),
270				    label2bslabel(from_zptr->zone_slabel))) {
271					/* make the mount read-only */
272					vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
273				} else {
274					zone_rele(to_zptr);
275					zone_rele(from_zptr);
276					return (EACCES);
277				}
278			}
279		}
280		zone_rele(to_zptr);
281		zone_rele(from_zptr);
282	}
283
284	/*
285	 * realrootvp may be an AUTOFS node, in which case we
286	 * perform a VOP_ACCESS() to trigger the mount of the
287	 * intended filesystem, so we loopback mount the intended
288	 * filesystem instead of the AUTOFS filesystem.
289	 */
290	(void) VOP_ACCESS(realrootvp, 0, 0, cr);
291
292	/*
293	 * We're interested in the top most filesystem.
294	 * This is specially important when uap->spec is a trigger
295	 * AUTOFS node, since we're really interested in mounting the
296	 * filesystem AUTOFS mounted as result of the VOP_ACCESS()
297	 * call not the AUTOFS node itself.
298	 */
299	if (vn_mountedvfs(realrootvp) != NULL) {
300		if (error = traverse(&realrootvp)) {
301			VN_RELE(realrootvp);
302			return (error);
303		}
304	}
305
306	/*
307	 * Allocate a vfs info struct and attach it
308	 */
309	li = kmem_zalloc(sizeof (struct loinfo), KM_SLEEP);
310	li->li_realvfs = realrootvp->v_vfsp;
311	li->li_mountvfs = vfsp;
312
313	/*
314	 * Set mount flags to be inherited by loopback vfs's
315	 */
316	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
317		li->li_mflag |= VFS_RDONLY;
318	}
319	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
320		li->li_mflag |= (VFS_NOSETUID|VFS_NODEVICES);
321	}
322	if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
323		li->li_mflag |= VFS_NODEVICES;
324	}
325	if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
326		li->li_mflag |= VFS_NOSETUID;
327	}
328	/*
329	 * Permissive flags are added to the "deny" bitmap.
330	 */
331	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
332		li->li_dflag |= VFS_XATTR;
333	}
334	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
335		li->li_dflag |= VFS_NBMAND;
336	}
337
338	/*
339	 * Propagate inheritable mount flags from the real vfs.
340	 */
341	if ((li->li_realvfs->vfs_flag & VFS_RDONLY) &&
342	    !vfs_optionisset(vfsp, MNTOPT_RO, NULL))
343		vfs_setmntopt(vfsp, MNTOPT_RO, NULL,
344		    VFS_NODISPLAY);
345	if ((li->li_realvfs->vfs_flag & VFS_NOSETUID) &&
346	    !vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
347		vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL,
348		    VFS_NODISPLAY);
349	if ((li->li_realvfs->vfs_flag & VFS_NODEVICES) &&
350	    !vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
351		vfs_setmntopt(vfsp, MNTOPT_NODEVICES, NULL,
352		    VFS_NODISPLAY);
353	/*
354	 * Permissive flags such as VFS_XATTR, as opposed to restrictive flags
355	 * such as VFS_RDONLY, are handled differently.  An explicit
356	 * MNTOPT_NOXATTR should override the underlying filesystem's VFS_XATTR.
357	 */
358	if ((li->li_realvfs->vfs_flag & VFS_XATTR) &&
359	    !vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL) &&
360	    !vfs_optionisset(vfsp, MNTOPT_XATTR, NULL))
361		vfs_setmntopt(vfsp, MNTOPT_XATTR, NULL,
362		    VFS_NODISPLAY);
363	if ((li->li_realvfs->vfs_flag & VFS_NBMAND) &&
364	    !vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL) &&
365	    !vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
366		vfs_setmntopt(vfsp, MNTOPT_NBMAND, NULL,
367		    VFS_NODISPLAY);
368
369	li->li_refct = 0;
370	vfsp->vfs_data = (caddr_t)li;
371	vfsp->vfs_bcount = 0;
372	vfsp->vfs_fstype = lofsfstype;
373	vfsp->vfs_bsize = li->li_realvfs->vfs_bsize;
374
375	vfsp->vfs_dev = li->li_realvfs->vfs_dev;
376	vfsp->vfs_fsid.val[0] = li->li_realvfs->vfs_fsid.val[0];
377	vfsp->vfs_fsid.val[1] = li->li_realvfs->vfs_fsid.val[1];
378
379	if (vfs_optionisset(vfsp, MNTOPT_LOFS_NOSUB, NULL)) {
380		li->li_flag |= LO_NOSUB;
381	}
382
383	/*
384	 * Setup the hashtable. If the root of this mount isn't a directory,
385	 * there's no point in allocating a large hashtable. A table with one
386	 * bucket is sufficient.
387	 */
388	if (realrootvp->v_type != VDIR)
389		lsetup(li, 1);
390	else
391		lsetup(li, 0);
392
393	/*
394	 * Make the root vnode
395	 */
396	srootvp = makelonode(realrootvp, li, 0);
397	srootvp->v_flag |= VROOT;
398	li->li_rootvp = srootvp;
399
400#ifdef LODEBUG
401	lo_dprint(4, "lo_mount: vfs %p realvfs %p root %p realroot %p li %p\n",
402	    vfsp, li->li_realvfs, srootvp, realrootvp, li);
403#endif
404	return (0);
405}
406
407/*
408 * Undo loopback mount
409 */
410static int
411lo_unmount(struct vfs *vfsp, int flag, struct cred *cr)
412{
413	struct loinfo *li;
414
415	if (secpolicy_fs_unmount(cr, vfsp) != 0)
416		return (EPERM);
417
418	/*
419	 * Forced unmount is not supported by this file system
420	 * and thus, ENOTSUP, is being returned.
421	 */
422	if (flag & MS_FORCE)
423		return (ENOTSUP);
424
425	li = vtoli(vfsp);
426#ifdef LODEBUG
427	lo_dprint(4, "lo_unmount(%p) li %p\n", vfsp, li);
428#endif
429	if (li->li_refct != 1 || li->li_rootvp->v_count != 1) {
430#ifdef LODEBUG
431		lo_dprint(4, "refct %d v_ct %d\n", li->li_refct,
432		    li->li_rootvp->v_count);
433#endif
434		return (EBUSY);
435	}
436	VN_RELE(li->li_rootvp);
437	return (0);
438}
439
440/*
441 * Find root of lofs mount.
442 */
443static int
444lo_root(struct vfs *vfsp, struct vnode **vpp)
445{
446	*vpp = vtoli(vfsp)->li_rootvp;
447#ifdef LODEBUG
448	lo_dprint(4, "lo_root(0x%p) = %p\n", vfsp, *vpp);
449#endif
450	/*
451	 * If the root of the filesystem is a special file, return the specvp
452	 * version of the vnode. We don't save the specvp vnode in our
453	 * hashtable since that's exclusively for lnodes.
454	 */
455	if (IS_DEVVP(*vpp)) {
456		struct vnode *svp;
457
458		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, kcred);
459		if (svp == NULL)
460			return (ENOSYS);
461		*vpp = svp;
462	} else {
463		VN_HOLD(*vpp);
464	}
465
466	return (0);
467}
468
469/*
470 * Get file system statistics.
471 */
472static int
473lo_statvfs(register struct vfs *vfsp, struct statvfs64 *sbp)
474{
475	vnode_t *realrootvp;
476
477#ifdef LODEBUG
478	lo_dprint(4, "lostatvfs %p\n", vfsp);
479#endif
480	/*
481	 * Using realrootvp->v_vfsp (instead of the realvfsp that was
482	 * cached) is necessary to make lofs work woth forced UFS unmounts.
483	 * In the case of a forced unmount, UFS stores a set of dummy vfsops
484	 * in all the (i)vnodes in the filesystem. The dummy ops simply
485	 * returns back EIO.
486	 */
487	(void) lo_realvfs(vfsp, &realrootvp);
488	if (realrootvp != NULL)
489		return (VFS_STATVFS(realrootvp->v_vfsp, sbp));
490	else
491		return (EIO);
492}
493
/*
 * Sync vfsop.
 *
 * LOFS has no data or metadata of its own to flush; pending I/O on the
 * underlying filesystem is flushed when that filesystem is synched.
 * Always returns 0.
 */
/* ARGSUSED */
static int
lo_sync(struct vfs *vfsp, short flag, struct cred *cr)
{
#ifdef LODEBUG
	lo_dprint(4, "lo_sync: %p\n", vfsp);
#endif
	return (0);
}
509
510/*
511 * Obtain the vnode from the underlying filesystem.
512 */
513static int
514lo_vget(struct vfs *vfsp, struct vnode **vpp, struct fid *fidp)
515{
516	vnode_t *realrootvp;
517
518#ifdef LODEBUG
519	lo_dprint(4, "lo_vget: %p\n", vfsp);
520#endif
521	(void) lo_realvfs(vfsp, &realrootvp);
522	if (realrootvp != NULL)
523		return (VFS_VGET(realrootvp->v_vfsp, vpp, fidp));
524	else
525		return (EIO);
526}
527
528/*
529 * Free mount-specific data.
530 */
531static void
532lo_freevfs(struct vfs *vfsp)
533{
534	struct loinfo *li = vtoli(vfsp);
535
536	ldestroy(li);
537	kmem_free(li, sizeof (struct loinfo));
538}
539
540static int
541lofsinit(int fstyp, char *name)
542{
543	static const fs_operation_def_t lo_vfsops_template[] = {
544		VFSNAME_MOUNT,		{ .vfs_mount = lo_mount },
545		VFSNAME_UNMOUNT,	{ .vfs_unmount = lo_unmount },
546		VFSNAME_ROOT,		{ .vfs_root = lo_root },
547		VFSNAME_STATVFS,	{ .vfs_statvfs = lo_statvfs },
548		VFSNAME_SYNC,		{ .vfs_sync = lo_sync },
549		VFSNAME_VGET,		{ .vfs_vget = lo_vget },
550		VFSNAME_FREEVFS,	{ .vfs_freevfs = lo_freevfs },
551		NULL,			NULL
552	};
553	int error;
554
555	error = vfs_setfsops(fstyp, lo_vfsops_template, &lo_vfsops);
556	if (error != 0) {
557		cmn_err(CE_WARN, "lofsinit: bad vfs ops template");
558		return (error);
559	}
560
561	error = vn_make_ops(name, lo_vnodeops_template, &lo_vnodeops);
562	if (error != 0) {
563		(void) vfs_freevfsops_by_type(fstyp);
564		cmn_err(CE_WARN, "lofsinit: bad vnode ops template");
565		return (error);
566	}
567
568	lofsfstype = fstyp;
569
570	return (0);
571}
572