ext2_vfsops.c revision 322738
1130803Smarcel/*-
2130803Smarcel *  modified for EXT2FS support in Lites 1.1
3130803Smarcel *
4130803Smarcel *  Aug 1995, Godmar Back (gback@cs.utah.edu)
5130803Smarcel *  University of Utah, Department of Computer Science
6130803Smarcel */
7130803Smarcel/*-
8130803Smarcel * Copyright (c) 1989, 1991, 1993, 1994
9130803Smarcel *	The Regents of the University of California.  All rights reserved.
10130803Smarcel *
11130803Smarcel * Redistribution and use in source and binary forms, with or without
12130803Smarcel * modification, are permitted provided that the following conditions
13130803Smarcel * are met:
14130803Smarcel * 1. Redistributions of source code must retain the above copyright
15130803Smarcel *    notice, this list of conditions and the following disclaimer.
16130803Smarcel * 2. Redistributions in binary form must reproduce the above copyright
17130803Smarcel *    notice, this list of conditions and the following disclaimer in the
18130803Smarcel *    documentation and/or other materials provided with the distribution.
19130803Smarcel * 4. Neither the name of the University nor the names of its contributors
20130803Smarcel *    may be used to endorse or promote products derived from this software
21130803Smarcel *    without specific prior written permission.
22130803Smarcel *
23130803Smarcel * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24130803Smarcel * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25130803Smarcel * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26130803Smarcel * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27130803Smarcel * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28130803Smarcel * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29130803Smarcel * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30130803Smarcel * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31130803Smarcel * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32130803Smarcel * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33130803Smarcel * SUCH DAMAGE.
34130803Smarcel *
35 *	@(#)ffs_vfsops.c	8.8 (Berkeley) 4/18/94
36 * $FreeBSD: stable/11/sys/fs/ext2fs/ext2_vfsops.c 322738 2017-08-20 23:05:36Z pfg $
37 */
38
39#include <sys/param.h>
40#include <sys/systm.h>
41#include <sys/namei.h>
42#include <sys/priv.h>
43#include <sys/proc.h>
44#include <sys/kernel.h>
45#include <sys/vnode.h>
46#include <sys/mount.h>
47#include <sys/bio.h>
48#include <sys/buf.h>
49#include <sys/conf.h>
50#include <sys/endian.h>
51#include <sys/fcntl.h>
52#include <sys/malloc.h>
53#include <sys/stat.h>
54#include <sys/mutex.h>
55
56#include <geom/geom.h>
57#include <geom/geom_vfs.h>
58
59#include <fs/ext2fs/ext2_mount.h>
60#include <fs/ext2fs/inode.h>
61
62#include <fs/ext2fs/fs.h>
63#include <fs/ext2fs/ext2fs.h>
64#include <fs/ext2fs/ext2_dinode.h>
65#include <fs/ext2fs/ext2_extern.h>
66
67static int	ext2_flushfiles(struct mount *mp, int flags, struct thread *td);
68static int	ext2_mountfs(struct vnode *, struct mount *);
69static int	ext2_reload(struct mount *mp, struct thread *td);
70static int	ext2_sbupdate(struct ext2mount *, int);
71static int	ext2_cgupdate(struct ext2mount *, int);
72static vfs_unmount_t		ext2_unmount;
73static vfs_root_t		ext2_root;
74static vfs_statfs_t		ext2_statfs;
75static vfs_sync_t		ext2_sync;
76static vfs_vget_t		ext2_vget;
77static vfs_fhtovp_t		ext2_fhtovp;
78static vfs_mount_t		ext2_mount;
79
80MALLOC_DEFINE(M_EXT2NODE, "ext2_node", "EXT2 vnode private part");
81static MALLOC_DEFINE(M_EXT2MNT, "ext2_mount", "EXT2 mount structure");
82
83static struct vfsops ext2fs_vfsops = {
84	.vfs_fhtovp =		ext2_fhtovp,
85	.vfs_mount =		ext2_mount,
86	.vfs_root =		ext2_root,	/* root inode via vget */
87	.vfs_statfs =		ext2_statfs,
88	.vfs_sync =		ext2_sync,
89	.vfs_unmount =		ext2_unmount,
90	.vfs_vget =		ext2_vget,
91};
92
93VFS_SET(ext2fs_vfsops, ext2fs, 0);
94
95static int	ext2_check_sb_compat(struct ext2fs *es, struct cdev *dev,
96		    int ronly);
97static int	compute_sb_data(struct vnode * devvp,
98		    struct ext2fs * es, struct m_ext2fs * fs);
99
100static const char *ext2_opts[] = { "acls", "async", "noatime", "noclusterr",
101    "noclusterw", "noexec", "export", "force", "from", "multilabel",
102    "suiddir", "nosymfollow", "sync", "union", NULL };
103
104/*
105 * VFS Operations.
106 *
107 * mount system call
108 */
109static int
110ext2_mount(struct mount *mp)
111{
112	struct vfsoptlist *opts;
113	struct vnode *devvp;
114	struct thread *td;
115	struct ext2mount *ump = NULL;
116	struct m_ext2fs *fs;
117	struct nameidata nd, *ndp = &nd;
118	accmode_t accmode;
119	char *path, *fspec;
120	int error, flags, len;
121
122	td = curthread;
123	opts = mp->mnt_optnew;
124
125	if (vfs_filteropt(opts, ext2_opts))
126		return (EINVAL);
127
128	vfs_getopt(opts, "fspath", (void **)&path, NULL);
129	/* Double-check the length of path.. */
130	if (strlen(path) >= MAXMNTLEN)
131		return (ENAMETOOLONG);
132
133	fspec = NULL;
134	error = vfs_getopt(opts, "from", (void **)&fspec, &len);
135	if (!error && fspec[len - 1] != '\0')
136		return (EINVAL);
137
138	/*
139	 * If updating, check whether changing from read-only to
140	 * read/write; if there is no device name, that's all we do.
141	 */
142	if (mp->mnt_flag & MNT_UPDATE) {
143		ump = VFSTOEXT2(mp);
144		fs = ump->um_e2fs;
145		error = 0;
146		if (fs->e2fs_ronly == 0 &&
147		    vfs_flagopt(opts, "ro", NULL, 0)) {
148			error = VFS_SYNC(mp, MNT_WAIT);
149			if (error)
150				return (error);
151			flags = WRITECLOSE;
152			if (mp->mnt_flag & MNT_FORCE)
153				flags |= FORCECLOSE;
154			error = ext2_flushfiles(mp, flags, td);
155			if (error == 0 && fs->e2fs_wasvalid && ext2_cgupdate(ump, MNT_WAIT) == 0) {
156				fs->e2fs->e2fs_state |= E2FS_ISCLEAN;
157				ext2_sbupdate(ump, MNT_WAIT);
158			}
159			fs->e2fs_ronly = 1;
160			vfs_flagopt(opts, "ro", &mp->mnt_flag, MNT_RDONLY);
161			g_topology_lock();
162			g_access(ump->um_cp, 0, -1, 0);
163			g_topology_unlock();
164		}
165		if (!error && (mp->mnt_flag & MNT_RELOAD))
166			error = ext2_reload(mp, td);
167		if (error)
168			return (error);
169		devvp = ump->um_devvp;
170		if (fs->e2fs_ronly && !vfs_flagopt(opts, "ro", NULL, 0)) {
171			if (ext2_check_sb_compat(fs->e2fs, devvp->v_rdev, 0))
172				return (EPERM);
173
174			/*
175			 * If upgrade to read-write by non-root, then verify
176			 * that user has necessary permissions on the device.
177			 */
178			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
179			error = VOP_ACCESS(devvp, VREAD | VWRITE,
180			    td->td_ucred, td);
181			if (error)
182				error = priv_check(td, PRIV_VFS_MOUNT_PERM);
183			if (error) {
184				VOP_UNLOCK(devvp, 0);
185				return (error);
186			}
187			VOP_UNLOCK(devvp, 0);
188			g_topology_lock();
189			error = g_access(ump->um_cp, 0, 1, 0);
190			g_topology_unlock();
191			if (error)
192				return (error);
193
194			if ((fs->e2fs->e2fs_state & E2FS_ISCLEAN) == 0 ||
195			    (fs->e2fs->e2fs_state & E2FS_ERRORS)) {
196				if (mp->mnt_flag & MNT_FORCE) {
197					printf(
198"WARNING: %s was not properly dismounted\n", fs->e2fs_fsmnt);
199				} else {
200					printf(
201"WARNING: R/W mount of %s denied.  Filesystem is not clean - run fsck\n",
202					    fs->e2fs_fsmnt);
203					return (EPERM);
204				}
205			}
206			fs->e2fs->e2fs_state &= ~E2FS_ISCLEAN;
207			(void)ext2_cgupdate(ump, MNT_WAIT);
208			fs->e2fs_ronly = 0;
209			MNT_ILOCK(mp);
210			mp->mnt_flag &= ~MNT_RDONLY;
211			MNT_IUNLOCK(mp);
212		}
213		if (vfs_flagopt(opts, "export", NULL, 0)) {
214			/* Process export requests in vfs_mount.c. */
215			return (error);
216		}
217	}
218
219	/*
220	 * Not an update, or updating the name: look up the name
221	 * and verify that it refers to a sensible disk device.
222	 */
223	if (fspec == NULL)
224		return (EINVAL);
225	NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td);
226	if ((error = namei(ndp)) != 0)
227		return (error);
228	NDFREE(ndp, NDF_ONLY_PNBUF);
229	devvp = ndp->ni_vp;
230
231	if (!vn_isdisk(devvp, &error)) {
232		vput(devvp);
233		return (error);
234	}
235
236	/*
237	 * If mount by non-root, then verify that user has necessary
238	 * permissions on the device.
239	 *
240	 * XXXRW: VOP_ACCESS() enough?
241	 */
242	accmode = VREAD;
243	if ((mp->mnt_flag & MNT_RDONLY) == 0)
244		accmode |= VWRITE;
245	error = VOP_ACCESS(devvp, accmode, td->td_ucred, td);
246	if (error)
247		error = priv_check(td, PRIV_VFS_MOUNT_PERM);
248	if (error) {
249		vput(devvp);
250		return (error);
251	}
252
253	if ((mp->mnt_flag & MNT_UPDATE) == 0) {
254		error = ext2_mountfs(devvp, mp);
255	} else {
256		if (devvp != ump->um_devvp) {
257			vput(devvp);
258			return (EINVAL);	/* needs translation */
259		} else
260			vput(devvp);
261	}
262	if (error) {
263		vrele(devvp);
264		return (error);
265	}
266	ump = VFSTOEXT2(mp);
267	fs = ump->um_e2fs;
268
269	/*
270	 * Note that this strncpy() is ok because of a check at the start
271	 * of ext2_mount().
272	 */
273	strncpy(fs->e2fs_fsmnt, path, MAXMNTLEN);
274	fs->e2fs_fsmnt[MAXMNTLEN - 1] = '\0';
275	vfs_mountedfrom(mp, fspec);
276	return (0);
277}
278
279static int
280ext2_check_sb_compat(struct ext2fs *es, struct cdev *dev, int ronly)
281{
282	uint32_t i, mask;
283
284	if (es->e2fs_magic != E2FS_MAGIC) {
285		printf("ext2fs: %s: wrong magic number %#x (expected %#x)\n",
286		    devtoname(dev), es->e2fs_magic, E2FS_MAGIC);
287		return (1);
288	}
289	if (es->e2fs_rev > E2FS_REV0) {
290		mask = es->e2fs_features_incompat & ~(EXT2F_INCOMPAT_SUPP |
291		    EXT4F_RO_INCOMPAT_SUPP);
292		if (mask) {
293			printf("WARNING: mount of %s denied due to "
294			    "unsupported optional features:\n", devtoname(dev));
295			for (i = 0;
296			    i < sizeof(incompat)/sizeof(struct ext2_feature);
297			    i++)
298				if (mask & incompat[i].mask)
299					printf("%s ", incompat[i].name);
300			printf("\n");
301			return (1);
302		}
303		mask = es->e2fs_features_rocompat & ~EXT2F_ROCOMPAT_SUPP;
304		if (!ronly && mask) {
305			printf("WARNING: R/W mount of %s denied due to "
306			    "unsupported optional features:\n", devtoname(dev));
307			for (i = 0;
308			    i < sizeof(ro_compat)/sizeof(struct ext2_feature);
309			    i++)
310				if (mask & ro_compat[i].mask)
311					printf("%s ", ro_compat[i].name);
312			printf("\n");
313			return (1);
314		}
315	}
316	return (0);
317}
318
319/*
320 * This computes the fields of the m_ext2fs structure from the
321 * data in the ext2fs structure read in.
322 */
323static int
324compute_sb_data(struct vnode *devvp, struct ext2fs *es,
325    struct m_ext2fs *fs)
326{
327	int db_count, error;
328	int i;
329	int logic_sb_block = 1;	/* XXX for now */
330	struct buf *bp;
331	uint32_t e2fs_descpb;
332
333	fs->e2fs_bshift = EXT2_MIN_BLOCK_LOG_SIZE + es->e2fs_log_bsize;
334	fs->e2fs_bsize = 1U << fs->e2fs_bshift;
335	fs->e2fs_fsbtodb = es->e2fs_log_bsize + 1;
336	fs->e2fs_qbmask = fs->e2fs_bsize - 1;
337	fs->e2fs_fsize = EXT2_MIN_FRAG_SIZE << es->e2fs_log_fsize;
338	if (fs->e2fs_fsize)
339		fs->e2fs_fpb = fs->e2fs_bsize / fs->e2fs_fsize;
340	fs->e2fs_bpg = es->e2fs_bpg;
341	fs->e2fs_fpg = es->e2fs_fpg;
342	fs->e2fs_ipg = es->e2fs_ipg;
343	if (es->e2fs_rev == E2FS_REV0) {
344		fs->e2fs_isize = E2FS_REV0_INODE_SIZE;
345	} else {
346		fs->e2fs_isize = es->e2fs_inode_size;
347
348		/*
349		 * Simple sanity check for superblock inode size value.
350		 */
351		if (EXT2_INODE_SIZE(fs) < E2FS_REV0_INODE_SIZE ||
352		    EXT2_INODE_SIZE(fs) > fs->e2fs_bsize ||
353		    (fs->e2fs_isize & (fs->e2fs_isize - 1)) != 0) {
354			printf("ext2fs: invalid inode size %d\n",
355			    fs->e2fs_isize);
356			return (EIO);
357		}
358	}
359	/* Check for extra isize in big inodes. */
360	if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_EXTRA_ISIZE) &&
361	    EXT2_INODE_SIZE(fs) < sizeof(struct ext2fs_dinode)) {
362		printf("ext2fs: no space for extra inode timestamps\n");
363		return (EINVAL);
364	}
365	/* Check for group descriptor size */
366	if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT) &&
367	    (es->e3fs_desc_size != sizeof(struct ext2_gd))) {
368		printf("ext2fs: group descriptor size unsupported %d\n",
369		    es->e3fs_desc_size);
370		return (EINVAL);
371	}
372
373	fs->e2fs_ipb = fs->e2fs_bsize / EXT2_INODE_SIZE(fs);
374	fs->e2fs_itpg = fs->e2fs_ipg / fs->e2fs_ipb;
375	/* s_resuid / s_resgid ? */
376	fs->e2fs_gcount = howmany(es->e2fs_bcount - es->e2fs_first_dblock,
377	    EXT2_BLOCKS_PER_GROUP(fs));
378	e2fs_descpb = fs->e2fs_bsize / sizeof(struct ext2_gd);
379	db_count = howmany(fs->e2fs_gcount, e2fs_descpb);
380	fs->e2fs_gdbcount = db_count;
381	fs->e2fs_gd = malloc(db_count * fs->e2fs_bsize,
382	    M_EXT2MNT, M_WAITOK);
383	fs->e2fs_contigdirs = malloc(fs->e2fs_gcount *
384	    sizeof(*fs->e2fs_contigdirs), M_EXT2MNT, M_WAITOK | M_ZERO);
385
386	/*
387	 * Adjust logic_sb_block.
388	 * Godmar thinks: if the blocksize is greater than 1024, then
389	 * the superblock is logically part of block zero.
390	 */
391	if (fs->e2fs_bsize > SBSIZE)
392		logic_sb_block = 0;
393	for (i = 0; i < db_count; i++) {
394		error = bread(devvp,
395		    fsbtodb(fs, logic_sb_block + i + 1),
396		    fs->e2fs_bsize, NOCRED, &bp);
397		if (error) {
398			free(fs->e2fs_contigdirs, M_EXT2MNT);
399			free(fs->e2fs_gd, M_EXT2MNT);
400			brelse(bp);
401			return (error);
402		}
403		e2fs_cgload((struct ext2_gd *)bp->b_data,
404		    &fs->e2fs_gd[
405		    i * fs->e2fs_bsize / sizeof(struct ext2_gd)],
406		    fs->e2fs_bsize);
407		brelse(bp);
408		bp = NULL;
409	}
410	/* Verify cg csum */
411	if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_GDT_CSUM)) {
412		error = ext2_gd_csum_verify(fs, devvp->v_rdev);
413		if (error)
414			return (error);
415	}
416	/* Initialization for the ext2 Orlov allocator variant. */
417	fs->e2fs_total_dir = 0;
418	for (i = 0; i < fs->e2fs_gcount; i++)
419		fs->e2fs_total_dir += fs->e2fs_gd[i].ext2bgd_ndirs;
420
421	if (es->e2fs_rev == E2FS_REV0 ||
422	    !EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_LARGEFILE))
423		fs->e2fs_maxfilesize = 0x7fffffff;
424	else {
425		fs->e2fs_maxfilesize = 0xffffffffffff;
426		if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_HUGE_FILE))
427			fs->e2fs_maxfilesize = 0x7fffffffffffffff;
428	}
429	if (es->e4fs_flags & E2FS_UNSIGNED_HASH) {
430		fs->e2fs_uhash = 3;
431	} else if ((es->e4fs_flags & E2FS_SIGNED_HASH) == 0) {
432#ifdef __CHAR_UNSIGNED__
433		es->e4fs_flags |= E2FS_UNSIGNED_HASH;
434		fs->e2fs_uhash = 3;
435#else
436		es->e4fs_flags |= E2FS_SIGNED_HASH;
437#endif
438	}
439
440	return (0);
441}
442
443/*
444 * Reload all incore data for a filesystem (used after running fsck on
445 * the root filesystem and finding things to fix). The filesystem must
446 * be mounted read-only.
447 *
448 * Things to do to update the mount:
449 *	1) invalidate all cached meta-data.
450 *	2) re-read superblock from disk.
451 *	3) invalidate all cluster summary information.
452 *	4) invalidate all inactive vnodes.
453 *	5) invalidate all cached file data.
454 *	6) re-read inode data for all active vnodes.
455 * XXX we are missing some steps, in particular # 3, this has to be reviewed.
456 */
457static int
458ext2_reload(struct mount *mp, struct thread *td)
459{
460	struct vnode *vp, *mvp, *devvp;
461	struct inode *ip;
462	struct buf *bp;
463	struct ext2fs *es;
464	struct m_ext2fs *fs;
465	struct csum *sump;
466	int error, i;
467	int32_t *lp;
468
469	if ((mp->mnt_flag & MNT_RDONLY) == 0)
470		return (EINVAL);
471	/*
472	 * Step 1: invalidate all cached meta-data.
473	 */
474	devvp = VFSTOEXT2(mp)->um_devvp;
475	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
476	if (vinvalbuf(devvp, 0, 0, 0) != 0)
477		panic("ext2_reload: dirty1");
478	VOP_UNLOCK(devvp, 0);
479
480	/*
481	 * Step 2: re-read superblock from disk.
482	 * constants have been adjusted for ext2
483	 */
484	if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0)
485		return (error);
486	es = (struct ext2fs *)bp->b_data;
487	if (ext2_check_sb_compat(es, devvp->v_rdev, 0) != 0) {
488		brelse(bp);
489		return (EIO);		/* XXX needs translation */
490	}
491	fs = VFSTOEXT2(mp)->um_e2fs;
492	bcopy(bp->b_data, fs->e2fs, sizeof(struct ext2fs));
493
494	if ((error = compute_sb_data(devvp, es, fs)) != 0) {
495		brelse(bp);
496		return (error);
497	}
498#ifdef UNKLAR
499	if (fs->fs_sbsize < SBSIZE)
500		bp->b_flags |= B_INVAL;
501#endif
502	brelse(bp);
503
504	/*
505	 * Step 3: invalidate all cluster summary information.
506	 */
507	if (fs->e2fs_contigsumsize > 0) {
508		lp = fs->e2fs_maxcluster;
509		sump = fs->e2fs_clustersum;
510		for (i = 0; i < fs->e2fs_gcount; i++, sump++) {
511			*lp++ = fs->e2fs_contigsumsize;
512			sump->cs_init = 0;
513			bzero(sump->cs_sum, fs->e2fs_contigsumsize + 1);
514		}
515	}
516
517loop:
518	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
519		/*
520		 * Step 4: invalidate all cached file data.
521		 */
522		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
523			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
524			goto loop;
525		}
526		if (vinvalbuf(vp, 0, 0, 0))
527			panic("ext2_reload: dirty2");
528
529		/*
530		 * Step 5: re-read inode data for all active vnodes.
531		 */
532		ip = VTOI(vp);
533		error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
534		    (int)fs->e2fs_bsize, NOCRED, &bp);
535		if (error) {
536			VOP_UNLOCK(vp, 0);
537			vrele(vp);
538			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
539			return (error);
540		}
541		ext2_ei2i((struct ext2fs_dinode *)((char *)bp->b_data +
542		    EXT2_INODE_SIZE(fs) * ino_to_fsbo(fs, ip->i_number)), ip);
543		brelse(bp);
544		VOP_UNLOCK(vp, 0);
545		vrele(vp);
546	}
547	return (0);
548}
549
550/*
551 * Common code for mount and mountroot.
552 */
553static int
554ext2_mountfs(struct vnode *devvp, struct mount *mp)
555{
556	struct ext2mount *ump;
557	struct buf *bp;
558	struct m_ext2fs *fs;
559	struct ext2fs *es;
560	struct cdev *dev = devvp->v_rdev;
561	struct g_consumer *cp;
562	struct bufobj *bo;
563	struct csum *sump;
564	int error;
565	int ronly;
566	int i;
567	u_long size;
568	int32_t *lp;
569	int32_t e2fs_maxcontig;
570
571	ronly = vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0);
572	/* XXX: use VOP_ACESS to check FS perms */
573	g_topology_lock();
574	error = g_vfs_open(devvp, &cp, "ext2fs", ronly ? 0 : 1);
575	g_topology_unlock();
576	VOP_UNLOCK(devvp, 0);
577	if (error)
578		return (error);
579
580	/* XXX: should we check for some sectorsize or 512 instead? */
581	if (((SBSIZE % cp->provider->sectorsize) != 0) ||
582	    (SBSIZE < cp->provider->sectorsize)) {
583		g_topology_lock();
584		g_vfs_close(cp);
585		g_topology_unlock();
586		return (EINVAL);
587	}
588
589	bo = &devvp->v_bufobj;
590	bo->bo_private = cp;
591	bo->bo_ops = g_vfs_bufops;
592	if (devvp->v_rdev->si_iosize_max != 0)
593		mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
594	if (mp->mnt_iosize_max > MAXPHYS)
595		mp->mnt_iosize_max = MAXPHYS;
596
597	bp = NULL;
598	ump = NULL;
599	if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0)
600		goto out;
601	es = (struct ext2fs *)bp->b_data;
602	if (ext2_check_sb_compat(es, dev, ronly) != 0) {
603		error = EINVAL;		/* XXX needs translation */
604		goto out;
605	}
606	if ((es->e2fs_state & E2FS_ISCLEAN) == 0 ||
607	    (es->e2fs_state & E2FS_ERRORS)) {
608		if (ronly || (mp->mnt_flag & MNT_FORCE)) {
609			printf(
610"WARNING: Filesystem was not properly dismounted\n");
611		} else {
612			printf(
613"WARNING: R/W mount denied.  Filesystem is not clean - run fsck\n");
614			error = EPERM;
615			goto out;
616		}
617	}
618	ump = malloc(sizeof(*ump), M_EXT2MNT, M_WAITOK | M_ZERO);
619
620	/*
621	 * I don't know whether this is the right strategy. Note that
622	 * we dynamically allocate both an m_ext2fs and an ext2fs
623	 * while Linux keeps the super block in a locked buffer.
624	 */
625	ump->um_e2fs = malloc(sizeof(struct m_ext2fs),
626	    M_EXT2MNT, M_WAITOK | M_ZERO);
627	ump->um_e2fs->e2fs = malloc(sizeof(struct ext2fs),
628	    M_EXT2MNT, M_WAITOK);
629	mtx_init(EXT2_MTX(ump), "EXT2FS", "EXT2FS Lock", MTX_DEF);
630	bcopy(es, ump->um_e2fs->e2fs, (u_int)sizeof(struct ext2fs));
631	if ((error = compute_sb_data(devvp, ump->um_e2fs->e2fs, ump->um_e2fs)))
632		goto out;
633
634	/*
635	 * Calculate the maximum contiguous blocks and size of cluster summary
636	 * array.  In FFS this is done by newfs; however, the superblock
637	 * in ext2fs doesn't have these variables, so we can calculate
638	 * them here.
639	 */
640	e2fs_maxcontig = MAX(1, MAXPHYS / ump->um_e2fs->e2fs_bsize);
641	ump->um_e2fs->e2fs_contigsumsize = MIN(e2fs_maxcontig, EXT2_MAXCONTIG);
642	if (ump->um_e2fs->e2fs_contigsumsize > 0) {
643		size = ump->um_e2fs->e2fs_gcount * sizeof(int32_t);
644		ump->um_e2fs->e2fs_maxcluster = malloc(size, M_EXT2MNT, M_WAITOK);
645		size = ump->um_e2fs->e2fs_gcount * sizeof(struct csum);
646		ump->um_e2fs->e2fs_clustersum = malloc(size, M_EXT2MNT, M_WAITOK);
647		lp = ump->um_e2fs->e2fs_maxcluster;
648		sump = ump->um_e2fs->e2fs_clustersum;
649		for (i = 0; i < ump->um_e2fs->e2fs_gcount; i++, sump++) {
650			*lp++ = ump->um_e2fs->e2fs_contigsumsize;
651			sump->cs_init = 0;
652			sump->cs_sum = malloc((ump->um_e2fs->e2fs_contigsumsize + 1) *
653			    sizeof(int32_t), M_EXT2MNT, M_WAITOK | M_ZERO);
654		}
655	}
656
657	brelse(bp);
658	bp = NULL;
659	fs = ump->um_e2fs;
660	fs->e2fs_ronly = ronly;	/* ronly is set according to mnt_flags */
661
662	/*
663	 * If the fs is not mounted read-only, make sure the super block is
664	 * always written back on a sync().
665	 */
666	fs->e2fs_wasvalid = fs->e2fs->e2fs_state & E2FS_ISCLEAN ? 1 : 0;
667	if (ronly == 0) {
668		fs->e2fs_fmod = 1;	/* mark it modified */
669		fs->e2fs->e2fs_state &= ~E2FS_ISCLEAN;	/* set fs invalid */
670	}
671	mp->mnt_data = ump;
672	mp->mnt_stat.f_fsid.val[0] = dev2udev(dev);
673	mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum;
674	mp->mnt_maxsymlinklen = EXT2_MAXSYMLINKLEN;
675	MNT_ILOCK(mp);
676	mp->mnt_flag |= MNT_LOCAL;
677	MNT_IUNLOCK(mp);
678	ump->um_mountp = mp;
679	ump->um_dev = dev;
680	ump->um_devvp = devvp;
681	ump->um_bo = &devvp->v_bufobj;
682	ump->um_cp = cp;
683
684	/*
685	 * Setting those two parameters allowed us to use
686	 * ufs_bmap w/o changse!
687	 */
688	ump->um_nindir = EXT2_ADDR_PER_BLOCK(fs);
689	ump->um_bptrtodb = fs->e2fs->e2fs_log_bsize + 1;
690	ump->um_seqinc = EXT2_FRAGS_PER_BLOCK(fs);
691	if (ronly == 0)
692		ext2_sbupdate(ump, MNT_WAIT);
693	/*
694	 * Initialize filesystem stat information in mount struct.
695	 */
696	MNT_ILOCK(mp);
697	mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED |
698	    MNTK_USES_BCACHE;
699	MNT_IUNLOCK(mp);
700	return (0);
701out:
702	if (bp)
703		brelse(bp);
704	if (cp != NULL) {
705		g_topology_lock();
706		g_vfs_close(cp);
707		g_topology_unlock();
708	}
709	if (ump) {
710		mtx_destroy(EXT2_MTX(ump));
711		free(ump->um_e2fs->e2fs_gd, M_EXT2MNT);
712		free(ump->um_e2fs->e2fs_contigdirs, M_EXT2MNT);
713		free(ump->um_e2fs->e2fs, M_EXT2MNT);
714		free(ump->um_e2fs, M_EXT2MNT);
715		free(ump, M_EXT2MNT);
716		mp->mnt_data = NULL;
717	}
718	return (error);
719}
720
721/*
722 * Unmount system call.
723 */
724static int
725ext2_unmount(struct mount *mp, int mntflags)
726{
727	struct ext2mount *ump;
728	struct m_ext2fs *fs;
729	struct csum *sump;
730	int error, flags, i, ronly;
731
732	flags = 0;
733	if (mntflags & MNT_FORCE) {
734		if (mp->mnt_flag & MNT_ROOTFS)
735			return (EINVAL);
736		flags |= FORCECLOSE;
737	}
738	if ((error = ext2_flushfiles(mp, flags, curthread)) != 0)
739		return (error);
740	ump = VFSTOEXT2(mp);
741	fs = ump->um_e2fs;
742	ronly = fs->e2fs_ronly;
743	if (ronly == 0 && ext2_cgupdate(ump, MNT_WAIT) == 0) {
744		if (fs->e2fs_wasvalid)
745			fs->e2fs->e2fs_state |= E2FS_ISCLEAN;
746		ext2_sbupdate(ump, MNT_WAIT);
747	}
748
749	g_topology_lock();
750	g_vfs_close(ump->um_cp);
751	g_topology_unlock();
752	vrele(ump->um_devvp);
753	sump = fs->e2fs_clustersum;
754	for (i = 0; i < fs->e2fs_gcount; i++, sump++)
755		free(sump->cs_sum, M_EXT2MNT);
756	free(fs->e2fs_clustersum, M_EXT2MNT);
757	free(fs->e2fs_maxcluster, M_EXT2MNT);
758	free(fs->e2fs_gd, M_EXT2MNT);
759	free(fs->e2fs_contigdirs, M_EXT2MNT);
760	free(fs->e2fs, M_EXT2MNT);
761	free(fs, M_EXT2MNT);
762	free(ump, M_EXT2MNT);
763	mp->mnt_data = NULL;
764	MNT_ILOCK(mp);
765	mp->mnt_flag &= ~MNT_LOCAL;
766	MNT_IUNLOCK(mp);
767	return (error);
768}
769
/*
 * Flush out all the files in a filesystem.
 *
 * Thin wrapper around vflush() with no root vnode references to skip;
 * returns whatever vflush() returns.
 */
static int
ext2_flushfiles(struct mount *mp, int flags, struct thread *td)
{

	return (vflush(mp, 0, flags, td));
}
781
782/*
783 * Get filesystem statistics.
784 */
785int
786ext2_statfs(struct mount *mp, struct statfs *sbp)
787{
788	struct ext2mount *ump;
789	struct m_ext2fs *fs;
790	uint32_t overhead, overhead_per_group, ngdb;
791	int i, ngroups;
792
793	ump = VFSTOEXT2(mp);
794	fs = ump->um_e2fs;
795	if (fs->e2fs->e2fs_magic != E2FS_MAGIC)
796		panic("ext2_statfs");
797
798	/*
799	 * Compute the overhead (FS structures)
800	 */
801	overhead_per_group =
802	    1 /* block bitmap */ +
803	    1 /* inode bitmap */ +
804	    fs->e2fs_itpg;
805	overhead = fs->e2fs->e2fs_first_dblock +
806	    fs->e2fs_gcount * overhead_per_group;
807	if (fs->e2fs->e2fs_rev > E2FS_REV0 &&
808	    fs->e2fs->e2fs_features_rocompat & EXT2F_ROCOMPAT_SPARSESUPER) {
809		for (i = 0, ngroups = 0; i < fs->e2fs_gcount; i++) {
810			if (ext2_cg_has_sb(fs, i))
811				ngroups++;
812		}
813	} else {
814		ngroups = fs->e2fs_gcount;
815	}
816	ngdb = fs->e2fs_gdbcount;
817	if (fs->e2fs->e2fs_rev > E2FS_REV0 &&
818	    fs->e2fs->e2fs_features_compat & EXT2F_COMPAT_RESIZE)
819		ngdb += fs->e2fs->e2fs_reserved_ngdb;
820	overhead += ngroups * (1 /* superblock */ + ngdb);
821
822	sbp->f_bsize = EXT2_FRAG_SIZE(fs);
823	sbp->f_iosize = EXT2_BLOCK_SIZE(fs);
824	sbp->f_blocks = fs->e2fs->e2fs_bcount - overhead;
825	sbp->f_bfree = fs->e2fs->e2fs_fbcount;
826	sbp->f_bavail = sbp->f_bfree - fs->e2fs->e2fs_rbcount;
827	sbp->f_files = fs->e2fs->e2fs_icount;
828	sbp->f_ffree = fs->e2fs->e2fs_ficount;
829	return (0);
830}
831
832/*
833 * Go through the disk queues to initiate sandbagged IO;
834 * go through the inodes to write those that have been modified;
835 * initiate the writing of the super block if it has been modified.
836 *
837 * Note: we are always called with the filesystem marked `MPBUSY'.
838 */
839static int
840ext2_sync(struct mount *mp, int waitfor)
841{
842	struct vnode *mvp, *vp;
843	struct thread *td;
844	struct inode *ip;
845	struct ext2mount *ump = VFSTOEXT2(mp);
846	struct m_ext2fs *fs;
847	int error, allerror = 0;
848
849	td = curthread;
850	fs = ump->um_e2fs;
851	if (fs->e2fs_fmod != 0 && fs->e2fs_ronly != 0) {		/* XXX */
852		printf("fs = %s\n", fs->e2fs_fsmnt);
853		panic("ext2_sync: rofs mod");
854	}
855
856	/*
857	 * Write back each (modified) inode.
858	 */
859loop:
860	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
861		if (vp->v_type == VNON) {
862			VI_UNLOCK(vp);
863			continue;
864		}
865		ip = VTOI(vp);
866		if ((ip->i_flag &
867		    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
868		    (vp->v_bufobj.bo_dirty.bv_cnt == 0 ||
869		    waitfor == MNT_LAZY)) {
870			VI_UNLOCK(vp);
871			continue;
872		}
873		error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td);
874		if (error) {
875			if (error == ENOENT) {
876				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
877				goto loop;
878			}
879			continue;
880		}
881		if ((error = VOP_FSYNC(vp, waitfor, td)) != 0)
882			allerror = error;
883		VOP_UNLOCK(vp, 0);
884		vrele(vp);
885	}
886
887	/*
888	 * Force stale filesystem control information to be flushed.
889	 */
890	if (waitfor != MNT_LAZY) {
891		vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
892		if ((error = VOP_FSYNC(ump->um_devvp, waitfor, td)) != 0)
893			allerror = error;
894		VOP_UNLOCK(ump->um_devvp, 0);
895	}
896
897	/*
898	 * Write back modified superblock.
899	 */
900	if (fs->e2fs_fmod != 0) {
901		fs->e2fs_fmod = 0;
902		fs->e2fs->e2fs_wtime = time_second;
903		if ((error = ext2_cgupdate(ump, waitfor)) != 0)
904			allerror = error;
905	}
906	return (allerror);
907}
908
909/*
910 * Look up an EXT2FS dinode number to find its incore vnode, otherwise read it
911 * in from disk.  If it is in core, wait for the lock bit to clear, then
912 * return the inode locked.  Detection and handling of mount points must be
913 * done by the calling routine.
914 */
915static int
916ext2_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp)
917{
918	struct m_ext2fs *fs;
919	struct inode *ip;
920	struct ext2mount *ump;
921	struct buf *bp;
922	struct vnode *vp;
923	struct thread *td;
924	int i, error;
925	int used_blocks;
926
927	td = curthread;
928	error = vfs_hash_get(mp, ino, flags, td, vpp, NULL, NULL);
929	if (error || *vpp != NULL)
930		return (error);
931
932	ump = VFSTOEXT2(mp);
933	ip = malloc(sizeof(struct inode), M_EXT2NODE, M_WAITOK | M_ZERO);
934
935	/* Allocate a new vnode/inode. */
936	if ((error = getnewvnode("ext2fs", mp, &ext2_vnodeops, &vp)) != 0) {
937		*vpp = NULL;
938		free(ip, M_EXT2NODE);
939		return (error);
940	}
941	vp->v_data = ip;
942	ip->i_vnode = vp;
943	ip->i_e2fs = fs = ump->um_e2fs;
944	ip->i_ump = ump;
945	ip->i_number = ino;
946
947	lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
948	error = insmntque(vp, mp);
949	if (error != 0) {
950		free(ip, M_EXT2NODE);
951		*vpp = NULL;
952		return (error);
953	}
954	error = vfs_hash_insert(vp, ino, flags, td, vpp, NULL, NULL);
955	if (error || *vpp != NULL)
956		return (error);
957
958	/* Read in the disk contents for the inode, copy into the inode. */
959	if ((error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)),
960	    (int)fs->e2fs_bsize, NOCRED, &bp)) != 0) {
961		/*
962		 * The inode does not contain anything useful, so it would
963		 * be misleading to leave it on its hash chain. With mode
964		 * still zero, it will be unlinked and returned to the free
965		 * list by vput().
966		 */
967		brelse(bp);
968		vput(vp);
969		*vpp = NULL;
970		return (error);
971	}
972	/* convert ext2 inode to dinode */
973	ext2_ei2i((struct ext2fs_dinode *)((char *)bp->b_data + EXT2_INODE_SIZE(fs) *
974	    ino_to_fsbo(fs, ino)), ip);
975	ip->i_block_group = ino_to_cg(fs, ino);
976	ip->i_next_alloc_block = 0;
977	ip->i_next_alloc_goal = 0;
978
979	/*
980	 * Now we want to make sure that block pointers for unused
981	 * blocks are zeroed out - ext2_balloc depends on this
982	 * although for regular files and directories only
983	 *
984	 * If IN_E4EXTENTS is enabled, unused blocks are not zeroed
985	 * out because we could corrupt the extent tree.
986	 */
987	if (!(ip->i_flag & IN_E4EXTENTS) &&
988	    (S_ISDIR(ip->i_mode) || S_ISREG(ip->i_mode))) {
989		used_blocks = howmany(ip->i_size, fs->e2fs_bsize);
990		for (i = used_blocks; i < EXT2_NDIR_BLOCKS; i++)
991			ip->i_db[i] = 0;
992	}
993#ifdef EXT2FS_DEBUG
994	ext2_print_inode(ip);
995#endif
996	bqrelse(bp);
997
998	/*
999	 * Initialize the vnode from the inode, check for aliases.
1000	 * Note that the underlying vnode may have changed.
1001	 */
1002	if ((error = ext2_vinit(mp, &ext2_fifoops, &vp)) != 0) {
1003		vput(vp);
1004		*vpp = NULL;
1005		return (error);
1006	}
1007
1008	/*
1009	 * Finish inode initialization.
1010	 */
1011
1012	*vpp = vp;
1013	return (0);
1014}
1015
1016/*
1017 * File handle to vnode
1018 *
1019 * Have to be really careful about stale file handles:
1020 * - check that the inode number is valid
1021 * - call ext2_vget() to get the locked inode
1022 * - check for an unallocated inode (i_mode == 0)
1023 * - check that the given client host has export rights and return
1024 *   those rights via. exflagsp and credanonp
1025 */
1026static int
1027ext2_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp)
1028{
1029	struct inode *ip;
1030	struct ufid *ufhp;
1031	struct vnode *nvp;
1032	struct m_ext2fs *fs;
1033	int error;
1034
1035	ufhp = (struct ufid *)fhp;
1036	fs = VFSTOEXT2(mp)->um_e2fs;
1037	if (ufhp->ufid_ino < EXT2_ROOTINO ||
1038	    ufhp->ufid_ino > fs->e2fs_gcount * fs->e2fs->e2fs_ipg)
1039		return (ESTALE);
1040
1041	error = VFS_VGET(mp, ufhp->ufid_ino, LK_EXCLUSIVE, &nvp);
1042	if (error) {
1043		*vpp = NULLVP;
1044		return (error);
1045	}
1046	ip = VTOI(nvp);
1047	if (ip->i_mode == 0 ||
1048	    ip->i_gen != ufhp->ufid_gen || ip->i_nlink <= 0) {
1049		vput(nvp);
1050		*vpp = NULLVP;
1051		return (ESTALE);
1052	}
1053	*vpp = nvp;
1054	vnode_create_vobject(*vpp, 0, curthread);
1055	return (0);
1056}
1057
1058/*
1059 * Write a superblock and associated information back to disk.
1060 */
1061static int
1062ext2_sbupdate(struct ext2mount *mp, int waitfor)
1063{
1064	struct m_ext2fs *fs = mp->um_e2fs;
1065	struct ext2fs *es = fs->e2fs;
1066	struct buf *bp;
1067	int error = 0;
1068
1069	bp = getblk(mp->um_devvp, SBLOCK, SBSIZE, 0, 0, 0);
1070	bcopy((caddr_t)es, bp->b_data, (u_int)sizeof(struct ext2fs));
1071	if (waitfor == MNT_WAIT)
1072		error = bwrite(bp);
1073	else
1074		bawrite(bp);
1075
1076	/*
1077	 * The buffers for group descriptors, inode bitmaps and block bitmaps
1078	 * are not busy at this point and are (hopefully) written by the
1079	 * usual sync mechanism. No need to write them here.
1080	 */
1081	return (error);
1082}
1083int
1084ext2_cgupdate(struct ext2mount *mp, int waitfor)
1085{
1086	struct m_ext2fs *fs = mp->um_e2fs;
1087	struct buf *bp;
1088	int i, error = 0, allerror = 0;
1089
1090	allerror = ext2_sbupdate(mp, waitfor);
1091
1092	/* Update gd csums */
1093	if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_GDT_CSUM))
1094		ext2_gd_csum_set(fs);
1095
1096	for (i = 0; i < fs->e2fs_gdbcount; i++) {
1097		bp = getblk(mp->um_devvp, fsbtodb(fs,
1098		    fs->e2fs->e2fs_first_dblock +
1099		    1 /* superblock */ + i), fs->e2fs_bsize, 0, 0, 0);
1100		e2fs_cgsave(&fs->e2fs_gd[
1101		    i * fs->e2fs_bsize / sizeof(struct ext2_gd)],
1102		    (struct ext2_gd *)bp->b_data, fs->e2fs_bsize);
1103		if (waitfor == MNT_WAIT)
1104			error = bwrite(bp);
1105		else
1106			bawrite(bp);
1107	}
1108
1109	if (!allerror && error)
1110		allerror = error;
1111	return (allerror);
1112}
1113
1114/*
1115 * Return the root of a filesystem.
1116 */
1117static int
1118ext2_root(struct mount *mp, int flags, struct vnode **vpp)
1119{
1120	struct vnode *nvp;
1121	int error;
1122
1123	error = VFS_VGET(mp, EXT2_ROOTINO, LK_EXCLUSIVE, &nvp);
1124	if (error)
1125		return (error);
1126	*vpp = nvp;
1127	return (0);
1128}
1129