1/*-
2 *  modified for EXT2FS support in Lites 1.1
3 *
4 *  Aug 1995, Godmar Back (gback@cs.utah.edu)
5 *  University of Utah, Department of Computer Science
6 */
7/*-
8 * SPDX-License-Identifier: BSD-3-Clause
9 *
10 * Copyright (c) 1989, 1991, 1993, 1994
11 *	The Regents of the University of California.  All rights reserved.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 *    notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 * 3. Neither the name of the University nor the names of its contributors
22 *    may be used to endorse or promote products derived from this software
23 *    without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 */
37
38#include <sys/param.h>
39#include <sys/systm.h>
40#include <sys/namei.h>
41#include <sys/priv.h>
42#include <sys/proc.h>
43#include <sys/kernel.h>
44#include <sys/vnode.h>
45#include <sys/mount.h>
46#include <sys/bio.h>
47#include <sys/buf.h>
48#include <sys/conf.h>
49#include <sys/endian.h>
50#include <sys/fcntl.h>
51#include <sys/malloc.h>
52#include <sys/sdt.h>
53#include <sys/stat.h>
54#include <sys/mutex.h>
55
56#include <geom/geom.h>
57#include <geom/geom_vfs.h>
58
59#include <fs/ext2fs/fs.h>
60#include <fs/ext2fs/ext2_mount.h>
61#include <fs/ext2fs/inode.h>
62
63#include <fs/ext2fs/ext2fs.h>
64#include <fs/ext2fs/ext2_dinode.h>
65#include <fs/ext2fs/ext2_extern.h>
66#include <fs/ext2fs/ext2_extents.h>
67
68SDT_PROVIDER_DECLARE(ext2fs);
69/*
70 * ext2fs trace probe:
71 * arg0: verbosity. Higher numbers give more verbose messages
72 * arg1: Textual message
73 */
74SDT_PROBE_DEFINE2(ext2fs, , vfsops, trace, "int", "char*");
75SDT_PROBE_DEFINE2(ext2fs, , vfsops, ext2_cg_validate_error, "char*", "int");
76SDT_PROBE_DEFINE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "char*");
77
78static int	ext2_flushfiles(struct mount *mp, int flags, struct thread *td);
79static int	ext2_mountfs(struct vnode *, struct mount *);
80static int	ext2_reload(struct mount *mp, struct thread *td);
81static int	ext2_sbupdate(struct ext2mount *, int);
82static int	ext2_cgupdate(struct ext2mount *, int);
83static vfs_unmount_t		ext2_unmount;
84static vfs_root_t		ext2_root;
85static vfs_statfs_t		ext2_statfs;
86static vfs_sync_t		ext2_sync;
87static vfs_vget_t		ext2_vget;
88static vfs_fhtovp_t		ext2_fhtovp;
89static vfs_mount_t		ext2_mount;
90
91MALLOC_DEFINE(M_EXT2NODE, "ext2_node", "EXT2 vnode private part");
92static MALLOC_DEFINE(M_EXT2MNT, "ext2_mount", "EXT2 mount structure");
93
94static struct vfsops ext2fs_vfsops = {
95	.vfs_fhtovp =		ext2_fhtovp,
96	.vfs_mount =		ext2_mount,
97	.vfs_root =		ext2_root,	/* root inode via vget */
98	.vfs_statfs =		ext2_statfs,
99	.vfs_sync =		ext2_sync,
100	.vfs_unmount =		ext2_unmount,
101	.vfs_vget =		ext2_vget,
102};
103
104VFS_SET(ext2fs_vfsops, ext2fs, 0);
105
106static int	ext2_check_sb_compat(struct ext2fs *es, struct cdev *dev,
107		    int ronly);
108static int	ext2_compute_sb_data(struct vnode * devvp,
109		    struct ext2fs * es, struct m_ext2fs * fs);
110
111static const char *ext2_opts[] = { "acls", "async", "noatime", "noclusterr",
112    "noclusterw", "noexec", "export", "force", "from", "multilabel",
113    "suiddir", "nosymfollow", "sync", "union", NULL };
114
115/*
116 * VFS Operations.
117 *
118 * mount system call
119 */
120static int
121ext2_mount(struct mount *mp)
122{
123	struct vfsoptlist *opts;
124	struct vnode *devvp;
125	struct thread *td;
126	struct ext2mount *ump = NULL;
127	struct m_ext2fs *fs;
128	struct nameidata nd, *ndp = &nd;
129	accmode_t accmode;
130	char *path, *fspec;
131	int error, flags, len;
132
133	td = curthread;
134	opts = mp->mnt_optnew;
135
136	if (vfs_filteropt(opts, ext2_opts))
137		return (EINVAL);
138
139	vfs_getopt(opts, "fspath", (void **)&path, NULL);
140	/* Double-check the length of path.. */
141	if (strlen(path) >= MAXMNTLEN)
142		return (ENAMETOOLONG);
143
144	fspec = NULL;
145	error = vfs_getopt(opts, "from", (void **)&fspec, &len);
146	if (!error && fspec[len - 1] != '\0')
147		return (EINVAL);
148
149	/*
150	 * If updating, check whether changing from read-only to
151	 * read/write; if there is no device name, that's all we do.
152	 */
153	if (mp->mnt_flag & MNT_UPDATE) {
154		ump = VFSTOEXT2(mp);
155		fs = ump->um_e2fs;
156		error = 0;
157		if (fs->e2fs_ronly == 0 &&
158		    vfs_flagopt(opts, "ro", NULL, 0)) {
159			error = VFS_SYNC(mp, MNT_WAIT);
160			if (error)
161				return (error);
162			flags = WRITECLOSE;
163			if (mp->mnt_flag & MNT_FORCE)
164				flags |= FORCECLOSE;
165			error = ext2_flushfiles(mp, flags, td);
166			if (error == 0 && fs->e2fs_wasvalid &&
167			    ext2_cgupdate(ump, MNT_WAIT) == 0) {
168				fs->e2fs->e2fs_state =
169				    htole16((le16toh(fs->e2fs->e2fs_state) |
170				    E2FS_ISCLEAN));
171				ext2_sbupdate(ump, MNT_WAIT);
172			}
173			fs->e2fs_ronly = 1;
174			vfs_flagopt(opts, "ro", &mp->mnt_flag, MNT_RDONLY);
175			g_topology_lock();
176			g_access(ump->um_cp, 0, -1, 0);
177			g_topology_unlock();
178		}
179		if (!error && (mp->mnt_flag & MNT_RELOAD))
180			error = ext2_reload(mp, td);
181		if (error)
182			return (error);
183		devvp = ump->um_devvp;
184		if (fs->e2fs_ronly && !vfs_flagopt(opts, "ro", NULL, 0)) {
185			if (ext2_check_sb_compat(fs->e2fs, devvp->v_rdev, 0))
186				return (EPERM);
187
188			/*
189			 * If upgrade to read-write by non-root, then verify
190			 * that user has necessary permissions on the device.
191			 */
192			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
193			error = VOP_ACCESS(devvp, VREAD | VWRITE,
194			    td->td_ucred, td);
195			if (error)
196				error = priv_check(td, PRIV_VFS_MOUNT_PERM);
197			if (error) {
198				VOP_UNLOCK(devvp);
199				return (error);
200			}
201			VOP_UNLOCK(devvp);
202			g_topology_lock();
203			error = g_access(ump->um_cp, 0, 1, 0);
204			g_topology_unlock();
205			if (error)
206				return (error);
207
208			if ((le16toh(fs->e2fs->e2fs_state) & E2FS_ISCLEAN) == 0 ||
209			    (le16toh(fs->e2fs->e2fs_state) & E2FS_ERRORS)) {
210				if (mp->mnt_flag & MNT_FORCE) {
211					printf(
212"WARNING: %s was not properly dismounted\n", fs->e2fs_fsmnt);
213				} else {
214					printf(
215"WARNING: R/W mount of %s denied.  Filesystem is not clean - run fsck\n",
216					    fs->e2fs_fsmnt);
217					return (EPERM);
218				}
219			}
220			fs->e2fs->e2fs_state =
221			    htole16(le16toh(fs->e2fs->e2fs_state) & ~E2FS_ISCLEAN);
222			(void)ext2_cgupdate(ump, MNT_WAIT);
223			fs->e2fs_ronly = 0;
224			MNT_ILOCK(mp);
225			mp->mnt_flag &= ~MNT_RDONLY;
226			MNT_IUNLOCK(mp);
227		}
228		if (vfs_flagopt(opts, "export", NULL, 0)) {
229			/* Process export requests in vfs_mount.c. */
230			return (error);
231		}
232	}
233
234	/*
235	 * Not an update, or updating the name: look up the name
236	 * and verify that it refers to a sensible disk device.
237	 */
238	if (fspec == NULL)
239		return (EINVAL);
240	NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec);
241	if ((error = namei(ndp)) != 0)
242		return (error);
243	NDFREE_PNBUF(ndp);
244	devvp = ndp->ni_vp;
245
246	if (!vn_isdisk_error(devvp, &error)) {
247		vput(devvp);
248		return (error);
249	}
250
251	/*
252	 * If mount by non-root, then verify that user has necessary
253	 * permissions on the device.
254	 *
255	 * XXXRW: VOP_ACCESS() enough?
256	 */
257	accmode = VREAD;
258	if ((mp->mnt_flag & MNT_RDONLY) == 0)
259		accmode |= VWRITE;
260	error = VOP_ACCESS(devvp, accmode, td->td_ucred, td);
261	if (error)
262		error = priv_check(td, PRIV_VFS_MOUNT_PERM);
263	if (error) {
264		vput(devvp);
265		return (error);
266	}
267
268	if ((mp->mnt_flag & MNT_UPDATE) == 0) {
269		error = ext2_mountfs(devvp, mp);
270	} else {
271		if (devvp != ump->um_devvp) {
272			vput(devvp);
273			return (EINVAL);	/* needs translation */
274		} else
275			vput(devvp);
276	}
277	if (error) {
278		vrele(devvp);
279		return (error);
280	}
281	ump = VFSTOEXT2(mp);
282	fs = ump->um_e2fs;
283
284	/*
285	 * Note that this strncpy() is ok because of a check at the start
286	 * of ext2_mount().
287	 */
288	strncpy(fs->e2fs_fsmnt, path, MAXMNTLEN);
289	fs->e2fs_fsmnt[MAXMNTLEN - 1] = '\0';
290	vfs_mountedfrom(mp, fspec);
291	return (0);
292}
293
294static int
295ext2_check_sb_compat(struct ext2fs *es, struct cdev *dev, int ronly)
296{
297	uint32_t i, mask;
298
299	if (le16toh(es->e2fs_magic) != E2FS_MAGIC) {
300		printf("ext2fs: %s: wrong magic number %#x (expected %#x)\n",
301		    devtoname(dev), le16toh(es->e2fs_magic), E2FS_MAGIC);
302		return (1);
303	}
304	if (le32toh(es->e2fs_rev) > E2FS_REV0) {
305		mask = le32toh(es->e2fs_features_incompat) & ~(EXT2F_INCOMPAT_SUPP);
306		if (mask) {
307			printf("WARNING: mount of %s denied due to "
308			    "unsupported optional features:\n", devtoname(dev));
309			for (i = 0;
310			    i < sizeof(incompat)/sizeof(struct ext2_feature);
311			    i++)
312				if (mask & incompat[i].mask)
313					printf("%s ", incompat[i].name);
314			printf("\n");
315			return (1);
316		}
317		mask = le32toh(es->e2fs_features_rocompat) & ~EXT2F_ROCOMPAT_SUPP;
318		if (!ronly && mask) {
319			printf("WARNING: R/W mount of %s denied due to "
320			    "unsupported optional features:\n", devtoname(dev));
321			for (i = 0;
322			    i < sizeof(ro_compat)/sizeof(struct ext2_feature);
323			    i++)
324				if (mask & ro_compat[i].mask)
325					printf("%s ", ro_compat[i].name);
326			printf("\n");
327			return (1);
328		}
329	}
330	return (0);
331}
332
333static e4fs_daddr_t
334ext2_cg_location(struct m_ext2fs *fs, int number)
335{
336	int cg, descpb, logical_sb, has_super = 0;
337
338	/*
339	 * Adjust logical superblock block number.
340	 * Godmar thinks: if the blocksize is greater than 1024, then
341	 * the superblock is logically part of block zero.
342	 */
343	logical_sb = fs->e2fs_bsize > SBLOCKSIZE ? 0 : 1;
344
345	if (!EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_META_BG) ||
346	    number < le32toh(fs->e2fs->e3fs_first_meta_bg))
347		return (logical_sb + number + 1);
348
349	if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT))
350		descpb = fs->e2fs_bsize / sizeof(struct ext2_gd);
351	else
352		descpb = fs->e2fs_bsize / E2FS_REV0_GD_SIZE;
353
354	cg = descpb * number;
355
356	if (ext2_cg_has_sb(fs, cg))
357		has_super = 1;
358
359	return (has_super + cg * (e4fs_daddr_t)EXT2_BLOCKS_PER_GROUP(fs) +
360	    le32toh(fs->e2fs->e2fs_first_dblock));
361}
362
363static int
364ext2_cg_validate(struct m_ext2fs *fs)
365{
366	uint64_t b_bitmap;
367	uint64_t i_bitmap;
368	uint64_t i_tables;
369	uint64_t first_block, last_block, last_cg_block;
370	struct ext2_gd *gd;
371	unsigned int i, cg_count;
372
373	first_block = le32toh(fs->e2fs->e2fs_first_dblock);
374	last_cg_block = ext2_cg_number_gdb(fs, 0);
375	cg_count = fs->e2fs_gcount;
376
377	for (i = 0; i < fs->e2fs_gcount; i++) {
378		gd = &fs->e2fs_gd[i];
379
380		if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_FLEX_BG) ||
381		    i == fs->e2fs_gcount - 1) {
382			last_block = fs->e2fs_bcount - 1;
383		} else {
384			last_block = first_block +
385			    (EXT2_BLOCKS_PER_GROUP(fs) - 1);
386		}
387
388		if ((cg_count == fs->e2fs_gcount) &&
389		    !(le16toh(gd->ext4bgd_flags) & EXT2_BG_INODE_ZEROED))
390			cg_count = i;
391
392		b_bitmap = e2fs_gd_get_b_bitmap(gd);
393		if (b_bitmap == 0) {
394			SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
395			    "block bitmap is zero", i);
396			return (EINVAL);
397		}
398		if (b_bitmap <= last_cg_block) {
399			SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
400			    "block bitmap overlaps gds", i);
401			return (EINVAL);
402		}
403		if (b_bitmap < first_block || b_bitmap > last_block) {
404			SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
405			    "block bitmap not in group", i);
406			return (EINVAL);
407		}
408
409		i_bitmap = e2fs_gd_get_i_bitmap(gd);
410		if (i_bitmap == 0) {
411			SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
412			    "inode bitmap is zero", i);
413			return (EINVAL);
414		}
415		if (i_bitmap <= last_cg_block) {
416			SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
417			    "inode bitmap overlaps gds", i);
418			return (EINVAL);
419		}
420		if (i_bitmap < first_block || i_bitmap > last_block) {
421			SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
422			    "inode bitmap not in group blk", i);
423			return (EINVAL);
424		}
425
426		i_tables = e2fs_gd_get_i_tables(gd);
427		if (i_tables == 0) {
428			SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
429			    "inode table is zero", i);
430			return (EINVAL);
431		}
432		if (i_tables <= last_cg_block) {
433			SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
434			    "inode tables overlaps gds", i);
435			return (EINVAL);
436		}
437		if (i_tables < first_block ||
438		    i_tables + fs->e2fs_itpg - 1 > last_block) {
439			SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error,
440			    "inode tables not in group blk", i);
441			return (EINVAL);
442		}
443
444		if (!EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_FLEX_BG))
445			first_block += EXT2_BLOCKS_PER_GROUP(fs);
446	}
447
448	return (0);
449}
450
451/*
452 * This computes the fields of the m_ext2fs structure from the
453 * data in the ext2fs structure read in.
454 */
455static int
456ext2_compute_sb_data(struct vnode *devvp, struct ext2fs *es,
457    struct m_ext2fs *fs)
458{
459	struct buf *bp;
460	uint32_t e2fs_descpb, e2fs_gdbcount_alloc;
461	int i, j;
462	int g_count = 0;
463	int error;
464
465	/* Check checksum features */
466	if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_GDT_CSUM) &&
467	    EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) {
468		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
469		    "incorrect checksum features combination");
470		return (EINVAL);
471	}
472
473	/* Precompute checksum seed for all metadata */
474	ext2_sb_csum_set_seed(fs);
475
476	/* Verify sb csum if possible */
477	if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) {
478		error = ext2_sb_csum_verify(fs);
479		if (error) {
480			return (error);
481		}
482	}
483
484	/* Check for block size = 1K|2K|4K */
485	if (le32toh(es->e2fs_log_bsize) > 2) {
486		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
487		    "bad block size");
488		return (EINVAL);
489	}
490
491	fs->e2fs_bshift = EXT2_MIN_BLOCK_LOG_SIZE + le32toh(es->e2fs_log_bsize);
492	fs->e2fs_bsize = 1U << fs->e2fs_bshift;
493	fs->e2fs_fsbtodb = le32toh(es->e2fs_log_bsize) + 1;
494	fs->e2fs_qbmask = fs->e2fs_bsize - 1;
495
496	/* Check for fragment size */
497	if (le32toh(es->e2fs_log_fsize) >
498	    (EXT2_MAX_FRAG_LOG_SIZE - EXT2_MIN_BLOCK_LOG_SIZE)) {
499		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
500		    "invalid log cluster size");
501		return (EINVAL);
502	}
503
504	fs->e2fs_fsize = EXT2_MIN_FRAG_SIZE << le32toh(es->e2fs_log_fsize);
505	if (fs->e2fs_fsize != fs->e2fs_bsize) {
506		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
507		    "fragment size != block size");
508		return (EINVAL);
509	}
510
511	fs->e2fs_fpb = fs->e2fs_bsize / fs->e2fs_fsize;
512
513	/* Check reserved gdt blocks for future filesystem expansion */
514	if (le16toh(es->e2fs_reserved_ngdb) > (fs->e2fs_bsize / 4)) {
515		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
516		    "number of reserved GDT blocks too large");
517		return (EINVAL);
518	}
519
520	if (le32toh(es->e2fs_rev) == E2FS_REV0) {
521		fs->e2fs_isize = E2FS_REV0_INODE_SIZE;
522	} else {
523		fs->e2fs_isize = le16toh(es->e2fs_inode_size);
524
525		/*
526		 * Check first ino.
527		 */
528		if (le32toh(es->e2fs_first_ino) < EXT2_FIRSTINO) {
529			SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
530			    "invalid first ino");
531			return (EINVAL);
532		}
533
534		/*
535		 * Simple sanity check for superblock inode size value.
536		 */
537		if (EXT2_INODE_SIZE(fs) < E2FS_REV0_INODE_SIZE ||
538		    EXT2_INODE_SIZE(fs) > fs->e2fs_bsize ||
539		    (fs->e2fs_isize & (fs->e2fs_isize - 1)) != 0) {
540			SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
541			    "invalid inode size");
542			return (EINVAL);
543		}
544	}
545
546	/* Check group descriptors */
547	if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT) &&
548	    le16toh(es->e3fs_desc_size) != E2FS_64BIT_GD_SIZE) {
549		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
550		    "unsupported 64bit descriptor size");
551		return (EINVAL);
552	}
553
554	fs->e2fs_bpg = le32toh(es->e2fs_bpg);
555	fs->e2fs_fpg = le32toh(es->e2fs_fpg);
556	if (fs->e2fs_bpg == 0 || fs->e2fs_fpg == 0) {
557		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
558		    "zero blocks/fragments per group");
559		return (EINVAL);
560	} else if (fs->e2fs_bpg != fs->e2fs_fpg) {
561		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
562		    "blocks per group not equal fragments per group");
563		return (EINVAL);
564	}
565
566	if (fs->e2fs_bpg != fs->e2fs_bsize * 8) {
567		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
568		    "non-standard group size unsupported");
569		return (EINVAL);
570	}
571
572	fs->e2fs_ipb = fs->e2fs_bsize / EXT2_INODE_SIZE(fs);
573	if (fs->e2fs_ipb == 0 ||
574	    fs->e2fs_ipb > fs->e2fs_bsize / E2FS_REV0_INODE_SIZE) {
575		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
576		    "bad inodes per block size");
577		return (EINVAL);
578	}
579
580	fs->e2fs_ipg = le32toh(es->e2fs_ipg);
581	if (fs->e2fs_ipg < fs->e2fs_ipb || fs->e2fs_ipg >  fs->e2fs_bsize * 8) {
582		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
583		    "invalid inodes per group");
584		return (EINVAL);
585	}
586
587	fs->e2fs_itpg = fs->e2fs_ipg / fs->e2fs_ipb;
588
589	fs->e2fs_bcount = le32toh(es->e2fs_bcount);
590	fs->e2fs_rbcount = le32toh(es->e2fs_rbcount);
591	fs->e2fs_fbcount = le32toh(es->e2fs_fbcount);
592	if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) {
593		fs->e2fs_bcount |= (uint64_t)(le32toh(es->e4fs_bcount_hi)) << 32;
594		fs->e2fs_rbcount |= (uint64_t)(le32toh(es->e4fs_rbcount_hi)) << 32;
595		fs->e2fs_fbcount |= (uint64_t)(le32toh(es->e4fs_fbcount_hi)) << 32;
596	}
597	if (fs->e2fs_rbcount > fs->e2fs_bcount ||
598	    fs->e2fs_fbcount > fs->e2fs_bcount) {
599		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
600		    "invalid block count");
601		return (EINVAL);
602	}
603
604	fs->e2fs_ficount = le32toh(es->e2fs_ficount);
605	if (fs->e2fs_ficount > le32toh(es->e2fs_icount)) {
606		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
607		    "invalid number of free inodes");
608		return (EINVAL);
609	}
610
611	if (le32toh(es->e2fs_first_dblock) != (fs->e2fs_bsize > 1024 ? 0 : 1) ||
612	    le32toh(es->e2fs_first_dblock) >= fs->e2fs_bcount) {
613		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
614		    "first data block out of range");
615		return (EINVAL);
616	}
617
618	fs->e2fs_gcount = howmany(fs->e2fs_bcount -
619	    le32toh(es->e2fs_first_dblock), EXT2_BLOCKS_PER_GROUP(fs));
620	if (fs->e2fs_gcount > ((uint64_t)1 << 32) - EXT2_DESCS_PER_BLOCK(fs)) {
621		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
622		    "groups count too large");
623		return (EINVAL);
624	}
625
626	/* Check for extra isize in big inodes. */
627	if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_EXTRA_ISIZE) &&
628	    EXT2_INODE_SIZE(fs) < sizeof(struct ext2fs_dinode)) {
629		SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error,
630		    "no space for extra inode timestamps");
631		return (EINVAL);
632	}
633
634	/* s_resuid / s_resgid ? */
635
636	if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) {
637		e2fs_descpb = fs->e2fs_bsize / E2FS_64BIT_GD_SIZE;
638		e2fs_gdbcount_alloc = howmany(fs->e2fs_gcount, e2fs_descpb);
639	} else {
640		e2fs_descpb = fs->e2fs_bsize / E2FS_REV0_GD_SIZE;
641		e2fs_gdbcount_alloc = howmany(fs->e2fs_gcount,
642		    fs->e2fs_bsize / sizeof(struct ext2_gd));
643	}
644	fs->e2fs_gdbcount = howmany(fs->e2fs_gcount, e2fs_descpb);
645	fs->e2fs_gd = malloc(e2fs_gdbcount_alloc * fs->e2fs_bsize,
646	    M_EXT2MNT, M_WAITOK | M_ZERO);
647	fs->e2fs_contigdirs = malloc(fs->e2fs_gcount *
648	    sizeof(*fs->e2fs_contigdirs), M_EXT2MNT, M_WAITOK | M_ZERO);
649
650	for (i = 0; i < fs->e2fs_gdbcount; i++) {
651		error = bread(devvp,
652		    fsbtodb(fs, ext2_cg_location(fs, i)),
653		    fs->e2fs_bsize, NOCRED, &bp);
654		if (error) {
655			/*
656			 * fs->e2fs_gd and fs->e2fs_contigdirs
657			 * will be freed later by the caller,
658			 * because this function could be called from
659			 * MNT_UPDATE path.
660			 */
661			return (error);
662		}
663		if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) {
664			memcpy(&fs->e2fs_gd[
665			    i * fs->e2fs_bsize / sizeof(struct ext2_gd)],
666			    bp->b_data, fs->e2fs_bsize);
667		} else {
668			for (j = 0; j < e2fs_descpb &&
669			    g_count < fs->e2fs_gcount; j++, g_count++)
670				memcpy(&fs->e2fs_gd[g_count],
671				    bp->b_data + j * E2FS_REV0_GD_SIZE,
672				    E2FS_REV0_GD_SIZE);
673		}
674		brelse(bp);
675		bp = NULL;
676	}
677
678	/* Validate cgs consistency */
679	error = ext2_cg_validate(fs);
680	if (error)
681		return (error);
682
683	/* Verfy cgs csum */
684	if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_GDT_CSUM) ||
685	    EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) {
686		error = ext2_gd_csum_verify(fs, devvp->v_rdev);
687		if (error)
688			return (error);
689	}
690	/* Initialization for the ext2 Orlov allocator variant. */
691	fs->e2fs_total_dir = 0;
692	for (i = 0; i < fs->e2fs_gcount; i++)
693		fs->e2fs_total_dir += e2fs_gd_get_ndirs(&fs->e2fs_gd[i]);
694
695	if (le32toh(es->e2fs_rev) == E2FS_REV0 ||
696	    !EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_LARGEFILE))
697		fs->e2fs_maxfilesize = 0x7fffffff;
698	else {
699		fs->e2fs_maxfilesize = 0xffffffffffff;
700		if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_HUGE_FILE))
701			fs->e2fs_maxfilesize = 0x7fffffffffffffff;
702	}
703	if (le32toh(es->e4fs_flags) & E2FS_UNSIGNED_HASH) {
704		fs->e2fs_uhash = 3;
705	} else if ((le32toh(es->e4fs_flags) & E2FS_SIGNED_HASH) == 0) {
706#ifdef __CHAR_UNSIGNED__
707		es->e4fs_flags = htole32(le32toh(es->e4fs_flags) | E2FS_UNSIGNED_HASH);
708		fs->e2fs_uhash = 3;
709#else
710		es->e4fs_flags = htole32(le32toh(es->e4fs_flags) | E2FS_SIGNED_HASH);
711#endif
712	}
713	if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM))
714		error = ext2_sb_csum_verify(fs);
715
716	return (error);
717}
718
719/*
720 * Reload all incore data for a filesystem (used after running fsck on
721 * the root filesystem and finding things to fix). The filesystem must
722 * be mounted read-only.
723 *
724 * Things to do to update the mount:
725 *	1) invalidate all cached meta-data.
726 *	2) re-read superblock from disk.
727 *	3) invalidate all cluster summary information.
728 *	4) invalidate all inactive vnodes.
729 *	5) invalidate all cached file data.
730 *	6) re-read inode data for all active vnodes.
731 * XXX we are missing some steps, in particular # 3, this has to be reviewed.
732 */
733static int
734ext2_reload(struct mount *mp, struct thread *td)
735{
736	struct vnode *vp, *mvp, *devvp;
737	struct inode *ip;
738	struct buf *bp;
739	struct ext2fs *es;
740	struct m_ext2fs *fs;
741	struct csum *sump;
742	int error, i;
743	int32_t *lp;
744
745	if ((mp->mnt_flag & MNT_RDONLY) == 0)
746		return (EINVAL);
747	/*
748	 * Step 1: invalidate all cached meta-data.
749	 */
750	devvp = VFSTOEXT2(mp)->um_devvp;
751	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
752	if (vinvalbuf(devvp, 0, 0, 0) != 0)
753		panic("ext2_reload: dirty1");
754	VOP_UNLOCK(devvp);
755
756	/*
757	 * Step 2: re-read superblock from disk.
758	 * constants have been adjusted for ext2
759	 */
760	if ((error = bread(devvp, SBLOCK, SBLOCKBLKSIZE, NOCRED, &bp)) != 0)
761		return (error);
762	es = (struct ext2fs *)((char *)bp->b_data + SBLOCKOFFSET);
763	if (ext2_check_sb_compat(es, devvp->v_rdev, 0) != 0) {
764		brelse(bp);
765		return (EIO);		/* XXX needs translation */
766	}
767	fs = VFSTOEXT2(mp)->um_e2fs;
768	bcopy(bp->b_data, fs->e2fs, sizeof(struct ext2fs));
769
770	if ((error = ext2_compute_sb_data(devvp, es, fs)) != 0) {
771		brelse(bp);
772		return (error);
773	}
774
775	brelse(bp);
776
777	/*
778	 * Step 3: invalidate all cluster summary information.
779	 */
780	if (fs->e2fs_contigsumsize > 0) {
781		lp = fs->e2fs_maxcluster;
782		sump = fs->e2fs_clustersum;
783		for (i = 0; i < fs->e2fs_gcount; i++, sump++) {
784			*lp++ = fs->e2fs_contigsumsize;
785			sump->cs_init = 0;
786			bzero(sump->cs_sum, fs->e2fs_contigsumsize + 1);
787		}
788	}
789
790loop:
791	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
792		/*
793		 * Step 4: invalidate all cached file data.
794		 */
795		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK)) {
796			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
797			goto loop;
798		}
799		if (vinvalbuf(vp, 0, 0, 0))
800			panic("ext2_reload: dirty2");
801
802		/*
803		 * Step 5: re-read inode data for all active vnodes.
804		 */
805		ip = VTOI(vp);
806		error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
807		    (int)fs->e2fs_bsize, NOCRED, &bp);
808		if (error) {
809			vput(vp);
810			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
811			return (error);
812		}
813
814		error = ext2_ei2i((struct ext2fs_dinode *)((char *)bp->b_data +
815		    EXT2_INODE_SIZE(fs) * ino_to_fsbo(fs, ip->i_number)), ip);
816
817		brelse(bp);
818		vput(vp);
819
820		if (error) {
821			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
822			return (error);
823		}
824	}
825	return (0);
826}
827
828/*
829 * Common code for mount and mountroot.
830 */
831static int
832ext2_mountfs(struct vnode *devvp, struct mount *mp)
833{
834	struct ext2mount *ump;
835	struct buf *bp;
836	struct m_ext2fs *fs;
837	struct ext2fs *es;
838	struct cdev *dev = devvp->v_rdev;
839	struct g_consumer *cp;
840	struct bufobj *bo;
841	struct csum *sump;
842	int error;
843	int ronly;
844	int i;
845	u_long size;
846	int32_t *lp;
847	int32_t e2fs_maxcontig;
848
849	bp = NULL;
850	ump = NULL;
851
852	ronly = vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0);
853	/* XXX: use VOP_ACESS to check FS perms */
854	g_topology_lock();
855	error = g_vfs_open(devvp, &cp, "ext2fs", ronly ? 0 : 1);
856	g_topology_unlock();
857	VOP_UNLOCK(devvp);
858	if (error)
859		return (error);
860
861	if (PAGE_SIZE != SBLOCKBLKSIZE) {
862		printf("WARNING: Unsupported page size %d\n", PAGE_SIZE);
863		error = EINVAL;
864		goto out;
865	}
866	if (cp->provider->sectorsize > PAGE_SIZE) {
867		printf("WARNING: Device sectorsize(%d) is more than %d\n",
868		    cp->provider->sectorsize, PAGE_SIZE);
869		error = EINVAL;
870		goto out;
871	}
872
873	bo = &devvp->v_bufobj;
874	bo->bo_private = cp;
875	bo->bo_ops = g_vfs_bufops;
876	if (devvp->v_rdev->si_iosize_max != 0)
877		mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
878	if (mp->mnt_iosize_max > maxphys)
879		mp->mnt_iosize_max = maxphys;
880	if ((error = bread(devvp, SBLOCK, SBLOCKBLKSIZE, NOCRED, &bp)) != 0)
881		goto out;
882	es = (struct ext2fs *)((char *)bp->b_data + SBLOCKOFFSET);
883	if (ext2_check_sb_compat(es, dev, ronly) != 0) {
884		error = EINVAL;		/* XXX needs translation */
885		goto out;
886	}
887	if ((le16toh(es->e2fs_state) & E2FS_ISCLEAN) == 0 ||
888	    (le16toh(es->e2fs_state) & E2FS_ERRORS)) {
889		if (ronly || (mp->mnt_flag & MNT_FORCE)) {
890			printf(
891"WARNING: Filesystem was not properly dismounted\n");
892		} else {
893			printf(
894"WARNING: R/W mount denied.  Filesystem is not clean - run fsck\n");
895			error = EPERM;
896			goto out;
897		}
898	}
899	ump = malloc(sizeof(*ump), M_EXT2MNT, M_WAITOK | M_ZERO);
900
901	/*
902	 * I don't know whether this is the right strategy. Note that
903	 * we dynamically allocate both an m_ext2fs and an ext2fs
904	 * while Linux keeps the super block in a locked buffer.
905	 */
906	ump->um_e2fs = malloc(sizeof(struct m_ext2fs),
907	    M_EXT2MNT, M_WAITOK | M_ZERO);
908	ump->um_e2fs->e2fs = malloc(sizeof(struct ext2fs),
909	    M_EXT2MNT, M_WAITOK);
910	mtx_init(EXT2_MTX(ump), "EXT2FS", "EXT2FS Lock", MTX_DEF);
911	bcopy(es, ump->um_e2fs->e2fs, (u_int)sizeof(struct ext2fs));
912	if ((error = ext2_compute_sb_data(devvp, ump->um_e2fs->e2fs, ump->um_e2fs)))
913		goto out;
914
915	/*
916	 * Calculate the maximum contiguous blocks and size of cluster summary
917	 * array.  In FFS this is done by newfs; however, the superblock
918	 * in ext2fs doesn't have these variables, so we can calculate
919	 * them here.
920	 */
921	e2fs_maxcontig = MAX(1, maxphys / ump->um_e2fs->e2fs_bsize);
922	ump->um_e2fs->e2fs_contigsumsize = MIN(e2fs_maxcontig, EXT2_MAXCONTIG);
923	ump->um_e2fs->e2fs_maxsymlinklen = EXT2_MAXSYMLINKLEN;
924	if (ump->um_e2fs->e2fs_contigsumsize > 0) {
925		size = ump->um_e2fs->e2fs_gcount * sizeof(int32_t);
926		ump->um_e2fs->e2fs_maxcluster = malloc(size, M_EXT2MNT, M_WAITOK);
927		size = ump->um_e2fs->e2fs_gcount * sizeof(struct csum);
928		ump->um_e2fs->e2fs_clustersum = malloc(size, M_EXT2MNT, M_WAITOK);
929		lp = ump->um_e2fs->e2fs_maxcluster;
930		sump = ump->um_e2fs->e2fs_clustersum;
931		for (i = 0; i < ump->um_e2fs->e2fs_gcount; i++, sump++) {
932			*lp++ = ump->um_e2fs->e2fs_contigsumsize;
933			sump->cs_init = 0;
934			sump->cs_sum = malloc((ump->um_e2fs->e2fs_contigsumsize + 1) *
935			    sizeof(int32_t), M_EXT2MNT, M_WAITOK | M_ZERO);
936		}
937	}
938
939	brelse(bp);
940	bp = NULL;
941	fs = ump->um_e2fs;
942	fs->e2fs_ronly = ronly;	/* ronly is set according to mnt_flags */
943
944	/*
945	 * If the fs is not mounted read-only, make sure the super block is
946	 * always written back on a sync().
947	 */
948	fs->e2fs_wasvalid = le16toh(fs->e2fs->e2fs_state) & E2FS_ISCLEAN ? 1 : 0;
949	if (ronly == 0) {
950		fs->e2fs_fmod = 1;	/* mark it modified and set fs invalid */
951		fs->e2fs->e2fs_state =
952		    htole16(le16toh(fs->e2fs->e2fs_state) & ~E2FS_ISCLEAN);
953	}
954	mp->mnt_data = ump;
955	mp->mnt_stat.f_fsid.val[0] = dev2udev(dev);
956	mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum;
957	MNT_ILOCK(mp);
958	mp->mnt_flag |= MNT_LOCAL;
959	MNT_IUNLOCK(mp);
960	ump->um_mountp = mp;
961	ump->um_dev = dev;
962	ump->um_devvp = devvp;
963	ump->um_bo = &devvp->v_bufobj;
964	ump->um_cp = cp;
965
966	/*
967	 * Setting those two parameters allowed us to use
968	 * ufs_bmap w/o changse!
969	 */
970	ump->um_nindir = EXT2_ADDR_PER_BLOCK(fs);
971	ump->um_bptrtodb = le32toh(fs->e2fs->e2fs_log_bsize) + 1;
972	ump->um_seqinc = EXT2_FRAGS_PER_BLOCK(fs);
973	if (ronly == 0)
974		ext2_sbupdate(ump, MNT_WAIT);
975	/*
976	 * Initialize filesystem stat information in mount struct.
977	 */
978	MNT_ILOCK(mp);
979	mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED |
980	    MNTK_USES_BCACHE;
981	MNT_IUNLOCK(mp);
982	return (0);
983out:
984	if (bp)
985		brelse(bp);
986	if (cp != NULL) {
987		g_topology_lock();
988		g_vfs_close(cp);
989		g_topology_unlock();
990	}
991	if (ump) {
992		mtx_destroy(EXT2_MTX(ump));
993		free(ump->um_e2fs->e2fs_gd, M_EXT2MNT);
994		free(ump->um_e2fs->e2fs_contigdirs, M_EXT2MNT);
995		free(ump->um_e2fs->e2fs, M_EXT2MNT);
996		free(ump->um_e2fs, M_EXT2MNT);
997		free(ump, M_EXT2MNT);
998		mp->mnt_data = NULL;
999	}
1000	return (error);
1001}
1002
1003/*
1004 * Unmount system call.
1005 */
1006static int
1007ext2_unmount(struct mount *mp, int mntflags)
1008{
1009	struct ext2mount *ump;
1010	struct m_ext2fs *fs;
1011	struct csum *sump;
1012	int error, flags, i, ronly;
1013
1014	flags = 0;
1015	if (mntflags & MNT_FORCE) {
1016		if (mp->mnt_flag & MNT_ROOTFS)
1017			return (EINVAL);
1018		flags |= FORCECLOSE;
1019	}
1020	if ((error = ext2_flushfiles(mp, flags, curthread)) != 0)
1021		return (error);
1022	ump = VFSTOEXT2(mp);
1023	fs = ump->um_e2fs;
1024	ronly = fs->e2fs_ronly;
1025	if (ronly == 0 && ext2_cgupdate(ump, MNT_WAIT) == 0) {
1026		if (fs->e2fs_wasvalid)
1027			fs->e2fs->e2fs_state =
1028			    htole16(le16toh(fs->e2fs->e2fs_state) | E2FS_ISCLEAN);
1029		ext2_sbupdate(ump, MNT_WAIT);
1030	}
1031
1032	g_topology_lock();
1033	g_vfs_close(ump->um_cp);
1034	g_topology_unlock();
1035	vrele(ump->um_devvp);
1036	sump = fs->e2fs_clustersum;
1037	for (i = 0; i < fs->e2fs_gcount; i++, sump++)
1038		free(sump->cs_sum, M_EXT2MNT);
1039	free(fs->e2fs_clustersum, M_EXT2MNT);
1040	free(fs->e2fs_maxcluster, M_EXT2MNT);
1041	free(fs->e2fs_gd, M_EXT2MNT);
1042	free(fs->e2fs_contigdirs, M_EXT2MNT);
1043	free(fs->e2fs, M_EXT2MNT);
1044	free(fs, M_EXT2MNT);
1045	free(ump, M_EXT2MNT);
1046	mp->mnt_data = NULL;
1047	return (error);
1048}
1049
/*
 * Flush out all the files in a filesystem.
 *
 * Hands the mount straight to vflush(), passing 0 as its second argument
 * and the caller's flags and thread unchanged, and returns its result.
 */
static int
ext2_flushfiles(struct mount *mp, int flags, struct thread *td)
{

	return (vflush(mp, 0, flags, td));
}
1061
1062/*
1063 * Get filesystem statistics.
1064 */
1065int
1066ext2_statfs(struct mount *mp, struct statfs *sbp)
1067{
1068	struct ext2mount *ump;
1069	struct m_ext2fs *fs;
1070	uint32_t overhead, overhead_per_group, ngdb;
1071	int i, ngroups;
1072
1073	ump = VFSTOEXT2(mp);
1074	fs = ump->um_e2fs;
1075	if (le16toh(fs->e2fs->e2fs_magic) != E2FS_MAGIC)
1076		panic("ext2_statfs");
1077
1078	/*
1079	 * Compute the overhead (FS structures)
1080	 */
1081	overhead_per_group =
1082	    1 /* block bitmap */ +
1083	    1 /* inode bitmap */ +
1084	    fs->e2fs_itpg;
1085	overhead = le32toh(fs->e2fs->e2fs_first_dblock) +
1086	    fs->e2fs_gcount * overhead_per_group;
1087	if (le32toh(fs->e2fs->e2fs_rev) > E2FS_REV0 &&
1088	    le32toh(fs->e2fs->e2fs_features_rocompat) & EXT2F_ROCOMPAT_SPARSESUPER) {
1089		for (i = 0, ngroups = 0; i < fs->e2fs_gcount; i++) {
1090			if (ext2_cg_has_sb(fs, i))
1091				ngroups++;
1092		}
1093	} else {
1094		ngroups = fs->e2fs_gcount;
1095	}
1096	ngdb = fs->e2fs_gdbcount;
1097	if (le32toh(fs->e2fs->e2fs_rev) > E2FS_REV0 &&
1098	    le32toh(fs->e2fs->e2fs_features_compat) & EXT2F_COMPAT_RESIZE)
1099		ngdb += le16toh(fs->e2fs->e2fs_reserved_ngdb);
1100	overhead += ngroups * (1 /* superblock */ + ngdb);
1101
1102	sbp->f_bsize = EXT2_FRAG_SIZE(fs);
1103	sbp->f_iosize = EXT2_BLOCK_SIZE(fs);
1104	sbp->f_blocks = fs->e2fs_bcount - overhead;
1105	sbp->f_bfree = fs->e2fs_fbcount;
1106	sbp->f_bavail = sbp->f_bfree - fs->e2fs_rbcount;
1107	sbp->f_files = le32toh(fs->e2fs->e2fs_icount);
1108	sbp->f_ffree = fs->e2fs_ficount;
1109	return (0);
1110}
1111
1112/*
1113 * Go through the disk queues to initiate sandbagged IO;
1114 * go through the inodes to write those that have been modified;
1115 * initiate the writing of the super block if it has been modified.
1116 *
1117 * Note: we are always called with the filesystem marked `MPBUSY'.
1118 */
1119static int
1120ext2_sync(struct mount *mp, int waitfor)
1121{
1122	struct vnode *mvp, *vp;
1123	struct thread *td;
1124	struct inode *ip;
1125	struct ext2mount *ump = VFSTOEXT2(mp);
1126	struct m_ext2fs *fs;
1127	int error, allerror = 0;
1128
1129	td = curthread;
1130	fs = ump->um_e2fs;
1131	if (fs->e2fs_fmod != 0 && fs->e2fs_ronly != 0) {		/* XXX */
1132		panic("ext2_sync: rofs mod fs=%s", fs->e2fs_fsmnt);
1133	}
1134
1135	/*
1136	 * Write back each (modified) inode.
1137	 */
1138loop:
1139	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
1140		if (vp->v_type == VNON) {
1141			VI_UNLOCK(vp);
1142			continue;
1143		}
1144		ip = VTOI(vp);
1145		if ((ip->i_flag &
1146		    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
1147		    (vp->v_bufobj.bo_dirty.bv_cnt == 0 ||
1148		    waitfor == MNT_LAZY)) {
1149			VI_UNLOCK(vp);
1150			continue;
1151		}
1152		error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK);
1153		if (error) {
1154			if (error == ENOENT) {
1155				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
1156				goto loop;
1157			}
1158			continue;
1159		}
1160		if ((error = VOP_FSYNC(vp, waitfor, td)) != 0)
1161			allerror = error;
1162		vput(vp);
1163	}
1164
1165	/*
1166	 * Force stale filesystem control information to be flushed.
1167	 */
1168	if (waitfor != MNT_LAZY) {
1169		vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
1170		if ((error = VOP_FSYNC(ump->um_devvp, waitfor, td)) != 0)
1171			allerror = error;
1172		VOP_UNLOCK(ump->um_devvp);
1173	}
1174
1175	/*
1176	 * Write back modified superblock.
1177	 */
1178	if (fs->e2fs_fmod != 0) {
1179		fs->e2fs_fmod = 0;
1180		fs->e2fs->e2fs_wtime = htole32(time_second);
1181		if ((error = ext2_cgupdate(ump, waitfor)) != 0)
1182			allerror = error;
1183	}
1184	return (allerror);
1185}
1186
1187/*
1188 * Look up an EXT2FS dinode number to find its incore vnode, otherwise read it
1189 * in from disk.  If it is in core, wait for the lock bit to clear, then
1190 * return the inode locked.  Detection and handling of mount points must be
1191 * done by the calling routine.
1192 */
1193static int
1194ext2_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp)
1195{
1196	struct m_ext2fs *fs;
1197	struct inode *ip;
1198	struct ext2mount *ump;
1199	struct buf *bp;
1200	struct vnode *vp;
1201	struct thread *td;
1202	unsigned int i, used_blocks;
1203	int error;
1204
1205	td = curthread;
1206	error = vfs_hash_get(mp, ino, flags, td, vpp, NULL, NULL);
1207	if (error || *vpp != NULL)
1208		return (error);
1209
1210	ump = VFSTOEXT2(mp);
1211	ip = malloc(sizeof(struct inode), M_EXT2NODE, M_WAITOK | M_ZERO);
1212
1213	/* Allocate a new vnode/inode. */
1214	if ((error = getnewvnode("ext2fs", mp, &ext2_vnodeops, &vp)) != 0) {
1215		*vpp = NULL;
1216		free(ip, M_EXT2NODE);
1217		return (error);
1218	}
1219	vp->v_data = ip;
1220	ip->i_vnode = vp;
1221	ip->i_e2fs = fs = ump->um_e2fs;
1222	ip->i_ump = ump;
1223	ip->i_number = ino;
1224	cluster_init_vn(&ip->i_clusterw);
1225
1226	lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
1227	error = insmntque(vp, mp);
1228	if (error != 0) {
1229		free(ip, M_EXT2NODE);
1230		*vpp = NULL;
1231		return (error);
1232	}
1233	error = vfs_hash_insert(vp, ino, flags, td, vpp, NULL, NULL);
1234	if (error || *vpp != NULL)
1235		return (error);
1236
1237	/* Read in the disk contents for the inode, copy into the inode. */
1238	if ((error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)),
1239	    (int)fs->e2fs_bsize, NOCRED, &bp)) != 0) {
1240		/*
1241		 * The inode does not contain anything useful, so it would
1242		 * be misleading to leave it on its hash chain. With mode
1243		 * still zero, it will be unlinked and returned to the free
1244		 * list by vput().
1245		 */
1246		brelse(bp);
1247		vput(vp);
1248		*vpp = NULL;
1249		return (error);
1250	}
1251	/* convert ext2 inode to dinode */
1252	error = ext2_ei2i((struct ext2fs_dinode *)((char *)bp->b_data +
1253	    EXT2_INODE_SIZE(fs) * ino_to_fsbo(fs, ino)), ip);
1254	if (error) {
1255		brelse(bp);
1256		vput(vp);
1257		*vpp = NULL;
1258		return (error);
1259	}
1260	ip->i_block_group = ino_to_cg(fs, ino);
1261	ip->i_next_alloc_block = 0;
1262	ip->i_next_alloc_goal = 0;
1263
1264	/*
1265	 * Now we want to make sure that block pointers for unused
1266	 * blocks are zeroed out - ext2_balloc depends on this
1267	 * although for regular files and directories only
1268	 *
1269	 * If IN_E4EXTENTS is enabled, unused blocks are not zeroed
1270	 * out because we could corrupt the extent tree.
1271	 */
1272	if (!(ip->i_flag & IN_E4EXTENTS) &&
1273	    (S_ISDIR(ip->i_mode) || S_ISREG(ip->i_mode))) {
1274		used_blocks = howmany(ip->i_size, fs->e2fs_bsize);
1275		for (i = used_blocks; i < EXT2_NDIR_BLOCKS; i++)
1276			ip->i_db[i] = 0;
1277	}
1278
1279	bqrelse(bp);
1280
1281#ifdef EXT2FS_PRINT_EXTENTS
1282	ext2_print_inode(ip);
1283	error = ext4_ext_walk(ip);
1284	if (error) {
1285		vput(vp);
1286		*vpp = NULL;
1287		return (error);
1288	}
1289#endif
1290
1291	/*
1292	 * Initialize the vnode from the inode, check for aliases.
1293	 * Note that the underlying vnode may have changed.
1294	 */
1295	if ((error = ext2_vinit(mp, &ext2_fifoops, &vp)) != 0) {
1296		vput(vp);
1297		*vpp = NULL;
1298		return (error);
1299	}
1300
1301	/*
1302	 * Finish inode initialization.
1303	 */
1304
1305	vn_set_state(vp, VSTATE_CONSTRUCTED);
1306	*vpp = vp;
1307	return (0);
1308}
1309
1310/*
1311 * File handle to vnode
1312 *
1313 * Have to be really careful about stale file handles:
1314 * - check that the inode number is valid
1315 * - call ext2_vget() to get the locked inode
1316 * - check for an unallocated inode (i_mode == 0)
1317 * - check that the given client host has export rights and return
1318 *   those rights via. exflagsp and credanonp
1319 */
1320static int
1321ext2_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp)
1322{
1323	struct inode *ip;
1324	struct ufid *ufhp;
1325	struct vnode *nvp;
1326	struct m_ext2fs *fs;
1327	int error;
1328
1329	ufhp = (struct ufid *)fhp;
1330	fs = VFSTOEXT2(mp)->um_e2fs;
1331	if (ufhp->ufid_ino < EXT2_ROOTINO ||
1332	    ufhp->ufid_ino > fs->e2fs_gcount * fs->e2fs_ipg)
1333		return (ESTALE);
1334
1335	error = VFS_VGET(mp, ufhp->ufid_ino, LK_EXCLUSIVE, &nvp);
1336	if (error) {
1337		*vpp = NULLVP;
1338		return (error);
1339	}
1340	ip = VTOI(nvp);
1341	if (ip->i_mode == 0 ||
1342	    ip->i_gen != ufhp->ufid_gen || ip->i_nlink <= 0) {
1343		vput(nvp);
1344		*vpp = NULLVP;
1345		return (ESTALE);
1346	}
1347	*vpp = nvp;
1348	vnode_create_vobject(*vpp, ip->i_size, curthread);
1349	return (0);
1350}
1351
1352/*
1353 * Write a superblock and associated information back to disk.
1354 */
1355static int
1356ext2_sbupdate(struct ext2mount *mp, int waitfor)
1357{
1358	struct m_ext2fs *fs = mp->um_e2fs;
1359	struct ext2fs *es = fs->e2fs;
1360	struct buf *bp;
1361	int error = 0;
1362
1363	es->e2fs_bcount = htole32(fs->e2fs_bcount & 0xffffffff);
1364	es->e2fs_rbcount = htole32(fs->e2fs_rbcount & 0xffffffff);
1365	es->e2fs_fbcount = htole32(fs->e2fs_fbcount & 0xffffffff);
1366	if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) {
1367		es->e4fs_bcount_hi = htole32(fs->e2fs_bcount >> 32);
1368		es->e4fs_rbcount_hi = htole32(fs->e2fs_rbcount >> 32);
1369		es->e4fs_fbcount_hi = htole32(fs->e2fs_fbcount >> 32);
1370	}
1371
1372	es->e2fs_ficount = htole32(fs->e2fs_ficount);
1373
1374	if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM))
1375		ext2_sb_csum_set(fs);
1376
1377	error = bread(mp->um_devvp, SBLOCK, SBLOCKBLKSIZE, NOCRED, &bp);
1378	if (error != 0)
1379		return (error);
1380
1381	memcpy((char *)bp->b_data + SBLOCKOFFSET, (caddr_t)es,
1382	    (u_int)sizeof(struct ext2fs));
1383	if (waitfor == MNT_WAIT)
1384		error = bwrite(bp);
1385	else
1386		bawrite(bp);
1387
1388	/*
1389	 * The buffers for group descriptors, inode bitmaps and block bitmaps
1390	 * are not busy at this point and are (hopefully) written by the
1391	 * usual sync mechanism. No need to write them here.
1392	 */
1393	return (error);
1394}
1395int
1396ext2_cgupdate(struct ext2mount *mp, int waitfor)
1397{
1398	struct m_ext2fs *fs = mp->um_e2fs;
1399	struct buf *bp;
1400	int i, j, g_count = 0, error = 0, allerror = 0;
1401
1402	allerror = ext2_sbupdate(mp, waitfor);
1403
1404	/* Update gd csums */
1405	if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_GDT_CSUM) ||
1406	    EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM))
1407		ext2_gd_csum_set(fs);
1408
1409	for (i = 0; i < fs->e2fs_gdbcount; i++) {
1410		bp = getblk(mp->um_devvp, fsbtodb(fs,
1411		    ext2_cg_location(fs, i)),
1412		    fs->e2fs_bsize, 0, 0, 0);
1413		if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) {
1414			memcpy(bp->b_data, &fs->e2fs_gd[
1415			    i * fs->e2fs_bsize / sizeof(struct ext2_gd)],
1416			    fs->e2fs_bsize);
1417		} else {
1418			for (j = 0; j < fs->e2fs_bsize / E2FS_REV0_GD_SIZE &&
1419			    g_count < fs->e2fs_gcount; j++, g_count++)
1420				memcpy(bp->b_data + j * E2FS_REV0_GD_SIZE,
1421				    &fs->e2fs_gd[g_count], E2FS_REV0_GD_SIZE);
1422		}
1423		if (waitfor == MNT_WAIT)
1424			error = bwrite(bp);
1425		else
1426			bawrite(bp);
1427	}
1428
1429	if (!allerror && error)
1430		allerror = error;
1431	return (allerror);
1432}
1433
1434/*
1435 * Return the root of a filesystem.
1436 */
1437static int
1438ext2_root(struct mount *mp, int flags, struct vnode **vpp)
1439{
1440	struct vnode *nvp;
1441	int error;
1442
1443	error = VFS_VGET(mp, EXT2_ROOTINO, LK_EXCLUSIVE, &nvp);
1444	if (error)
1445		return (error);
1446	*vpp = nvp;
1447	return (0);
1448}
1449