ffs_vfsops.c revision 158095
1/*-
2 * Copyright (c) 1989, 1991, 1993, 1994
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	@(#)ffs_vfsops.c	8.31 (Berkeley) 5/20/95
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_vfsops.c 158095 2006-04-28 01:05:31Z jeff $");
34
35#include "opt_mac.h"
36#include "opt_quota.h"
37#include "opt_ufs.h"
38#include "opt_ffs.h"
39
40#include <sys/param.h>
41#include <sys/systm.h>
42#include <sys/namei.h>
43#include <sys/proc.h>
44#include <sys/kernel.h>
45#include <sys/mac.h>
46#include <sys/vnode.h>
47#include <sys/mount.h>
48#include <sys/bio.h>
49#include <sys/buf.h>
50#include <sys/conf.h>
51#include <sys/fcntl.h>
52#include <sys/malloc.h>
53#include <sys/mutex.h>
54
55#include <ufs/ufs/extattr.h>
56#include <ufs/ufs/quota.h>
57#include <ufs/ufs/ufsmount.h>
58#include <ufs/ufs/inode.h>
59#include <ufs/ufs/ufs_extern.h>
60
61#include <ufs/ffs/fs.h>
62#include <ufs/ffs/ffs_extern.h>
63
64#include <vm/vm.h>
65#include <vm/uma.h>
66#include <vm/vm_page.h>
67
68#include <geom/geom.h>
69#include <geom/geom_vfs.h>
70
71static uma_zone_t uma_inode, uma_ufs1, uma_ufs2;
72
73static int	ffs_sbupdate(struct ufsmount *, int, int);
74static int	ffs_reload(struct mount *, struct thread *);
75static int	ffs_mountfs(struct vnode *, struct mount *, struct thread *);
76static void	ffs_oldfscompat_read(struct fs *, struct ufsmount *,
77		    ufs2_daddr_t);
78static void	ffs_oldfscompat_write(struct fs *, struct ufsmount *);
79static void	ffs_ifree(struct ufsmount *ump, struct inode *ip);
80static vfs_init_t ffs_init;
81static vfs_uninit_t ffs_uninit;
82static vfs_extattrctl_t ffs_extattrctl;
83static vfs_cmount_t ffs_cmount;
84static vfs_unmount_t ffs_unmount;
85static vfs_mount_t ffs_mount;
86static vfs_statfs_t ffs_statfs;
87static vfs_fhtovp_t ffs_fhtovp;
88static vfs_vptofh_t ffs_vptofh;
89static vfs_sync_t ffs_sync;
90
91static struct vfsops ufs_vfsops = {
92	.vfs_extattrctl =	ffs_extattrctl,
93	.vfs_fhtovp =		ffs_fhtovp,
94	.vfs_init =		ffs_init,
95	.vfs_mount =		ffs_mount,
96	.vfs_cmount =		ffs_cmount,
97	.vfs_quotactl =		ufs_quotactl,
98	.vfs_root =		ufs_root,
99	.vfs_statfs =		ffs_statfs,
100	.vfs_sync =		ffs_sync,
101	.vfs_uninit =		ffs_uninit,
102	.vfs_unmount =		ffs_unmount,
103	.vfs_vget =		ffs_vget,
104	.vfs_vptofh =		ffs_vptofh,
105};
106
107VFS_SET(ufs_vfsops, ufs, 0);
108
109static b_strategy_t ffs_geom_strategy;
110static b_write_t ffs_bufwrite;
111
112static struct buf_ops ffs_ops = {
113	.bop_name =	"FFS",
114	.bop_write =	ffs_bufwrite,
115	.bop_strategy =	ffs_geom_strategy,
116	.bop_sync =	bufsync,
117};
118
119static const char *ffs_opts[] = { "acls", "async", "atime", "clusterr",
120    "clusterw", "exec", "errmsg", "export", "force", "from", "multilabel",
121    "snapshot", "suid", "suiddir", "symfollow", "sync",
122    "update", "union", NULL };
123
124static int
125ffs_mount(struct mount *mp, struct thread *td)
126{
127	struct vnode *devvp;
128	struct ufsmount *ump = 0;
129	struct fs *fs;
130	int error, flags;
131	mode_t accessmode;
132	struct nameidata ndp;
133	struct export_args export;
134	char *fspec;
135
136	if (vfs_filteropt(mp->mnt_optnew, ffs_opts))
137		return (EINVAL);
138	if (uma_inode == NULL) {
139		uma_inode = uma_zcreate("FFS inode",
140		    sizeof(struct inode), NULL, NULL, NULL, NULL,
141		    UMA_ALIGN_PTR, 0);
142		uma_ufs1 = uma_zcreate("FFS1 dinode",
143		    sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL,
144		    UMA_ALIGN_PTR, 0);
145		uma_ufs2 = uma_zcreate("FFS2 dinode",
146		    sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL,
147		    UMA_ALIGN_PTR, 0);
148	}
149
150	fspec = vfs_getopts(mp->mnt_optnew, "from", &error);
151	if (error)
152		return (error);
153
154	if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0)
155		mp->mnt_flag |= MNT_ACLS;
156
157	if (vfs_getopt(mp->mnt_optnew, "async", NULL, NULL) == 0)
158		mp->mnt_flag |= MNT_ASYNC;
159
160	if (vfs_getopt(mp->mnt_optnew, "force", NULL, NULL) == 0)
161		mp->mnt_flag |= MNT_FORCE;
162
163	if (vfs_getopt(mp->mnt_optnew, "multilabel", NULL, NULL) == 0)
164		mp->mnt_flag |= MNT_MULTILABEL;
165
166	if (vfs_getopt(mp->mnt_optnew, "noasync", NULL, NULL) == 0)
167		mp->mnt_flag &= ~MNT_ASYNC;
168
169	if (vfs_getopt(mp->mnt_optnew, "noatime", NULL, NULL) == 0)
170		mp->mnt_flag |= MNT_NOATIME;
171
172	if (vfs_getopt(mp->mnt_optnew, "noclusterr", NULL, NULL) == 0)
173		mp->mnt_flag |= MNT_NOCLUSTERR;
174
175	if (vfs_getopt(mp->mnt_optnew, "noclusterw", NULL, NULL) == 0)
176		mp->mnt_flag |= MNT_NOCLUSTERW;
177
178	if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0)
179		mp->mnt_flag |= MNT_SNAPSHOT;
180
181	if (vfs_getopt(mp->mnt_optnew, "update", NULL, NULL) == 0)
182		mp->mnt_flag |= MNT_UPDATE;
183
184	export.ex_root = -2; /* DEFAULT_ROOTID */
185
186	if (mp->mnt_flag & MNT_RDONLY)
187		export.ex_flags = MNT_EXRDONLY;
188	else
189		export.ex_flags = 0;
190
191	/*
192	 * If updating, check whether changing from read-only to
193	 * read/write; if there is no device name, that's all we do.
194	 */
195	if (mp->mnt_flag & MNT_UPDATE) {
196		ump = VFSTOUFS(mp);
197		fs = ump->um_fs;
198		devvp = ump->um_devvp;
199		if (fs->fs_ronly == 0 &&
200		    vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
201			if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
202				return (error);
203			/*
204			 * Flush any dirty data.
205			 */
206			if ((error = ffs_sync(mp, MNT_WAIT, td)) != 0) {
207				vn_finished_write(mp);
208				return (error);
209			}
210			/*
211			 * Check for and optionally get rid of files open
212			 * for writing.
213			 */
214			flags = WRITECLOSE;
215			if (mp->mnt_flag & MNT_FORCE)
216				flags |= FORCECLOSE;
217			if (mp->mnt_flag & MNT_SOFTDEP) {
218				error = softdep_flushfiles(mp, flags, td);
219			} else {
220				error = ffs_flushfiles(mp, flags, td);
221			}
222			if (error) {
223				vn_finished_write(mp);
224				return (error);
225			}
226			if (fs->fs_pendingblocks != 0 ||
227			    fs->fs_pendinginodes != 0) {
228				printf("%s: %s: blocks %jd files %d\n",
229				    fs->fs_fsmnt, "update error",
230				    (intmax_t)fs->fs_pendingblocks,
231				    fs->fs_pendinginodes);
232				fs->fs_pendingblocks = 0;
233				fs->fs_pendinginodes = 0;
234			}
235			if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
236				fs->fs_clean = 1;
237			if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
238				fs->fs_ronly = 0;
239				fs->fs_clean = 0;
240				vn_finished_write(mp);
241				return (error);
242			}
243			vn_finished_write(mp);
244			DROP_GIANT();
245			g_topology_lock();
246			g_access(ump->um_cp, 0, -1, 0);
247			g_topology_unlock();
248			PICKUP_GIANT();
249			fs->fs_ronly = 1;
250			mp->mnt_flag |= MNT_RDONLY;
251		}
252		if ((mp->mnt_flag & MNT_RELOAD) &&
253		    (error = ffs_reload(mp, td)) != 0)
254			return (error);
255		if (fs->fs_ronly &&
256		    !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
257			/*
258			 * If upgrade to read-write by non-root, then verify
259			 * that user has necessary permissions on the device.
260			 */
261			if (suser(td)) {
262				vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
263				if ((error = VOP_ACCESS(devvp, VREAD | VWRITE,
264				    td->td_ucred, td)) != 0) {
265					VOP_UNLOCK(devvp, 0, td);
266					return (error);
267				}
268				VOP_UNLOCK(devvp, 0, td);
269			}
270			fs->fs_flags &= ~FS_UNCLEAN;
271			if (fs->fs_clean == 0) {
272				fs->fs_flags |= FS_UNCLEAN;
273				if ((mp->mnt_flag & MNT_FORCE) ||
274				    ((fs->fs_flags & FS_NEEDSFSCK) == 0 &&
275				     (fs->fs_flags & FS_DOSOFTDEP))) {
276					printf("WARNING: %s was not %s\n",
277					   fs->fs_fsmnt, "properly dismounted");
278				} else {
279					printf(
280"WARNING: R/W mount of %s denied.  Filesystem is not clean - run fsck\n",
281					    fs->fs_fsmnt);
282					return (EPERM);
283				}
284			}
285			DROP_GIANT();
286			g_topology_lock();
287			/*
288			 * If we're the root device, we may not have an E count
289			 * yet, get it now.
290			 */
291			if (ump->um_cp->ace == 0)
292				error = g_access(ump->um_cp, 0, 1, 1);
293			else
294				error = g_access(ump->um_cp, 0, 1, 0);
295			g_topology_unlock();
296			PICKUP_GIANT();
297			if (error)
298				return (error);
299			if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
300				return (error);
301			fs->fs_ronly = 0;
302			mp->mnt_flag &= ~MNT_RDONLY;
303			fs->fs_clean = 0;
304			if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
305				vn_finished_write(mp);
306				return (error);
307			}
308			/* check to see if we need to start softdep */
309			if ((fs->fs_flags & FS_DOSOFTDEP) &&
310			    (error = softdep_mount(devvp, mp, fs, td->td_ucred))){
311				vn_finished_write(mp);
312				return (error);
313			}
314			if (fs->fs_snapinum[0] != 0)
315				ffs_snapshot_mount(mp);
316			vn_finished_write(mp);
317		}
318		/*
319		 * Soft updates is incompatible with "async",
320		 * so if we are doing softupdates stop the user
321		 * from setting the async flag in an update.
322		 * Softdep_mount() clears it in an initial mount
323		 * or ro->rw remount.
324		 */
325		if (mp->mnt_flag & MNT_SOFTDEP)
326			mp->mnt_flag &= ~MNT_ASYNC;
327		/*
328		 * Keep MNT_ACLS flag if it is stored in superblock.
329		 */
330		if ((fs->fs_flags & FS_ACLS) != 0)
331			mp->mnt_flag |= MNT_ACLS;
332		/*
333		 * If not updating name, process export requests.
334		 */
335		error = 0;
336		if (vfs_getopt(mp->mnt_optnew, "export", NULL, NULL) == 0) {
337			error = vfs_copyopt(mp->mnt_optnew, "export",
338			    &export, sizeof export);
339		}
340
341		if (error == 0 && export.ex_flags != 0)
342			return (vfs_export(mp, &export));
343		/*
344		 * If this is a snapshot request, take the snapshot.
345		 */
346		if (mp->mnt_flag & MNT_SNAPSHOT)
347			return (ffs_snapshot(mp, fspec));
348	}
349
350	/*
351	 * Not an update, or updating the name: look up the name
352	 * and verify that it refers to a sensible disk device.
353	 */
354	NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td);
355	if ((error = namei(&ndp)) != 0)
356		return (error);
357	NDFREE(&ndp, NDF_ONLY_PNBUF);
358	devvp = ndp.ni_vp;
359	if (!vn_isdisk(devvp, &error)) {
360		vput(devvp);
361		return (error);
362	}
363
364	/*
365	 * If mount by non-root, then verify that user has necessary
366	 * permissions on the device.
367	 */
368	if (suser(td)) {
369		accessmode = VREAD;
370		if ((mp->mnt_flag & MNT_RDONLY) == 0)
371			accessmode |= VWRITE;
372		if ((error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td))!= 0){
373			vput(devvp);
374			return (error);
375		}
376	}
377
378	if (mp->mnt_flag & MNT_UPDATE) {
379		/*
380		 * Update only
381		 *
382		 * If it's not the same vnode, or at least the same device
383		 * then it's not correct.
384		 */
385
386		if (devvp->v_rdev != ump->um_devvp->v_rdev)
387			error = EINVAL;	/* needs translation */
388		vput(devvp);
389		if (error)
390			return (error);
391	} else {
392		/*
393		 * New mount
394		 *
395		 * We need the name for the mount point (also used for
396		 * "last mounted on") copied in. If an error occurs,
397		 * the mount point is discarded by the upper level code.
398		 * Note that vfs_mount() populates f_mntonname for us.
399		 */
400		if ((error = ffs_mountfs(devvp, mp, td)) != 0) {
401			vrele(devvp);
402			return (error);
403		}
404	}
405	vfs_mountedfrom(mp, fspec);
406	return (0);
407}
408
409/*
410 * Compatibility with old mount system call.
411 */
412
413static int
414ffs_cmount(struct mntarg *ma, void *data, int flags, struct thread *td)
415{
416	struct ufs_args args;
417	int error;
418
419	if (data == NULL)
420		return (EINVAL);
421	error = copyin(data, &args, sizeof args);
422	if (error)
423		return (error);
424
425	ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN);
426	ma = mount_arg(ma, "export", &args.export, sizeof args.export);
427	error = kernel_mount(ma, flags);
428
429	return (error);
430}
431
432/*
433 * Reload all incore data for a filesystem (used after running fsck on
434 * the root filesystem and finding things to fix). The filesystem must
435 * be mounted read-only.
436 *
437 * Things to do to update the mount:
438 *	1) invalidate all cached meta-data.
439 *	2) re-read superblock from disk.
440 *	3) re-read summary information from disk.
441 *	4) invalidate all inactive vnodes.
442 *	5) invalidate all cached file data.
443 *	6) re-read inode data for all active vnodes.
444 */
445static int
446ffs_reload(struct mount *mp, struct thread *td)
447{
448	struct vnode *vp, *mvp, *devvp;
449	struct inode *ip;
450	void *space;
451	struct buf *bp;
452	struct fs *fs, *newfs;
453	struct ufsmount *ump;
454	ufs2_daddr_t sblockloc;
455	int i, blks, size, error;
456	int32_t *lp;
457
458	if ((mp->mnt_flag & MNT_RDONLY) == 0)
459		return (EINVAL);
460	ump = VFSTOUFS(mp);
461	/*
462	 * Step 1: invalidate all cached meta-data.
463	 */
464	devvp = VFSTOUFS(mp)->um_devvp;
465	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
466	if (vinvalbuf(devvp, 0, td, 0, 0) != 0)
467		panic("ffs_reload: dirty1");
468	VOP_UNLOCK(devvp, 0, td);
469
470	/*
471	 * Step 2: re-read superblock from disk.
472	 */
473	fs = VFSTOUFS(mp)->um_fs;
474	if ((error = bread(devvp, btodb(fs->fs_sblockloc), fs->fs_sbsize,
475	    NOCRED, &bp)) != 0)
476		return (error);
477	newfs = (struct fs *)bp->b_data;
478	if ((newfs->fs_magic != FS_UFS1_MAGIC &&
479	     newfs->fs_magic != FS_UFS2_MAGIC) ||
480	    newfs->fs_bsize > MAXBSIZE ||
481	    newfs->fs_bsize < sizeof(struct fs)) {
482			brelse(bp);
483			return (EIO);		/* XXX needs translation */
484	}
485	/*
486	 * Copy pointer fields back into superblock before copying in	XXX
487	 * new superblock. These should really be in the ufsmount.	XXX
488	 * Note that important parameters (eg fs_ncg) are unchanged.
489	 */
490	newfs->fs_csp = fs->fs_csp;
491	newfs->fs_maxcluster = fs->fs_maxcluster;
492	newfs->fs_contigdirs = fs->fs_contigdirs;
493	newfs->fs_active = fs->fs_active;
494	/* The file system is still read-only. */
495	newfs->fs_ronly = 1;
496	sblockloc = fs->fs_sblockloc;
497	bcopy(newfs, fs, (u_int)fs->fs_sbsize);
498	brelse(bp);
499	mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen;
500	ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc);
501	UFS_LOCK(ump);
502	if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
503		printf("%s: reload pending error: blocks %jd files %d\n",
504		    fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
505		    fs->fs_pendinginodes);
506		fs->fs_pendingblocks = 0;
507		fs->fs_pendinginodes = 0;
508	}
509	UFS_UNLOCK(ump);
510
511	/*
512	 * Step 3: re-read summary information from disk.
513	 */
514	blks = howmany(fs->fs_cssize, fs->fs_fsize);
515	space = fs->fs_csp;
516	for (i = 0; i < blks; i += fs->fs_frag) {
517		size = fs->fs_bsize;
518		if (i + fs->fs_frag > blks)
519			size = (blks - i) * fs->fs_fsize;
520		error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size,
521		    NOCRED, &bp);
522		if (error)
523			return (error);
524		bcopy(bp->b_data, space, (u_int)size);
525		space = (char *)space + size;
526		brelse(bp);
527	}
528	/*
529	 * We no longer know anything about clusters per cylinder group.
530	 */
531	if (fs->fs_contigsumsize > 0) {
532		lp = fs->fs_maxcluster;
533		for (i = 0; i < fs->fs_ncg; i++)
534			*lp++ = fs->fs_contigsumsize;
535	}
536
537loop:
538	MNT_ILOCK(mp);
539	MNT_VNODE_FOREACH(vp, mp, mvp) {
540		VI_LOCK(vp);
541		if (vp->v_iflag & VI_DOOMED) {
542			VI_UNLOCK(vp);
543			continue;
544		}
545		MNT_IUNLOCK(mp);
546		/*
547		 * Step 4: invalidate all cached file data.
548		 */
549		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
550			MNT_VNODE_FOREACH_ABORT(mp, mvp);
551			goto loop;
552		}
553		if (vinvalbuf(vp, 0, td, 0, 0))
554			panic("ffs_reload: dirty2");
555		/*
556		 * Step 5: re-read inode data for all active vnodes.
557		 */
558		ip = VTOI(vp);
559		error =
560		    bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
561		    (int)fs->fs_bsize, NOCRED, &bp);
562		if (error) {
563			VOP_UNLOCK(vp, 0, td);
564			vrele(vp);
565			MNT_VNODE_FOREACH_ABORT(mp, mvp);
566			return (error);
567		}
568		ffs_load_inode(bp, ip, fs, ip->i_number);
569		ip->i_effnlink = ip->i_nlink;
570		brelse(bp);
571		VOP_UNLOCK(vp, 0, td);
572		vrele(vp);
573		MNT_ILOCK(mp);
574	}
575	MNT_IUNLOCK(mp);
576	return (0);
577}
578
579/*
580 * Possible superblock locations ordered from most to least likely.
581 */
582static int sblock_try[] = SBLOCKSEARCH;
583
584/*
585 * Common code for mount and mountroot
586 */
587static int
588ffs_mountfs(devvp, mp, td)
589	struct vnode *devvp;
590	struct mount *mp;
591	struct thread *td;
592{
593	struct ufsmount *ump;
594	struct buf *bp;
595	struct fs *fs;
596	struct cdev *dev;
597	void *space;
598	ufs2_daddr_t sblockloc;
599	int error, i, blks, size, ronly;
600	int32_t *lp;
601	struct ucred *cred;
602	struct g_consumer *cp;
603	struct mount *nmp;
604
605	dev = devvp->v_rdev;
606	cred = td ? td->td_ucred : NOCRED;
607
608	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
609	DROP_GIANT();
610	g_topology_lock();
611	error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1);
612
613	/*
614	 * If we are a root mount, drop the E flag so fsck can do its magic.
615	 * We will pick it up again when we remount R/W.
616	 */
617	if (error == 0 && ronly && (mp->mnt_flag & MNT_ROOTFS))
618		error = g_access(cp, 0, 0, -1);
619	g_topology_unlock();
620	PICKUP_GIANT();
621	VOP_UNLOCK(devvp, 0, td);
622	if (error)
623		return (error);
624	if (devvp->v_rdev->si_iosize_max != 0)
625		mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
626	if (mp->mnt_iosize_max > MAXPHYS)
627		mp->mnt_iosize_max = MAXPHYS;
628
629	devvp->v_bufobj.bo_private = cp;
630	devvp->v_bufobj.bo_ops = &ffs_ops;
631
632	bp = NULL;
633	ump = NULL;
634	fs = NULL;
635	sblockloc = 0;
636	/*
637	 * Try reading the superblock in each of its possible locations.
638	 */
639	for (i = 0; sblock_try[i] != -1; i++) {
640		if ((error = bread(devvp, sblock_try[i] / DEV_BSIZE, SBLOCKSIZE,
641		    cred, &bp)) != 0)
642			goto out;
643		fs = (struct fs *)bp->b_data;
644		sblockloc = sblock_try[i];
645		if ((fs->fs_magic == FS_UFS1_MAGIC ||
646		     (fs->fs_magic == FS_UFS2_MAGIC &&
647		      (fs->fs_sblockloc == sblockloc ||
648		       (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0))) &&
649		    fs->fs_bsize <= MAXBSIZE &&
650		    fs->fs_bsize >= sizeof(struct fs))
651			break;
652		brelse(bp);
653		bp = NULL;
654	}
655	if (sblock_try[i] == -1) {
656		error = EINVAL;		/* XXX needs translation */
657		goto out;
658	}
659	fs->fs_fmod = 0;
660	fs->fs_flags &= ~FS_INDEXDIRS;	/* no support for directory indicies */
661	fs->fs_flags &= ~FS_UNCLEAN;
662	if (fs->fs_clean == 0) {
663		fs->fs_flags |= FS_UNCLEAN;
664		if (ronly || (mp->mnt_flag & MNT_FORCE) ||
665		    ((fs->fs_flags & FS_NEEDSFSCK) == 0 &&
666		     (fs->fs_flags & FS_DOSOFTDEP))) {
667			printf(
668"WARNING: %s was not properly dismounted\n",
669			    fs->fs_fsmnt);
670		} else {
671			printf(
672"WARNING: R/W mount of %s denied.  Filesystem is not clean - run fsck\n",
673			    fs->fs_fsmnt);
674			error = EPERM;
675			goto out;
676		}
677		if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) &&
678		    (mp->mnt_flag & MNT_FORCE)) {
679			printf("%s: lost blocks %jd files %d\n", fs->fs_fsmnt,
680			    (intmax_t)fs->fs_pendingblocks,
681			    fs->fs_pendinginodes);
682			fs->fs_pendingblocks = 0;
683			fs->fs_pendinginodes = 0;
684		}
685	}
686	if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
687		printf("%s: mount pending error: blocks %jd files %d\n",
688		    fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
689		    fs->fs_pendinginodes);
690		fs->fs_pendingblocks = 0;
691		fs->fs_pendinginodes = 0;
692	}
693	ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO);
694	ump->um_cp = cp;
695	ump->um_bo = &devvp->v_bufobj;
696	ump->um_fs = malloc((u_long)fs->fs_sbsize, M_UFSMNT, M_WAITOK);
697	if (fs->fs_magic == FS_UFS1_MAGIC) {
698		ump->um_fstype = UFS1;
699		ump->um_balloc = ffs_balloc_ufs1;
700	} else {
701		ump->um_fstype = UFS2;
702		ump->um_balloc = ffs_balloc_ufs2;
703	}
704	ump->um_blkatoff = ffs_blkatoff;
705	ump->um_truncate = ffs_truncate;
706	ump->um_update = ffs_update;
707	ump->um_valloc = ffs_valloc;
708	ump->um_vfree = ffs_vfree;
709	ump->um_ifree = ffs_ifree;
710	mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF);
711	bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize);
712	if (fs->fs_sbsize < SBLOCKSIZE)
713		bp->b_flags |= B_INVAL | B_NOCACHE;
714	brelse(bp);
715	bp = NULL;
716	fs = ump->um_fs;
717	ffs_oldfscompat_read(fs, ump, sblockloc);
718	fs->fs_ronly = ronly;
719	size = fs->fs_cssize;
720	blks = howmany(size, fs->fs_fsize);
721	if (fs->fs_contigsumsize > 0)
722		size += fs->fs_ncg * sizeof(int32_t);
723	size += fs->fs_ncg * sizeof(u_int8_t);
724	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
725	fs->fs_csp = space;
726	for (i = 0; i < blks; i += fs->fs_frag) {
727		size = fs->fs_bsize;
728		if (i + fs->fs_frag > blks)
729			size = (blks - i) * fs->fs_fsize;
730		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size,
731		    cred, &bp)) != 0) {
732			free(fs->fs_csp, M_UFSMNT);
733			goto out;
734		}
735		bcopy(bp->b_data, space, (u_int)size);
736		space = (char *)space + size;
737		brelse(bp);
738		bp = NULL;
739	}
740	if (fs->fs_contigsumsize > 0) {
741		fs->fs_maxcluster = lp = space;
742		for (i = 0; i < fs->fs_ncg; i++)
743			*lp++ = fs->fs_contigsumsize;
744		space = lp;
745	}
746	size = fs->fs_ncg * sizeof(u_int8_t);
747	fs->fs_contigdirs = (u_int8_t *)space;
748	bzero(fs->fs_contigdirs, size);
749	fs->fs_active = NULL;
750	mp->mnt_data = (qaddr_t)ump;
751	mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0];
752	mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1];
753	nmp = NULL;
754	if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 ||
755	    (nmp = vfs_getvfs(&mp->mnt_stat.f_fsid))) {
756		if (nmp)
757			vfs_rel(nmp);
758		vfs_getnewfsid(mp);
759	}
760	mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen;
761	mp->mnt_flag |= MNT_LOCAL;
762	if ((fs->fs_flags & FS_MULTILABEL) != 0)
763#ifdef MAC
764		mp->mnt_flag |= MNT_MULTILABEL;
765#else
766		printf(
767"WARNING: %s: multilabel flag on fs but no MAC support\n",
768		    fs->fs_fsmnt);
769#endif
770	if ((fs->fs_flags & FS_ACLS) != 0)
771#ifdef UFS_ACL
772		mp->mnt_flag |= MNT_ACLS;
773#else
774		printf(
775"WARNING: %s: ACLs flag on fs but no ACLs support\n",
776		    fs->fs_fsmnt);
777#endif
778	ump->um_mountp = mp;
779	ump->um_dev = dev;
780	ump->um_devvp = devvp;
781	ump->um_nindir = fs->fs_nindir;
782	ump->um_bptrtodb = fs->fs_fsbtodb;
783	ump->um_seqinc = fs->fs_frag;
784	for (i = 0; i < MAXQUOTAS; i++)
785		ump->um_quotas[i] = NULLVP;
786#ifdef UFS_EXTATTR
787	ufs_extattr_uepm_init(&ump->um_extattr);
788#endif
789	/*
790	 * Set FS local "last mounted on" information (NULL pad)
791	 */
792	bzero(fs->fs_fsmnt, MAXMNTLEN);
793	strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN);
794
795	if( mp->mnt_flag & MNT_ROOTFS) {
796		/*
797		 * Root mount; update timestamp in mount structure.
798		 * this will be used by the common root mount code
799		 * to update the system clock.
800		 */
801		mp->mnt_time = fs->fs_time;
802	}
803
804	if (ronly == 0) {
805		if ((fs->fs_flags & FS_DOSOFTDEP) &&
806		    (error = softdep_mount(devvp, mp, fs, cred)) != 0) {
807			free(fs->fs_csp, M_UFSMNT);
808			goto out;
809		}
810		if (fs->fs_snapinum[0] != 0)
811			ffs_snapshot_mount(mp);
812		fs->fs_fmod = 1;
813		fs->fs_clean = 0;
814		(void) ffs_sbupdate(ump, MNT_WAIT, 0);
815	}
816	/*
817	 * Initialize filesystem stat information in mount struct.
818	 */
819#ifdef UFS_EXTATTR
820#ifdef UFS_EXTATTR_AUTOSTART
821	/*
822	 *
823	 * Auto-starting does the following:
824	 *	- check for /.attribute in the fs, and extattr_start if so
825	 *	- for each file in .attribute, enable that file with
826	 * 	  an attribute of the same name.
827	 * Not clear how to report errors -- probably eat them.
828	 * This would all happen while the filesystem was busy/not
829	 * available, so would effectively be "atomic".
830	 */
831	(void) ufs_extattr_autostart(mp, td);
832#endif /* !UFS_EXTATTR_AUTOSTART */
833#endif /* !UFS_EXTATTR */
834#ifdef QUOTA
835	/*
836	 * Our bufobj must require giant for snapshots when quotas are
837	 * enabled.
838	 */
839	devvp->v_bufobj.bo_flags |= BO_NEEDSGIANT;
840#else
841	mp->mnt_kern_flag |= MNTK_MPSAFE;
842#endif
843	return (0);
844out:
845	if (bp)
846		brelse(bp);
847	if (cp != NULL) {
848		DROP_GIANT();
849		g_topology_lock();
850		g_vfs_close(cp, td);
851		g_topology_unlock();
852		PICKUP_GIANT();
853	}
854	if (ump) {
855		mtx_destroy(UFS_MTX(ump));
856		free(ump->um_fs, M_UFSMNT);
857		free(ump, M_UFSMNT);
858		mp->mnt_data = (qaddr_t)0;
859	}
860	return (error);
861}
862
863#include <sys/sysctl.h>
864static int bigcgs = 0;
865SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0, "");
866
867/*
868 * Sanity checks for loading old filesystem superblocks.
869 * See ffs_oldfscompat_write below for unwound actions.
870 *
871 * XXX - Parts get retired eventually.
872 * Unfortunately new bits get added.
873 */
874static void
875ffs_oldfscompat_read(fs, ump, sblockloc)
876	struct fs *fs;
877	struct ufsmount *ump;
878	ufs2_daddr_t sblockloc;
879{
880	off_t maxfilesize;
881
882	/*
883	 * If not yet done, update fs_flags location and value of fs_sblockloc.
884	 */
885	if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) {
886		fs->fs_flags = fs->fs_old_flags;
887		fs->fs_old_flags |= FS_FLAGS_UPDATED;
888		fs->fs_sblockloc = sblockloc;
889	}
890	/*
891	 * If not yet done, update UFS1 superblock with new wider fields.
892	 */
893	if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) {
894		fs->fs_maxbsize = fs->fs_bsize;
895		fs->fs_time = fs->fs_old_time;
896		fs->fs_size = fs->fs_old_size;
897		fs->fs_dsize = fs->fs_old_dsize;
898		fs->fs_csaddr = fs->fs_old_csaddr;
899		fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir;
900		fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree;
901		fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree;
902		fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree;
903	}
904	if (fs->fs_magic == FS_UFS1_MAGIC &&
905	    fs->fs_old_inodefmt < FS_44INODEFMT) {
906		fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1;
907		fs->fs_qbmask = ~fs->fs_bmask;
908		fs->fs_qfmask = ~fs->fs_fmask;
909	}
910	if (fs->fs_magic == FS_UFS1_MAGIC) {
911		ump->um_savedmaxfilesize = fs->fs_maxfilesize;
912		maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1;
913		if (fs->fs_maxfilesize > maxfilesize)
914			fs->fs_maxfilesize = maxfilesize;
915	}
916	/* Compatibility for old filesystems */
917	if (fs->fs_avgfilesize <= 0)
918		fs->fs_avgfilesize = AVFILESIZ;
919	if (fs->fs_avgfpdir <= 0)
920		fs->fs_avgfpdir = AFPDIR;
921	if (bigcgs) {
922		fs->fs_save_cgsize = fs->fs_cgsize;
923		fs->fs_cgsize = fs->fs_bsize;
924	}
925}
926
927/*
928 * Unwinding superblock updates for old filesystems.
929 * See ffs_oldfscompat_read above for details.
930 *
931 * XXX - Parts get retired eventually.
932 * Unfortunately new bits get added.
933 */
934static void
935ffs_oldfscompat_write(fs, ump)
936	struct fs *fs;
937	struct ufsmount *ump;
938{
939
940	/*
941	 * Copy back UFS2 updated fields that UFS1 inspects.
942	 */
943	if (fs->fs_magic == FS_UFS1_MAGIC) {
944		fs->fs_old_time = fs->fs_time;
945		fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir;
946		fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree;
947		fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree;
948		fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree;
949		fs->fs_maxfilesize = ump->um_savedmaxfilesize;
950	}
951	if (bigcgs) {
952		fs->fs_cgsize = fs->fs_save_cgsize;
953		fs->fs_save_cgsize = 0;
954	}
955}
956
957/*
958 * unmount system call
959 */
960static int
961ffs_unmount(mp, mntflags, td)
962	struct mount *mp;
963	int mntflags;
964	struct thread *td;
965{
966	struct ufsmount *ump = VFSTOUFS(mp);
967	struct fs *fs;
968	int error, flags;
969
970	flags = 0;
971	if (mntflags & MNT_FORCE) {
972		flags |= FORCECLOSE;
973	}
974#ifdef UFS_EXTATTR
975	if ((error = ufs_extattr_stop(mp, td))) {
976		if (error != EOPNOTSUPP)
977			printf("ffs_unmount: ufs_extattr_stop returned %d\n",
978			    error);
979	} else {
980		ufs_extattr_uepm_destroy(&ump->um_extattr);
981	}
982#endif
983	if (mp->mnt_flag & MNT_SOFTDEP) {
984		if ((error = softdep_flushfiles(mp, flags, td)) != 0)
985			return (error);
986	} else {
987		if ((error = ffs_flushfiles(mp, flags, td)) != 0)
988			return (error);
989	}
990	fs = ump->um_fs;
991	UFS_LOCK(ump);
992	if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
993		printf("%s: unmount pending error: blocks %jd files %d\n",
994		    fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
995		    fs->fs_pendinginodes);
996		fs->fs_pendingblocks = 0;
997		fs->fs_pendinginodes = 0;
998	}
999	UFS_UNLOCK(ump);
1000	if (fs->fs_ronly == 0) {
1001		fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1;
1002		error = ffs_sbupdate(ump, MNT_WAIT, 0);
1003		if (error) {
1004			fs->fs_clean = 0;
1005			return (error);
1006		}
1007	}
1008	DROP_GIANT();
1009	g_topology_lock();
1010	g_vfs_close(ump->um_cp, td);
1011	g_topology_unlock();
1012	PICKUP_GIANT();
1013	vrele(ump->um_devvp);
1014	mtx_destroy(UFS_MTX(ump));
1015	free(fs->fs_csp, M_UFSMNT);
1016	free(fs, M_UFSMNT);
1017	free(ump, M_UFSMNT);
1018	mp->mnt_data = (qaddr_t)0;
1019	mp->mnt_flag &= ~MNT_LOCAL;
1020	return (error);
1021}
1022
1023/*
1024 * Flush out all the files in a filesystem.
1025 */
1026int
1027ffs_flushfiles(mp, flags, td)
1028	struct mount *mp;
1029	int flags;
1030	struct thread *td;
1031{
1032	struct ufsmount *ump;
1033	int error;
1034
1035	ump = VFSTOUFS(mp);
1036#ifdef QUOTA
1037	if (mp->mnt_flag & MNT_QUOTA) {
1038		int i;
1039		error = vflush(mp, 0, SKIPSYSTEM|flags, td);
1040		if (error)
1041			return (error);
1042		for (i = 0; i < MAXQUOTAS; i++) {
1043			if (ump->um_quotas[i] == NULLVP)
1044				continue;
1045			quotaoff(td, mp, i);
1046		}
1047		/*
1048		 * Here we fall through to vflush again to ensure
1049		 * that we have gotten rid of all the system vnodes.
1050		 */
1051	}
1052#endif
1053	ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_flushfiles");
1054	if (ump->um_devvp->v_vflag & VV_COPYONWRITE) {
1055		if ((error = vflush(mp, 0, SKIPSYSTEM | flags, td)) != 0)
1056			return (error);
1057		ffs_snapshot_unmount(mp);
1058		flags |= FORCECLOSE;
1059		/*
1060		 * Here we fall through to vflush again to ensure
1061		 * that we have gotten rid of all the system vnodes.
1062		 */
1063	}
1064        /*
1065	 * Flush all the files.
1066	 */
1067	if ((error = vflush(mp, 0, flags, td)) != 0)
1068		return (error);
1069	/*
1070	 * Flush filesystem metadata.
1071	 */
1072	vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, td);
1073	error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td);
1074	VOP_UNLOCK(ump->um_devvp, 0, td);
1075	return (error);
1076}
1077
1078/*
1079 * Get filesystem statistics.
1080 */
1081static int
1082ffs_statfs(mp, sbp, td)
1083	struct mount *mp;
1084	struct statfs *sbp;
1085	struct thread *td;
1086{
1087	struct ufsmount *ump;
1088	struct fs *fs;
1089
1090	ump = VFSTOUFS(mp);
1091	fs = ump->um_fs;
1092	if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC)
1093		panic("ffs_statfs");
1094	sbp->f_version = STATFS_VERSION;
1095	sbp->f_bsize = fs->fs_fsize;
1096	sbp->f_iosize = fs->fs_bsize;
1097	sbp->f_blocks = fs->fs_dsize;
1098	UFS_LOCK(ump);
1099	sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag +
1100	    fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks);
1101	sbp->f_bavail = freespace(fs, fs->fs_minfree) +
1102	    dbtofsb(fs, fs->fs_pendingblocks);
1103	sbp->f_files =  fs->fs_ncg * fs->fs_ipg - ROOTINO;
1104	sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes;
1105	UFS_UNLOCK(ump);
1106	sbp->f_namemax = NAME_MAX;
1107	return (0);
1108}
1109
1110/*
1111 * Go through the disk queues to initiate sandbagged IO;
1112 * go through the inodes to write those that have been modified;
1113 * initiate the writing of the super block if it has been modified.
1114 *
1115 * Note: we are always called with the filesystem marked `MPBUSY'.
1116 */
1117static int
1118ffs_sync(mp, waitfor, td)
1119	struct mount *mp;
1120	int waitfor;
1121	struct thread *td;
1122{
1123	struct vnode *mvp, *vp, *devvp;
1124	struct inode *ip;
1125	struct ufsmount *ump = VFSTOUFS(mp);
1126	struct fs *fs;
1127	int error, count, wait, lockreq, allerror = 0;
1128	int suspend;
1129	int suspended;
1130	int secondary_writes;
1131	int secondary_accwrites;
1132	int softdep_deps;
1133	int softdep_accdeps;
1134	struct bufobj *bo;
1135
1136	fs = ump->um_fs;
1137	if (fs->fs_fmod != 0 && fs->fs_ronly != 0) {		/* XXX */
1138		printf("fs = %s\n", fs->fs_fsmnt);
1139		panic("ffs_sync: rofs mod");
1140	}
1141	/*
1142	 * Write back each (modified) inode.
1143	 */
1144	wait = 0;
1145	suspend = 0;
1146	suspended = 0;
1147	lockreq = LK_EXCLUSIVE | LK_NOWAIT;
1148	if (waitfor == MNT_SUSPEND) {
1149		suspend = 1;
1150		waitfor = MNT_WAIT;
1151	}
1152	if (waitfor == MNT_WAIT) {
1153		wait = 1;
1154		lockreq = LK_EXCLUSIVE;
1155	}
1156	lockreq |= LK_INTERLOCK | LK_SLEEPFAIL;
1157	MNT_ILOCK(mp);
1158loop:
1159	/* Grab snapshot of secondary write counts */
1160	secondary_writes = mp->mnt_secondary_writes;
1161	secondary_accwrites = mp->mnt_secondary_accwrites;
1162
1163	/* Grab snapshot of softdep dependency counts */
1164	MNT_IUNLOCK(mp);
1165	softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps);
1166	MNT_ILOCK(mp);
1167
1168	MNT_VNODE_FOREACH(vp, mp, mvp) {
1169		/*
1170		 * Depend on the mntvnode_slock to keep things stable enough
1171		 * for a quick test.  Since there might be hundreds of
1172		 * thousands of vnodes, we cannot afford even a subroutine
1173		 * call unless there's a good chance that we have work to do.
1174		 */
1175		VI_LOCK(vp);
1176		if (vp->v_iflag & VI_DOOMED) {
1177			VI_UNLOCK(vp);
1178			continue;
1179		}
1180		ip = VTOI(vp);
1181		if (vp->v_type == VNON || ((ip->i_flag &
1182		    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
1183		    vp->v_bufobj.bo_dirty.bv_cnt == 0)) {
1184			VI_UNLOCK(vp);
1185			continue;
1186		}
1187		MNT_IUNLOCK(mp);
1188		if ((error = vget(vp, lockreq, td)) != 0) {
1189			MNT_ILOCK(mp);
1190			if (error == ENOENT || error == ENOLCK) {
1191				MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
1192				goto loop;
1193			}
1194			continue;
1195		}
1196		if ((error = ffs_syncvnode(vp, waitfor)) != 0)
1197			allerror = error;
1198		vput(vp);
1199		MNT_ILOCK(mp);
1200	}
1201	MNT_IUNLOCK(mp);
1202	/*
1203	 * Force stale filesystem control information to be flushed.
1204	 */
1205	if (waitfor == MNT_WAIT) {
1206		if ((error = softdep_flushworklist(ump->um_mountp, &count, td)))
1207			allerror = error;
1208		/* Flushed work items may create new vnodes to clean */
1209		if (allerror == 0 && count) {
1210			MNT_ILOCK(mp);
1211			goto loop;
1212		}
1213	}
1214#ifdef QUOTA
1215	qsync(mp);
1216#endif
1217	devvp = ump->um_devvp;
1218	VI_LOCK(devvp);
1219	bo = &devvp->v_bufobj;
1220	if (waitfor != MNT_LAZY &&
1221	    (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) {
1222		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, td);
1223		if ((error = VOP_FSYNC(devvp, waitfor, td)) != 0)
1224			allerror = error;
1225		VOP_UNLOCK(devvp, 0, td);
1226		if (allerror == 0 && waitfor == MNT_WAIT) {
1227			MNT_ILOCK(mp);
1228			goto loop;
1229		}
1230	} else if (suspend != 0) {
1231		if (softdep_check_suspend(mp,
1232					  devvp,
1233					  softdep_deps,
1234					  softdep_accdeps,
1235					  secondary_writes,
1236					  secondary_accwrites) != 0)
1237			goto loop;	/* More work needed */
1238		mtx_assert(MNT_MTX(mp), MA_OWNED);
1239		mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED;
1240		MNT_IUNLOCK(mp);
1241		suspended = 1;
1242	} else
1243		VI_UNLOCK(devvp);
1244	/*
1245	 * Write back modified superblock.
1246	 */
1247	if (fs->fs_fmod != 0 &&
1248	    (error = ffs_sbupdate(ump, waitfor, suspended)) != 0)
1249		allerror = error;
1250	return (allerror);
1251}
1252
1253int
1254ffs_vget(mp, ino, flags, vpp)
1255	struct mount *mp;
1256	ino_t ino;
1257	int flags;
1258	struct vnode **vpp;
1259{
1260	struct fs *fs;
1261	struct inode *ip;
1262	struct ufsmount *ump;
1263	struct buf *bp;
1264	struct vnode *vp;
1265	struct cdev *dev;
1266	int error;
1267
1268	error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL);
1269	if (error || *vpp != NULL)
1270		return (error);
1271
1272	/*
1273	 * We must promote to an exclusive lock for vnode creation.  This
1274	 * can happen if lookup is passed LOCKSHARED.
1275 	 */
1276	if ((flags & LK_TYPE_MASK) == LK_SHARED) {
1277		flags &= ~LK_TYPE_MASK;
1278		flags |= LK_EXCLUSIVE;
1279	}
1280
1281	/*
1282	 * We do not lock vnode creation as it is believed to be too
1283	 * expensive for such rare case as simultaneous creation of vnode
1284	 * for same ino by different processes. We just allow them to race
1285	 * and check later to decide who wins. Let the race begin!
1286	 */
1287
1288	ump = VFSTOUFS(mp);
1289	dev = ump->um_dev;
1290	fs = ump->um_fs;
1291
1292	/*
1293	 * If this MALLOC() is performed after the getnewvnode()
1294	 * it might block, leaving a vnode with a NULL v_data to be
1295	 * found by ffs_sync() if a sync happens to fire right then,
1296	 * which will cause a panic because ffs_sync() blindly
1297	 * dereferences vp->v_data (as well it should).
1298	 */
1299	ip = uma_zalloc(uma_inode, M_WAITOK | M_ZERO);
1300
1301	/* Allocate a new vnode/inode. */
1302	if (fs->fs_magic == FS_UFS1_MAGIC)
1303		error = getnewvnode("ufs", mp, &ffs_vnodeops1, &vp);
1304	else
1305		error = getnewvnode("ufs", mp, &ffs_vnodeops2, &vp);
1306	if (error) {
1307		*vpp = NULL;
1308		uma_zfree(uma_inode, ip);
1309		return (error);
1310	}
1311	/*
1312	 * FFS supports recursive and shared locking.
1313	 */
1314	vp->v_vnlock->lk_flags |= LK_CANRECURSE;
1315	vp->v_vnlock->lk_flags &= ~LK_NOSHARE;
1316	vp->v_data = ip;
1317	vp->v_bufobj.bo_bsize = fs->fs_bsize;
1318	ip->i_vnode = vp;
1319	ip->i_ump = ump;
1320	ip->i_fs = fs;
1321	ip->i_dev = dev;
1322	ip->i_number = ino;
1323#ifdef QUOTA
1324	{
1325		int i;
1326		for (i = 0; i < MAXQUOTAS; i++)
1327			ip->i_dquot[i] = NODQUOT;
1328	}
1329#endif
1330
1331	error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL);
1332	if (error || *vpp != NULL)
1333		return (error);
1334
1335	/* Read in the disk contents for the inode, copy into the inode. */
1336	error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)),
1337	    (int)fs->fs_bsize, NOCRED, &bp);
1338	if (error) {
1339		/*
1340		 * The inode does not contain anything useful, so it would
1341		 * be misleading to leave it on its hash chain. With mode
1342		 * still zero, it will be unlinked and returned to the free
1343		 * list by vput().
1344		 */
1345		brelse(bp);
1346		vput(vp);
1347		*vpp = NULL;
1348		return (error);
1349	}
1350	if (ip->i_ump->um_fstype == UFS1)
1351		ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK);
1352	else
1353		ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK);
1354	ffs_load_inode(bp, ip, fs, ino);
1355	if (DOINGSOFTDEP(vp))
1356		softdep_load_inodeblock(ip);
1357	else
1358		ip->i_effnlink = ip->i_nlink;
1359	bqrelse(bp);
1360
1361	/*
1362	 * Initialize the vnode from the inode, check for aliases.
1363	 * Note that the underlying vnode may have changed.
1364	 */
1365	if (ip->i_ump->um_fstype == UFS1)
1366		error = ufs_vinit(mp, &ffs_fifoops1, &vp);
1367	else
1368		error = ufs_vinit(mp, &ffs_fifoops2, &vp);
1369	if (error) {
1370		vput(vp);
1371		*vpp = NULL;
1372		return (error);
1373	}
1374
1375	/*
1376	 * Finish inode initialization.
1377	 */
1378
1379	/*
1380	 * Set up a generation number for this inode if it does not
1381	 * already have one. This should only happen on old filesystems.
1382	 */
1383	if (ip->i_gen == 0) {
1384		ip->i_gen = arc4random() / 2 + 1;
1385		if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
1386			ip->i_flag |= IN_MODIFIED;
1387			DIP_SET(ip, i_gen, ip->i_gen);
1388		}
1389	}
1390	/*
1391	 * Ensure that uid and gid are correct. This is a temporary
1392	 * fix until fsck has been changed to do the update.
1393	 */
1394	if (fs->fs_magic == FS_UFS1_MAGIC &&		/* XXX */
1395	    fs->fs_old_inodefmt < FS_44INODEFMT) {	/* XXX */
1396		ip->i_uid = ip->i_din1->di_ouid;	/* XXX */
1397		ip->i_gid = ip->i_din1->di_ogid;	/* XXX */
1398	}						/* XXX */
1399
1400#ifdef MAC
1401	if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) {
1402		/*
1403		 * If this vnode is already allocated, and we're running
1404		 * multi-label, attempt to perform a label association
1405		 * from the extended attributes on the inode.
1406		 */
1407		error = mac_associate_vnode_extattr(mp, vp);
1408		if (error) {
1409			/* ufs_inactive will release ip->i_devvp ref. */
1410			vput(vp);
1411			*vpp = NULL;
1412			return (error);
1413		}
1414	}
1415#endif
1416
1417	*vpp = vp;
1418	return (0);
1419}
1420
1421/*
1422 * File handle to vnode
1423 *
1424 * Have to be really careful about stale file handles:
1425 * - check that the inode number is valid
1426 * - call ffs_vget() to get the locked inode
1427 * - check for an unallocated inode (i_mode == 0)
1428 * - check that the given client host has export rights and return
1429 *   those rights via. exflagsp and credanonp
1430 */
1431static int
1432ffs_fhtovp(mp, fhp, vpp)
1433	struct mount *mp;
1434	struct fid *fhp;
1435	struct vnode **vpp;
1436{
1437	struct ufid *ufhp;
1438	struct fs *fs;
1439
1440	ufhp = (struct ufid *)fhp;
1441	fs = VFSTOUFS(mp)->um_fs;
1442	if (ufhp->ufid_ino < ROOTINO ||
1443	    ufhp->ufid_ino >= fs->fs_ncg * fs->fs_ipg)
1444		return (ESTALE);
1445	return (ufs_fhtovp(mp, ufhp, vpp));
1446}
1447
1448/*
1449 * Vnode pointer to File handle
1450 */
1451/* ARGSUSED */
1452static int
1453ffs_vptofh(vp, fhp)
1454	struct vnode *vp;
1455	struct fid *fhp;
1456{
1457	struct inode *ip;
1458	struct ufid *ufhp;
1459
1460	ip = VTOI(vp);
1461	ufhp = (struct ufid *)fhp;
1462	ufhp->ufid_len = sizeof(struct ufid);
1463	ufhp->ufid_ino = ip->i_number;
1464	ufhp->ufid_gen = ip->i_gen;
1465	return (0);
1466}
1467
/*
 * Initialize the filesystem: set up soft updates first, then run the
 * generic UFS initialization and return its result.
 */
static int
ffs_init(struct vfsconf *vfsp)
{
	int error;

	softdep_initialize();
	error = ufs_init(vfsp);
	return (error);
}
1479
/*
 * Undo the work of ffs_init(), in reverse order: the generic UFS
 * teardown runs first, soft updates are torn down afterwards, and the
 * UFS teardown result is what gets returned.
 */
static int
ffs_uninit(struct vfsconf *vfsp)
{
	int error = ufs_uninit(vfsp);

	softdep_uninitialize();
	return (error);
}
1493
1494/*
1495 * Write a superblock and associated information back to disk.
1496 */
1497static int
1498ffs_sbupdate(mp, waitfor, suspended)
1499	struct ufsmount *mp;
1500	int waitfor;
1501	int suspended;
1502{
1503	struct fs *fs = mp->um_fs;
1504	struct buf *sbbp;
1505	struct buf *bp;
1506	int blks;
1507	void *space;
1508	int i, size, error, allerror = 0;
1509
1510	if (fs->fs_ronly == 1 &&
1511	    (mp->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) !=
1512	    (MNT_RDONLY | MNT_UPDATE))
1513		panic("ffs_sbupdate: write read-only filesystem");
1514	/*
1515	 * We use the superblock's buf to serialize calls to ffs_sbupdate().
1516	 */
1517	sbbp = getblk(mp->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize,
1518	    0, 0, 0);
1519	/*
1520	 * First write back the summary information.
1521	 */
1522	blks = howmany(fs->fs_cssize, fs->fs_fsize);
1523	space = fs->fs_csp;
1524	for (i = 0; i < blks; i += fs->fs_frag) {
1525		size = fs->fs_bsize;
1526		if (i + fs->fs_frag > blks)
1527			size = (blks - i) * fs->fs_fsize;
1528		bp = getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i),
1529		    size, 0, 0, 0);
1530		bcopy(space, bp->b_data, (u_int)size);
1531		space = (char *)space + size;
1532		if (suspended)
1533			bp->b_flags |= B_VALIDSUSPWRT;
1534		if (waitfor != MNT_WAIT)
1535			bawrite(bp);
1536		else if ((error = bwrite(bp)) != 0)
1537			allerror = error;
1538	}
1539	/*
1540	 * Now write back the superblock itself. If any errors occurred
1541	 * up to this point, then fail so that the superblock avoids
1542	 * being written out as clean.
1543	 */
1544	if (allerror) {
1545		brelse(sbbp);
1546		return (allerror);
1547	}
1548	bp = sbbp;
1549	if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 &&
1550	    (fs->fs_flags & FS_FLAGS_UPDATED) == 0) {
1551		printf("%s: correcting fs_sblockloc from %jd to %d\n",
1552		    fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1);
1553		fs->fs_sblockloc = SBLOCK_UFS1;
1554	}
1555	if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 &&
1556	    (fs->fs_flags & FS_FLAGS_UPDATED) == 0) {
1557		printf("%s: correcting fs_sblockloc from %jd to %d\n",
1558		    fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2);
1559		fs->fs_sblockloc = SBLOCK_UFS2;
1560	}
1561	fs->fs_fmod = 0;
1562	fs->fs_time = time_second;
1563	bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
1564	ffs_oldfscompat_write((struct fs *)bp->b_data, mp);
1565	if (suspended)
1566		bp->b_flags |= B_VALIDSUSPWRT;
1567	if (waitfor != MNT_WAIT)
1568		bawrite(bp);
1569	else if ((error = bwrite(bp)) != 0)
1570		allerror = error;
1571	return (allerror);
1572}
1573
/*
 * Extended attribute control.  Kernels built with UFS_EXTATTR hand the
 * request to ufs_extattrctl(); all others use vfs_stdextattrctl().
 * The result of whichever routine is called is returned unchanged.
 */
static int
ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp,
	int attrnamespace, const char *attrname, struct thread *td)
{
	int error;

#ifdef UFS_EXTATTR
	error = ufs_extattrctl(mp, cmd, filename_vp, attrnamespace,
	    attrname, td);
#else
	error = vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace,
	    attrname, td);
#endif
	return (error);
}
1587
1588static void
1589ffs_ifree(struct ufsmount *ump, struct inode *ip)
1590{
1591
1592	if (ump->um_fstype == UFS1 && ip->i_din1 != NULL)
1593		uma_zfree(uma_ufs1, ip->i_din1);
1594	else if (ip->i_din2 != NULL)
1595		uma_zfree(uma_ufs2, ip->i_din2);
1596	uma_zfree(uma_inode, ip);
1597}
1598
/*
 * Run-time switch for background writes; initially 1.  ffs_bufwrite()
 * only takes its write-a-copy path when this is non-zero (it tests the
 * buffer's BX_BKGRDWRITE flag alongside it).  Declared read-write
 * (CTLFLAG_RW) under the _debug sysctl parent.
 */
static int dobkgrdwrite = 1;
SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0,
    "Do background writes (honoring the BV_BKGRDWRITE flag)?");
1602
1603/*
1604 * Complete a background write started from bwrite.
1605 */
1606static void
1607ffs_backgroundwritedone(struct buf *bp)
1608{
1609	struct bufobj *bufobj;
1610	struct buf *origbp;
1611
1612	/*
1613	 * Find the original buffer that we are writing.
1614	 */
1615	bufobj = bp->b_bufobj;
1616	BO_LOCK(bufobj);
1617	if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL)
1618		panic("backgroundwritedone: lost buffer");
1619	/* Grab an extra reference to be dropped by the bufdone() below. */
1620	bufobj_wrefl(bufobj);
1621	BO_UNLOCK(bufobj);
1622	/*
1623	 * Process dependencies then return any unfinished ones.
1624	 */
1625	if (LIST_FIRST(&bp->b_dep) != NULL)
1626		buf_complete(bp);
1627#ifdef SOFTUPDATES
1628	if (LIST_FIRST(&bp->b_dep) != NULL)
1629		softdep_move_dependencies(bp, origbp);
1630#endif
1631	/*
1632	 * This buffer is marked B_NOCACHE so when it is released
1633	 * by biodone it will be tossed.
1634	 */
1635	bp->b_flags |= B_NOCACHE;
1636	bp->b_flags &= ~B_CACHE;
1637	bufdone(bp);
1638	BO_LOCK(bufobj);
1639	/*
1640	 * Clear the BV_BKGRDINPROG flag in the original buffer
1641	 * and awaken it if it is waiting for the write to complete.
1642	 * If BV_BKGRDINPROG is not set in the original buffer it must
1643	 * have been released and re-instantiated - which is not legal.
1644	 */
1645	KASSERT((origbp->b_vflags & BV_BKGRDINPROG),
1646	    ("backgroundwritedone: lost buffer2"));
1647	origbp->b_vflags &= ~BV_BKGRDINPROG;
1648	if (origbp->b_vflags & BV_BKGRDWAIT) {
1649		origbp->b_vflags &= ~BV_BKGRDWAIT;
1650		wakeup(&origbp->b_xflags);
1651	}
1652	BO_UNLOCK(bufobj);
1653}
1654
1655
1656/*
1657 * Write, release buffer on completion.  (Done by iodone
1658 * if async).  Do not bother writing anything if the buffer
1659 * is invalid.
1660 *
1661 * Note that we set B_CACHE here, indicating that buffer is
1662 * fully valid and thus cacheable.  This is true even of NFS
1663 * now so we set it generally.  This could be set either here
1664 * or in biodone() since the I/O is synchronous.  We put it
1665 * here.
1666 */
1667static int
1668ffs_bufwrite(struct buf *bp)
1669{
1670	int oldflags, s;
1671	struct buf *newbp;
1672
1673	CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1674	if (bp->b_flags & B_INVAL) {
1675		brelse(bp);
1676		return (0);
1677	}
1678
1679	oldflags = bp->b_flags;
1680
1681	if (BUF_REFCNT(bp) == 0)
1682		panic("bufwrite: buffer is not busy???");
1683	s = splbio();
1684	/*
1685	 * If a background write is already in progress, delay
1686	 * writing this block if it is asynchronous. Otherwise
1687	 * wait for the background write to complete.
1688	 */
1689	BO_LOCK(bp->b_bufobj);
1690	if (bp->b_vflags & BV_BKGRDINPROG) {
1691		if (bp->b_flags & B_ASYNC) {
1692			BO_UNLOCK(bp->b_bufobj);
1693			splx(s);
1694			bdwrite(bp);
1695			return (0);
1696		}
1697		bp->b_vflags |= BV_BKGRDWAIT;
1698		msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj), PRIBIO, "bwrbg", 0);
1699		if (bp->b_vflags & BV_BKGRDINPROG)
1700			panic("bufwrite: still writing");
1701	}
1702	BO_UNLOCK(bp->b_bufobj);
1703
1704	/* Mark the buffer clean */
1705	bundirty(bp);
1706
1707	/*
1708	 * If this buffer is marked for background writing and we
1709	 * do not have to wait for it, make a copy and write the
1710	 * copy so as to leave this buffer ready for further use.
1711	 *
1712	 * This optimization eats a lot of memory.  If we have a page
1713	 * or buffer shortfall we can't do it.
1714	 */
1715	if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) &&
1716	    (bp->b_flags & B_ASYNC) &&
1717	    !vm_page_count_severe() &&
1718	    !buf_dirty_count_severe()) {
1719		KASSERT(bp->b_iodone == NULL,
1720		    ("bufwrite: needs chained iodone (%p)", bp->b_iodone));
1721
1722		/* get a new block */
1723		newbp = geteblk(bp->b_bufsize);
1724
1725		/*
1726		 * set it to be identical to the old block.  We have to
1727		 * set b_lblkno and BKGRDMARKER before calling bgetvp()
1728		 * to avoid confusing the splay tree and gbincore().
1729		 */
1730		memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
1731		newbp->b_lblkno = bp->b_lblkno;
1732		newbp->b_xflags |= BX_BKGRDMARKER;
1733		BO_LOCK(bp->b_bufobj);
1734		bp->b_vflags |= BV_BKGRDINPROG;
1735		bgetvp(bp->b_vp, newbp);
1736		BO_UNLOCK(bp->b_bufobj);
1737		newbp->b_bufobj = &bp->b_vp->v_bufobj;
1738		newbp->b_blkno = bp->b_blkno;
1739		newbp->b_offset = bp->b_offset;
1740		newbp->b_iodone = ffs_backgroundwritedone;
1741		newbp->b_flags |= B_ASYNC;
1742		newbp->b_flags &= ~B_INVAL;
1743
1744#ifdef SOFTUPDATES
1745		/* move over the dependencies */
1746		if (LIST_FIRST(&bp->b_dep) != NULL)
1747			softdep_move_dependencies(bp, newbp);
1748#endif
1749
1750		/*
1751		 * Initiate write on the copy, release the original to
1752		 * the B_LOCKED queue so that it cannot go away until
1753		 * the background write completes. If not locked it could go
1754		 * away and then be reconstituted while it was being written.
1755		 * If the reconstituted buffer were written, we could end up
1756		 * with two background copies being written at the same time.
1757		 */
1758		bqrelse(bp);
1759		bp = newbp;
1760	}
1761
1762	/* Let the normal bufwrite do the rest for us */
1763	return (bufwrite(bp));
1764}
1765
1766
1767static void
1768ffs_geom_strategy(struct bufobj *bo, struct buf *bp)
1769{
1770	struct vnode *vp;
1771	int error;
1772	struct buf *tbp;
1773
1774	vp = bo->__bo_vnode;
1775	if (bp->b_iocmd == BIO_WRITE) {
1776		if ((bp->b_flags & B_VALIDSUSPWRT) == 0 &&
1777		    bp->b_vp != NULL && bp->b_vp->v_mount != NULL &&
1778		    (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0)
1779			panic("ffs_geom_strategy: bad I/O");
1780		bp->b_flags &= ~B_VALIDSUSPWRT;
1781		if ((vp->v_vflag & VV_COPYONWRITE) &&
1782		    vp->v_rdev->si_snapdata != NULL) {
1783			if ((bp->b_flags & B_CLUSTER) != 0) {
1784				TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head,
1785					      b_cluster.cluster_entry) {
1786					error = ffs_copyonwrite(vp, tbp);
1787					if (error != 0 &&
1788					    error != EOPNOTSUPP) {
1789						bp->b_error = error;
1790						bp->b_ioflags |= BIO_ERROR;
1791						bufdone(bp);
1792						return;
1793					}
1794				}
1795			} else {
1796				error = ffs_copyonwrite(vp, bp);
1797				if (error != 0 && error != EOPNOTSUPP) {
1798					bp->b_error = error;
1799					bp->b_ioflags |= BIO_ERROR;
1800					bufdone(bp);
1801					return;
1802				}
1803			}
1804		}
1805#ifdef SOFTUPDATES
1806		if ((bp->b_flags & B_CLUSTER) != 0) {
1807			TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head,
1808				      b_cluster.cluster_entry) {
1809				if (LIST_FIRST(&tbp->b_dep) != NULL)
1810					buf_start(tbp);
1811			}
1812		} else {
1813			if (LIST_FIRST(&bp->b_dep) != NULL)
1814				buf_start(bp);
1815		}
1816
1817#endif
1818	}
1819	g_vfs_strategy(bo, bp);
1820}
1821