ffs_vfsops.c revision 156896
1/*-
2 * Copyright (c) 1989, 1991, 1993, 1994
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	@(#)ffs_vfsops.c	8.31 (Berkeley) 5/20/95
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_vfsops.c 156896 2006-03-19 21:09:19Z tegge $");
34
35#include "opt_mac.h"
36#include "opt_quota.h"
37#include "opt_ufs.h"
38#include "opt_ffs.h"
39
40#include <sys/param.h>
41#include <sys/systm.h>
42#include <sys/namei.h>
43#include <sys/proc.h>
44#include <sys/kernel.h>
45#include <sys/mac.h>
46#include <sys/vnode.h>
47#include <sys/mount.h>
48#include <sys/bio.h>
49#include <sys/buf.h>
50#include <sys/conf.h>
51#include <sys/fcntl.h>
52#include <sys/malloc.h>
53#include <sys/mutex.h>
54
55#include <ufs/ufs/extattr.h>
56#include <ufs/ufs/quota.h>
57#include <ufs/ufs/ufsmount.h>
58#include <ufs/ufs/inode.h>
59#include <ufs/ufs/ufs_extern.h>
60
61#include <ufs/ffs/fs.h>
62#include <ufs/ffs/ffs_extern.h>
63
64#include <vm/vm.h>
65#include <vm/uma.h>
66#include <vm/vm_page.h>
67
68#include <geom/geom.h>
69#include <geom/geom_vfs.h>
70
/*
 * UMA zones for in-core inodes and for the UFS1/UFS2 dinode copies.
 * They are created by ffs_mount() the first time it finds uma_inode NULL.
 */
static uma_zone_t uma_inode, uma_ufs1, uma_ufs2;

/* Forward declarations of file-local helpers. */
static int	ffs_sbupdate(struct ufsmount *, int, int);
static int	ffs_reload(struct mount *, struct thread *);
static int	ffs_mountfs(struct vnode *, struct mount *, struct thread *);
static void	ffs_oldfscompat_read(struct fs *, struct ufsmount *,
		    ufs2_daddr_t);
static void	ffs_oldfscompat_write(struct fs *, struct ufsmount *);
static void	ffs_ifree(struct ufsmount *ump, struct inode *ip);
/* Forward declarations of the VFS operations defined in this file. */
static vfs_init_t ffs_init;
static vfs_uninit_t ffs_uninit;
static vfs_extattrctl_t ffs_extattrctl;
static vfs_cmount_t ffs_cmount;
static vfs_unmount_t ffs_unmount;
static vfs_mount_t ffs_mount;
static vfs_statfs_t ffs_statfs;
static vfs_fhtovp_t ffs_fhtovp;
static vfs_vptofh_t ffs_vptofh;
static vfs_sync_t ffs_sync;
90
/*
 * VFS operations vector for this filesystem.  Most slots point at ffs_*
 * routines; the quotactl and root slots use the ufs_* implementations.
 */
static struct vfsops ufs_vfsops = {
	.vfs_extattrctl =	ffs_extattrctl,
	.vfs_fhtovp =		ffs_fhtovp,
	.vfs_init =		ffs_init,
	.vfs_mount =		ffs_mount,
	.vfs_cmount =		ffs_cmount,
	.vfs_quotactl =		ufs_quotactl,
	.vfs_root =		ufs_root,
	.vfs_statfs =		ffs_statfs,
	.vfs_sync =		ffs_sync,
	.vfs_uninit =		ffs_uninit,
	.vfs_unmount =		ffs_unmount,
	.vfs_vget =		ffs_vget,
	.vfs_vptofh =		ffs_vptofh,
};

/* Register the vector under the filesystem name "ufs". */
VFS_SET(ufs_vfsops, ufs, 0);
108
static b_strategy_t ffs_geom_strategy;
static b_write_t ffs_bufwrite;

/*
 * Buffer operations that ffs_mountfs() installs on the device vnode's
 * bufobj (devvp->v_bufobj.bo_ops).
 */
static struct buf_ops ffs_ops = {
	.bop_name =	"FFS",
	.bop_write =	ffs_bufwrite,
	.bop_strategy =	ffs_geom_strategy,
	.bop_sync =	bufsync,
};
118
/*
 * NULL-terminated list of option names handed to vfs_filteropt() at the
 * top of ffs_mount(); the mount request is rejected with EINVAL when that
 * check returns non-zero.
 */
static const char *ffs_opts[] = { "acls", "async", "atime", "clusterr",
    "clusterw", "exec", "errmsg", "export", "force", "from", "multilabel",
    "snapshot", "suid", "suiddir", "symfollow", "sync",
    "update", "union", NULL };
123
/*
 * Mount entry point (the .vfs_mount slot of ufs_vfsops).
 *
 * The option list in mp->mnt_optnew is first checked against ffs_opts and
 * the recognised options are folded into mp->mnt_flag.  After that the
 * function handles two cases:
 *
 *  - MNT_UPDATE: perform rw->ro and ro->rw transitions, an optional
 *    reload, export processing and snapshot requests on the existing
 *    mount.  Export and snapshot requests return directly; otherwise the
 *    device named by the "from" option is looked up and must match the
 *    device already mounted.
 *  - New mount: look up the device named by "from" and hand it to
 *    ffs_mountfs().
 *
 * Returns 0 on success or an errno value.
 */
static int
ffs_mount(struct mount *mp, struct thread *td)
{
	struct vnode *devvp;
	struct ufsmount *ump = 0;
	struct fs *fs;
	int error, flags;
	mode_t accessmode;
	struct nameidata ndp;
	struct export_args export;
	char *fspec;

	if (vfs_filteropt(mp->mnt_optnew, ffs_opts))
		return (EINVAL);
	/* Create the inode zones once, the first time a mount is attempted. */
	if (uma_inode == NULL) {
		uma_inode = uma_zcreate("FFS inode",
		    sizeof(struct inode), NULL, NULL, NULL, NULL,
		    UMA_ALIGN_PTR, 0);
		uma_ufs1 = uma_zcreate("FFS1 dinode",
		    sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL,
		    UMA_ALIGN_PTR, 0);
		uma_ufs2 = uma_zcreate("FFS2 dinode",
		    sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL,
		    UMA_ALIGN_PTR, 0);
	}

	/* "from" names the device (or, for a snapshot request, the file). */
	fspec = vfs_getopts(mp->mnt_optnew, "from", &error);
	if (error)
		return (error);

	/* Translate the named options that are present into mount flags. */
	if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0)
		mp->mnt_flag |= MNT_ACLS;

	if (vfs_getopt(mp->mnt_optnew, "async", NULL, NULL) == 0)
		mp->mnt_flag |= MNT_ASYNC;

	if (vfs_getopt(mp->mnt_optnew, "force", NULL, NULL) == 0)
		mp->mnt_flag |= MNT_FORCE;

	if (vfs_getopt(mp->mnt_optnew, "multilabel", NULL, NULL) == 0)
		mp->mnt_flag |= MNT_MULTILABEL;

	if (vfs_getopt(mp->mnt_optnew, "noasync", NULL, NULL) == 0)
		mp->mnt_flag &= ~MNT_ASYNC;

	if (vfs_getopt(mp->mnt_optnew, "noatime", NULL, NULL) == 0)
		mp->mnt_flag |= MNT_NOATIME;

	if (vfs_getopt(mp->mnt_optnew, "noclusterr", NULL, NULL) == 0)
		mp->mnt_flag |= MNT_NOCLUSTERR;

	if (vfs_getopt(mp->mnt_optnew, "noclusterw", NULL, NULL) == 0)
		mp->mnt_flag |= MNT_NOCLUSTERW;

	if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0)
		mp->mnt_flag |= MNT_SNAPSHOT;

	if (vfs_getopt(mp->mnt_optnew, "update", NULL, NULL) == 0)
		mp->mnt_flag |= MNT_UPDATE;

	/*
	 * Defaults for the export request; they are overwritten below when
	 * an "export" option is supplied.
	 */
	export.ex_root = -2; /* DEFAULT_ROOTID */

	if (mp->mnt_flag & MNT_RDONLY)
		export.ex_flags = MNT_EXRDONLY;
	else
		export.ex_flags = 0;

	/*
	 * If updating, check whether changing from read-only to
	 * read/write; if there is no device name, that's all we do.
	 */
	if (mp->mnt_flag & MNT_UPDATE) {
		ump = VFSTOUFS(mp);
		fs = ump->um_fs;
		devvp = ump->um_devvp;
		/* Downgrade: currently read/write and "ro" was requested. */
		if (fs->fs_ronly == 0 &&
		    vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
			if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
				return (error);
			/*
			 * Flush any dirty data.
			 */
			if ((error = ffs_sync(mp, MNT_WAIT, td)) != 0) {
				vn_finished_write(mp);
				return (error);
			}
			/*
			 * Check for and optionally get rid of files open
			 * for writing.
			 */
			flags = WRITECLOSE;
			if (mp->mnt_flag & MNT_FORCE)
				flags |= FORCECLOSE;
			if (mp->mnt_flag & MNT_SOFTDEP) {
				error = softdep_flushfiles(mp, flags, td);
			} else {
				error = ffs_flushfiles(mp, flags, td);
			}
			if (error) {
				vn_finished_write(mp);
				return (error);
			}
			/* Leftover pending counts are reported and zeroed. */
			if (fs->fs_pendingblocks != 0 ||
			    fs->fs_pendinginodes != 0) {
				printf("%s: %s: blocks %jd files %d\n",
				    fs->fs_fsmnt, "update error",
				    (intmax_t)fs->fs_pendingblocks,
				    fs->fs_pendinginodes);
				fs->fs_pendingblocks = 0;
				fs->fs_pendinginodes = 0;
			}
			/* Only mark clean if nothing flagged the fs as damaged. */
			if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
				fs->fs_clean = 1;
			if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
				/* Superblock write failed: stay read/write. */
				fs->fs_ronly = 0;
				fs->fs_clean = 0;
				vn_finished_write(mp);
				return (error);
			}
			vn_finished_write(mp);
			/* Give up the write count held on the GEOM consumer. */
			DROP_GIANT();
			g_topology_lock();
			g_access(ump->um_cp, 0, -1, 0);
			g_topology_unlock();
			PICKUP_GIANT();
			fs->fs_ronly = 1;
			mp->mnt_flag |= MNT_RDONLY;
		}
		if ((mp->mnt_flag & MNT_RELOAD) &&
		    (error = ffs_reload(mp, td)) != 0)
			return (error);
		/* Upgrade: currently read-only and "ro" was not requested. */
		if (fs->fs_ronly &&
		    !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
			/*
			 * If upgrade to read-write by non-root, then verify
			 * that user has necessary permissions on the device.
			 */
			if (suser(td)) {
				vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
				if ((error = VOP_ACCESS(devvp, VREAD | VWRITE,
				    td->td_ucred, td)) != 0) {
					VOP_UNLOCK(devvp, 0, td);
					return (error);
				}
				VOP_UNLOCK(devvp, 0, td);
			}
			/*
			 * Recompute FS_UNCLEAN from fs_clean.  An unclean
			 * filesystem may only go read/write when forced, or
			 * when it uses soft updates and does not need fsck.
			 */
			fs->fs_flags &= ~FS_UNCLEAN;
			if (fs->fs_clean == 0) {
				fs->fs_flags |= FS_UNCLEAN;
				if ((mp->mnt_flag & MNT_FORCE) ||
				    ((fs->fs_flags & FS_NEEDSFSCK) == 0 &&
				     (fs->fs_flags & FS_DOSOFTDEP))) {
					printf("WARNING: %s was not %s\n",
					   fs->fs_fsmnt, "properly dismounted");
				} else {
					printf(
"WARNING: R/W mount of %s denied.  Filesystem is not clean - run fsck\n",
					    fs->fs_fsmnt);
					return (EPERM);
				}
			}
			DROP_GIANT();
			g_topology_lock();
			/*
			 * If we're the root device, we may not have an E count
			 * yet, get it now.
			 */
			if (ump->um_cp->ace == 0)
				error = g_access(ump->um_cp, 0, 1, 1);
			else
				error = g_access(ump->um_cp, 0, 1, 0);
			g_topology_unlock();
			PICKUP_GIANT();
			if (error)
				return (error);
			if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
				return (error);
			/* Mark the fs writable and dirty, then push the superblock. */
			fs->fs_ronly = 0;
			mp->mnt_flag &= ~MNT_RDONLY;
			fs->fs_clean = 0;
			if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
				vn_finished_write(mp);
				return (error);
			}
			/* check to see if we need to start softdep */
			if ((fs->fs_flags & FS_DOSOFTDEP) &&
			    (error = softdep_mount(devvp, mp, fs, td->td_ucred))){
				vn_finished_write(mp);
				return (error);
			}
			if (fs->fs_snapinum[0] != 0)
				ffs_snapshot_mount(mp);
			vn_finished_write(mp);
		}
		/*
		 * Soft updates is incompatible with "async",
		 * so if we are doing softupdates stop the user
		 * from setting the async flag in an update.
		 * Softdep_mount() clears it in an initial mount
		 * or ro->rw remount.
		 */
		if (mp->mnt_flag & MNT_SOFTDEP)
			mp->mnt_flag &= ~MNT_ASYNC;
		/*
		 * Keep MNT_ACLS flag if it is stored in superblock.
		 */
		if ((fs->fs_flags & FS_ACLS) != 0)
			mp->mnt_flag |= MNT_ACLS;
		/*
		 * If not updating name, process export requests.
		 */
		error = 0;
		if (vfs_getopt(mp->mnt_optnew, "export", NULL, NULL) == 0) {
			error = vfs_copyopt(mp->mnt_optnew, "export",
			    &export, sizeof export);
		}

		/* A non-empty export request finishes the update here. */
		if (error == 0 && export.ex_flags != 0)
			return (vfs_export(mp, &export));
		/*
		 * If this is a snapshot request, take the snapshot.
		 */
		if (mp->mnt_flag & MNT_SNAPSHOT)
			return (ffs_snapshot(mp, fspec));
	}

	/*
	 * Not an update, or updating the name: look up the name
	 * and verify that it refers to a sensible disk device.
	 * The lookup uses LOCKLEAF, so devvp comes back locked and the
	 * error paths below release it with vput().
	 */
	NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td);
	if ((error = namei(&ndp)) != 0)
		return (error);
	NDFREE(&ndp, NDF_ONLY_PNBUF);
	devvp = ndp.ni_vp;
	if (!vn_isdisk(devvp, &error)) {
		vput(devvp);
		return (error);
	}

	/*
	 * If mount by non-root, then verify that user has necessary
	 * permissions on the device.
	 */
	if (suser(td)) {
		accessmode = VREAD;
		if ((mp->mnt_flag & MNT_RDONLY) == 0)
			accessmode |= VWRITE;
		if ((error = VOP_ACCESS(devvp, accessmode, td->td_ucred, td))!= 0){
			vput(devvp);
			return (error);
		}
	}

	if (mp->mnt_flag & MNT_UPDATE) {
		/*
		 * Update only
		 *
		 * If it's not the same vnode, or at least the same device
		 * then it's not correct.
		 */

		if (devvp->v_rdev != ump->um_devvp->v_rdev)
			error = EINVAL;	/* needs translation */
		vput(devvp);
		if (error)
			return (error);
	} else {
		/*
		 * New mount
		 *
		 * We need the name for the mount point (also used for
		 * "last mounted on") copied in. If an error occurs,
		 * the mount point is discarded by the upper level code.
		 * Note that vfs_mount() populates f_mntonname for us.
		 *
		 * ffs_mountfs() unlocks devvp itself, so on failure only
		 * the reference is dropped here.
		 */
		if ((error = ffs_mountfs(devvp, mp, td)) != 0) {
			vrele(devvp);
			return (error);
		}
	}
	vfs_mountedfrom(mp, fspec);
	return (0);
}
408
409/*
410 * Compatibility with old mount system call.
411 */
412
413static int
414ffs_cmount(struct mntarg *ma, void *data, int flags, struct thread *td)
415{
416	struct ufs_args args;
417	int error;
418
419	if (data == NULL)
420		return (EINVAL);
421	error = copyin(data, &args, sizeof args);
422	if (error)
423		return (error);
424
425	ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN);
426	ma = mount_arg(ma, "export", &args.export, sizeof args.export);
427	error = kernel_mount(ma, flags);
428
429	return (error);
430}
431
432/*
433 * Reload all incore data for a filesystem (used after running fsck on
434 * the root filesystem and finding things to fix). The filesystem must
435 * be mounted read-only.
436 *
437 * Things to do to update the mount:
438 *	1) invalidate all cached meta-data.
439 *	2) re-read superblock from disk.
440 *	3) re-read summary information from disk.
441 *	4) invalidate all inactive vnodes.
442 *	5) invalidate all cached file data.
443 *	6) re-read inode data for all active vnodes.
444 */
445static int
446ffs_reload(struct mount *mp, struct thread *td)
447{
448	struct vnode *vp, *mvp, *devvp;
449	struct inode *ip;
450	void *space;
451	struct buf *bp;
452	struct fs *fs, *newfs;
453	struct ufsmount *ump;
454	ufs2_daddr_t sblockloc;
455	int i, blks, size, error;
456	int32_t *lp;
457
458	if ((mp->mnt_flag & MNT_RDONLY) == 0)
459		return (EINVAL);
460	ump = VFSTOUFS(mp);
461	/*
462	 * Step 1: invalidate all cached meta-data.
463	 */
464	devvp = VFSTOUFS(mp)->um_devvp;
465	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, td);
466	if (vinvalbuf(devvp, 0, td, 0, 0) != 0)
467		panic("ffs_reload: dirty1");
468	VOP_UNLOCK(devvp, 0, td);
469
470	/*
471	 * Step 2: re-read superblock from disk.
472	 */
473	fs = VFSTOUFS(mp)->um_fs;
474	if ((error = bread(devvp, btodb(fs->fs_sblockloc), fs->fs_sbsize,
475	    NOCRED, &bp)) != 0)
476		return (error);
477	newfs = (struct fs *)bp->b_data;
478	if ((newfs->fs_magic != FS_UFS1_MAGIC &&
479	     newfs->fs_magic != FS_UFS2_MAGIC) ||
480	    newfs->fs_bsize > MAXBSIZE ||
481	    newfs->fs_bsize < sizeof(struct fs)) {
482			brelse(bp);
483			return (EIO);		/* XXX needs translation */
484	}
485	/*
486	 * Copy pointer fields back into superblock before copying in	XXX
487	 * new superblock. These should really be in the ufsmount.	XXX
488	 * Note that important parameters (eg fs_ncg) are unchanged.
489	 */
490	newfs->fs_csp = fs->fs_csp;
491	newfs->fs_maxcluster = fs->fs_maxcluster;
492	newfs->fs_contigdirs = fs->fs_contigdirs;
493	newfs->fs_active = fs->fs_active;
494	/* The file system is still read-only. */
495	newfs->fs_ronly = 1;
496	sblockloc = fs->fs_sblockloc;
497	bcopy(newfs, fs, (u_int)fs->fs_sbsize);
498	brelse(bp);
499	mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen;
500	ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc);
501	UFS_LOCK(ump);
502	if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
503		printf("%s: reload pending error: blocks %jd files %d\n",
504		    fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
505		    fs->fs_pendinginodes);
506		fs->fs_pendingblocks = 0;
507		fs->fs_pendinginodes = 0;
508	}
509	UFS_UNLOCK(ump);
510
511	/*
512	 * Step 3: re-read summary information from disk.
513	 */
514	blks = howmany(fs->fs_cssize, fs->fs_fsize);
515	space = fs->fs_csp;
516	for (i = 0; i < blks; i += fs->fs_frag) {
517		size = fs->fs_bsize;
518		if (i + fs->fs_frag > blks)
519			size = (blks - i) * fs->fs_fsize;
520		error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size,
521		    NOCRED, &bp);
522		if (error)
523			return (error);
524		bcopy(bp->b_data, space, (u_int)size);
525		space = (char *)space + size;
526		brelse(bp);
527	}
528	/*
529	 * We no longer know anything about clusters per cylinder group.
530	 */
531	if (fs->fs_contigsumsize > 0) {
532		lp = fs->fs_maxcluster;
533		for (i = 0; i < fs->fs_ncg; i++)
534			*lp++ = fs->fs_contigsumsize;
535	}
536
537loop:
538	MNT_ILOCK(mp);
539	MNT_VNODE_FOREACH(vp, mp, mvp) {
540		VI_LOCK(vp);
541		if (vp->v_iflag & VI_DOOMED) {
542			VI_UNLOCK(vp);
543			continue;
544		}
545		MNT_IUNLOCK(mp);
546		/*
547		 * Step 4: invalidate all cached file data.
548		 */
549		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
550			MNT_VNODE_FOREACH_ABORT(mp, mvp);
551			goto loop;
552		}
553		if (vinvalbuf(vp, 0, td, 0, 0))
554			panic("ffs_reload: dirty2");
555		/*
556		 * Step 5: re-read inode data for all active vnodes.
557		 */
558		ip = VTOI(vp);
559		error =
560		    bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
561		    (int)fs->fs_bsize, NOCRED, &bp);
562		if (error) {
563			VOP_UNLOCK(vp, 0, td);
564			vrele(vp);
565			MNT_VNODE_FOREACH_ABORT(mp, mvp);
566			return (error);
567		}
568		ffs_load_inode(bp, ip, fs, ip->i_number);
569		ip->i_effnlink = ip->i_nlink;
570		brelse(bp);
571		VOP_UNLOCK(vp, 0, td);
572		vrele(vp);
573		MNT_ILOCK(mp);
574	}
575	MNT_IUNLOCK(mp);
576	return (0);
577}
578
/*
 * Possible superblock locations ordered from most to least likely.
 * ffs_mountfs() tries each entry in turn and stops at the -1 terminator.
 */
static int sblock_try[] = SBLOCKSEARCH;
583
/*
 * Common code for mount and mountroot
 *
 * Opens the device through GEOM, locates and validates the superblock,
 * allocates the ufsmount and the in-core superblock copy, reads the
 * cylinder group summary area and fills in the mount structure.  For a
 * read/write mount it also starts soft updates (if the superblock asks
 * for them), mounts snapshots and writes the superblock back marked
 * not-clean.
 *
 * devvp is unlocked here right after the GEOM open attempt, whether or
 * not that attempt succeeded.
 *
 * Returns 0 on success.  On failure an errno value is returned after the
 * GEOM consumer has been closed and any ufsmount/superblock allocation
 * has been freed.
 */
static int
ffs_mountfs(devvp, mp, td)
	struct vnode *devvp;
	struct mount *mp;
	struct thread *td;
{
	struct ufsmount *ump;
	struct buf *bp;
	struct fs *fs;
	struct cdev *dev;
	void *space;
	ufs2_daddr_t sblockloc;
	int error, i, blks, size, ronly;
	int32_t *lp;
	struct ucred *cred;
	struct g_consumer *cp;

	dev = devvp->v_rdev;
	cred = td ? td->td_ucred : NOCRED;

	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
	DROP_GIANT();
	g_topology_lock();
	/* The last argument asks for write access unless mounting read-only. */
	error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1);

	/*
	 * If we are a root mount, drop the E flag so fsck can do its magic.
	 * We will pick it up again when we remount R/W.
	 */
	if (error == 0 && ronly && (mp->mnt_flag & MNT_ROOTFS))
		error = g_access(cp, 0, 0, -1);
	g_topology_unlock();
	PICKUP_GIANT();
	VOP_UNLOCK(devvp, 0, td);
	if (error)
		return (error);
	/* Take the device's I/O size limit if it has one; cap at MAXPHYS. */
	if (devvp->v_rdev->si_iosize_max != 0)
		mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
	if (mp->mnt_iosize_max > MAXPHYS)
		mp->mnt_iosize_max = MAXPHYS;

	devvp->v_bufobj.bo_private = cp;
	devvp->v_bufobj.bo_ops = &ffs_ops;

	/* The cleanup code at `out' keys off these being NULL. */
	bp = NULL;
	ump = NULL;
	fs = NULL;
	sblockloc = 0;
	/*
	 * Try reading the superblock in each of its possible locations.
	 * A candidate is accepted when its magic number is recognised (a
	 * UFS2 superblock must additionally record this location, or
	 * predate the updated flags layout) and its block size is sane.
	 */
	for (i = 0; sblock_try[i] != -1; i++) {
		if ((error = bread(devvp, sblock_try[i] / DEV_BSIZE, SBLOCKSIZE,
		    cred, &bp)) != 0)
			goto out;
		fs = (struct fs *)bp->b_data;
		sblockloc = sblock_try[i];
		if ((fs->fs_magic == FS_UFS1_MAGIC ||
		     (fs->fs_magic == FS_UFS2_MAGIC &&
		      (fs->fs_sblockloc == sblockloc ||
		       (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0))) &&
		    fs->fs_bsize <= MAXBSIZE &&
		    fs->fs_bsize >= sizeof(struct fs))
			break;
		brelse(bp);
		bp = NULL;
	}
	if (sblock_try[i] == -1) {
		error = EINVAL;		/* XXX needs translation */
		goto out;
	}
	fs->fs_fmod = 0;
	fs->fs_flags &= ~FS_INDEXDIRS;	/* no support for directory indices */
	/*
	 * Recompute FS_UNCLEAN from fs_clean.  An unclean filesystem is
	 * accepted read-only, when forced, or when it uses soft updates
	 * and does not need fsck; otherwise the mount is refused.
	 */
	fs->fs_flags &= ~FS_UNCLEAN;
	if (fs->fs_clean == 0) {
		fs->fs_flags |= FS_UNCLEAN;
		if (ronly || (mp->mnt_flag & MNT_FORCE) ||
		    ((fs->fs_flags & FS_NEEDSFSCK) == 0 &&
		     (fs->fs_flags & FS_DOSOFTDEP))) {
			printf(
"WARNING: %s was not properly dismounted\n",
			    fs->fs_fsmnt);
		} else {
			printf(
"WARNING: R/W mount of %s denied.  Filesystem is not clean - run fsck\n",
			    fs->fs_fsmnt);
			error = EPERM;
			goto out;
		}
		if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) &&
		    (mp->mnt_flag & MNT_FORCE)) {
			printf("%s: lost blocks %jd files %d\n", fs->fs_fsmnt,
			    (intmax_t)fs->fs_pendingblocks,
			    fs->fs_pendinginodes);
			fs->fs_pendingblocks = 0;
			fs->fs_pendinginodes = 0;
		}
	}
	if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
		printf("%s: mount pending error: blocks %jd files %d\n",
		    fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
		    fs->fs_pendinginodes);
		fs->fs_pendingblocks = 0;
		fs->fs_pendinginodes = 0;
	}
	/* Allocate the per-mount structure and the in-core superblock. */
	ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO);
	ump->um_cp = cp;
	ump->um_bo = &devvp->v_bufobj;
	ump->um_fs = malloc((u_long)fs->fs_sbsize, M_UFSMNT, M_WAITOK);
	if (fs->fs_magic == FS_UFS1_MAGIC) {
		ump->um_fstype = UFS1;
		ump->um_balloc = ffs_balloc_ufs1;
	} else {
		ump->um_fstype = UFS2;
		ump->um_balloc = ffs_balloc_ufs2;
	}
	ump->um_blkatoff = ffs_blkatoff;
	ump->um_truncate = ffs_truncate;
	ump->um_update = ffs_update;
	ump->um_valloc = ffs_valloc;
	ump->um_vfree = ffs_vfree;
	ump->um_ifree = ffs_ifree;
	mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF);
	bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize);
	if (fs->fs_sbsize < SBLOCKSIZE)
		bp->b_flags |= B_INVAL | B_NOCACHE;
	brelse(bp);
	bp = NULL;
	/* From here on fs refers to the private in-core copy. */
	fs = ump->um_fs;
	ffs_oldfscompat_read(fs, ump, sblockloc);
	fs->fs_ronly = ronly;
	/*
	 * A single allocation holds the summary information, followed by
	 * the fs_maxcluster array (only when fs_contigsumsize > 0) and the
	 * fs_contigdirs array.
	 */
	size = fs->fs_cssize;
	blks = howmany(size, fs->fs_fsize);
	if (fs->fs_contigsumsize > 0)
		size += fs->fs_ncg * sizeof(int32_t);
	size += fs->fs_ncg * sizeof(u_int8_t);
	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
	fs->fs_csp = space;
	for (i = 0; i < blks; i += fs->fs_frag) {
		size = fs->fs_bsize;
		if (i + fs->fs_frag > blks)
			size = (blks - i) * fs->fs_fsize;
		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size,
		    cred, &bp)) != 0) {
			free(fs->fs_csp, M_UFSMNT);
			goto out;
		}
		bcopy(bp->b_data, space, (u_int)size);
		space = (char *)space + size;
		brelse(bp);
		bp = NULL;
	}
	if (fs->fs_contigsumsize > 0) {
		fs->fs_maxcluster = lp = space;
		for (i = 0; i < fs->fs_ncg; i++)
			*lp++ = fs->fs_contigsumsize;
		space = lp;
	}
	size = fs->fs_ncg * sizeof(u_int8_t);
	fs->fs_contigdirs = (u_int8_t *)space;
	bzero(fs->fs_contigdirs, size);
	fs->fs_active = NULL;
	mp->mnt_data = (qaddr_t)ump;
	/*
	 * Use the fsid stored in the superblock unless either half is zero
	 * or it is already in use by another mount; then generate a new one.
	 */
	mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0];
	mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1];
	if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 ||
	    vfs_getvfs(&mp->mnt_stat.f_fsid))
		vfs_getnewfsid(mp);
	mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen;
	mp->mnt_flag |= MNT_LOCAL;
	/*
	 * Propagate the multilabel and ACL superblock flags to the mount,
	 * or warn when the kernel was built without the matching support.
	 */
	if ((fs->fs_flags & FS_MULTILABEL) != 0)
#ifdef MAC
		mp->mnt_flag |= MNT_MULTILABEL;
#else
		printf(
"WARNING: %s: multilabel flag on fs but no MAC support\n",
		    fs->fs_fsmnt);
#endif
	if ((fs->fs_flags & FS_ACLS) != 0)
#ifdef UFS_ACL
		mp->mnt_flag |= MNT_ACLS;
#else
		printf(
"WARNING: %s: ACLs flag on fs but no ACLs support\n",
		    fs->fs_fsmnt);
#endif
	ump->um_mountp = mp;
	ump->um_dev = dev;
	ump->um_devvp = devvp;
	ump->um_nindir = fs->fs_nindir;
	ump->um_bptrtodb = fs->fs_fsbtodb;
	ump->um_seqinc = fs->fs_frag;
	for (i = 0; i < MAXQUOTAS; i++)
		ump->um_quotas[i] = NULLVP;
#ifdef UFS_EXTATTR
	ufs_extattr_uepm_init(&ump->um_extattr);
#endif
	/*
	 * Set FS local "last mounted on" information (NULL pad)
	 */
	bzero(fs->fs_fsmnt, MAXMNTLEN);
	strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN);

	if( mp->mnt_flag & MNT_ROOTFS) {
		/*
		 * Root mount; update timestamp in mount structure.
		 * this will be used by the common root mount code
		 * to update the system clock.
		 */
		mp->mnt_time = fs->fs_time;
	}

	if (ronly == 0) {
		if ((fs->fs_flags & FS_DOSOFTDEP) &&
		    (error = softdep_mount(devvp, mp, fs, cred)) != 0) {
			free(fs->fs_csp, M_UFSMNT);
			goto out;
		}
		if (fs->fs_snapinum[0] != 0)
			ffs_snapshot_mount(mp);
		/* Mark the fs modified and not clean; the write result is ignored. */
		fs->fs_fmod = 1;
		fs->fs_clean = 0;
		(void) ffs_sbupdate(ump, MNT_WAIT, 0);
	}
	/*
	 * Optionally auto-start extended attributes, then flag the mount
	 * MPSAFE when the kernel is built without QUOTA.
	 */
#ifdef UFS_EXTATTR
#ifdef UFS_EXTATTR_AUTOSTART
	/*
	 *
	 * Auto-starting does the following:
	 *	- check for /.attribute in the fs, and extattr_start if so
	 *	- for each file in .attribute, enable that file with
	 * 	  an attribute of the same name.
	 * Not clear how to report errors -- probably eat them.
	 * This would all happen while the filesystem was busy/not
	 * available, so would effectively be "atomic".
	 */
	(void) ufs_extattr_autostart(mp, td);
#endif /* UFS_EXTATTR_AUTOSTART */
#endif /* UFS_EXTATTR */
#ifndef QUOTA
	mp->mnt_kern_flag |= MNTK_MPSAFE;
#endif
	return (0);
out:
	/*
	 * Error exit: release a buffer still held, close the GEOM
	 * consumer and free whatever was allocated above.  fs_csp has
	 * already been freed by the paths that allocated it.
	 */
	if (bp)
		brelse(bp);
	if (cp != NULL) {
		DROP_GIANT();
		g_topology_lock();
		g_vfs_close(cp, td);
		g_topology_unlock();
		PICKUP_GIANT();
	}
	if (ump) {
		mtx_destroy(UFS_MTX(ump));
		free(ump->um_fs, M_UFSMNT);
		free(ump, M_UFSMNT);
		mp->mnt_data = (qaddr_t)0;
	}
	return (error);
}
851
#include <sys/sysctl.h>
/*
 * Read/write sysctl debug.bigcgs.  When non-zero, ffs_oldfscompat_read()
 * saves fs_cgsize in fs_save_cgsize and replaces it with fs_bsize, and
 * ffs_oldfscompat_write() restores the saved value.
 */
static int bigcgs = 0;
SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0, "");
855
856/*
857 * Sanity checks for loading old filesystem superblocks.
858 * See ffs_oldfscompat_write below for unwound actions.
859 *
860 * XXX - Parts get retired eventually.
861 * Unfortunately new bits get added.
862 */
863static void
864ffs_oldfscompat_read(fs, ump, sblockloc)
865	struct fs *fs;
866	struct ufsmount *ump;
867	ufs2_daddr_t sblockloc;
868{
869	off_t maxfilesize;
870
871	/*
872	 * If not yet done, update fs_flags location and value of fs_sblockloc.
873	 */
874	if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) {
875		fs->fs_flags = fs->fs_old_flags;
876		fs->fs_old_flags |= FS_FLAGS_UPDATED;
877		fs->fs_sblockloc = sblockloc;
878	}
879	/*
880	 * If not yet done, update UFS1 superblock with new wider fields.
881	 */
882	if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) {
883		fs->fs_maxbsize = fs->fs_bsize;
884		fs->fs_time = fs->fs_old_time;
885		fs->fs_size = fs->fs_old_size;
886		fs->fs_dsize = fs->fs_old_dsize;
887		fs->fs_csaddr = fs->fs_old_csaddr;
888		fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir;
889		fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree;
890		fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree;
891		fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree;
892	}
893	if (fs->fs_magic == FS_UFS1_MAGIC &&
894	    fs->fs_old_inodefmt < FS_44INODEFMT) {
895		fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1;
896		fs->fs_qbmask = ~fs->fs_bmask;
897		fs->fs_qfmask = ~fs->fs_fmask;
898	}
899	if (fs->fs_magic == FS_UFS1_MAGIC) {
900		ump->um_savedmaxfilesize = fs->fs_maxfilesize;
901		maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1;
902		if (fs->fs_maxfilesize > maxfilesize)
903			fs->fs_maxfilesize = maxfilesize;
904	}
905	/* Compatibility for old filesystems */
906	if (fs->fs_avgfilesize <= 0)
907		fs->fs_avgfilesize = AVFILESIZ;
908	if (fs->fs_avgfpdir <= 0)
909		fs->fs_avgfpdir = AFPDIR;
910	if (bigcgs) {
911		fs->fs_save_cgsize = fs->fs_cgsize;
912		fs->fs_cgsize = fs->fs_bsize;
913	}
914}
915
916/*
917 * Unwinding superblock updates for old filesystems.
918 * See ffs_oldfscompat_read above for details.
919 *
920 * XXX - Parts get retired eventually.
921 * Unfortunately new bits get added.
922 */
923static void
924ffs_oldfscompat_write(fs, ump)
925	struct fs *fs;
926	struct ufsmount *ump;
927{
928
929	/*
930	 * Copy back UFS2 updated fields that UFS1 inspects.
931	 */
932	if (fs->fs_magic == FS_UFS1_MAGIC) {
933		fs->fs_old_time = fs->fs_time;
934		fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir;
935		fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree;
936		fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree;
937		fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree;
938		fs->fs_maxfilesize = ump->um_savedmaxfilesize;
939	}
940	if (bigcgs) {
941		fs->fs_cgsize = fs->fs_save_cgsize;
942		fs->fs_save_cgsize = 0;
943	}
944}
945
946/*
947 * unmount system call
948 */
949static int
950ffs_unmount(mp, mntflags, td)
951	struct mount *mp;
952	int mntflags;
953	struct thread *td;
954{
955	struct ufsmount *ump = VFSTOUFS(mp);
956	struct fs *fs;
957	int error, flags;
958
959	flags = 0;
960	if (mntflags & MNT_FORCE) {
961		flags |= FORCECLOSE;
962	}
963#ifdef UFS_EXTATTR
964	if ((error = ufs_extattr_stop(mp, td))) {
965		if (error != EOPNOTSUPP)
966			printf("ffs_unmount: ufs_extattr_stop returned %d\n",
967			    error);
968	} else {
969		ufs_extattr_uepm_destroy(&ump->um_extattr);
970	}
971#endif
972	if (mp->mnt_flag & MNT_SOFTDEP) {
973		if ((error = softdep_flushfiles(mp, flags, td)) != 0)
974			return (error);
975	} else {
976		if ((error = ffs_flushfiles(mp, flags, td)) != 0)
977			return (error);
978	}
979	fs = ump->um_fs;
980	UFS_LOCK(ump);
981	if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
982		printf("%s: unmount pending error: blocks %jd files %d\n",
983		    fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
984		    fs->fs_pendinginodes);
985		fs->fs_pendingblocks = 0;
986		fs->fs_pendinginodes = 0;
987	}
988	UFS_UNLOCK(ump);
989	if (fs->fs_ronly == 0) {
990		fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1;
991		error = ffs_sbupdate(ump, MNT_WAIT, 0);
992		if (error) {
993			fs->fs_clean = 0;
994			return (error);
995		}
996	}
997	DROP_GIANT();
998	g_topology_lock();
999	g_vfs_close(ump->um_cp, td);
1000	g_topology_unlock();
1001	PICKUP_GIANT();
1002	vrele(ump->um_devvp);
1003	mtx_destroy(UFS_MTX(ump));
1004	free(fs->fs_csp, M_UFSMNT);
1005	free(fs, M_UFSMNT);
1006	free(ump, M_UFSMNT);
1007	mp->mnt_data = (qaddr_t)0;
1008	mp->mnt_flag &= ~MNT_LOCAL;
1009	return (error);
1010}
1011
1012/*
1013 * Flush out all the files in a filesystem.
1014 */
1015int
1016ffs_flushfiles(mp, flags, td)
1017	struct mount *mp;
1018	int flags;
1019	struct thread *td;
1020{
1021	struct ufsmount *ump;
1022	int error;
1023
1024	ump = VFSTOUFS(mp);
1025#ifdef QUOTA
1026	if (mp->mnt_flag & MNT_QUOTA) {
1027		int i;
1028		error = vflush(mp, 0, SKIPSYSTEM|flags, td);
1029		if (error)
1030			return (error);
1031		for (i = 0; i < MAXQUOTAS; i++) {
1032			if (ump->um_quotas[i] == NULLVP)
1033				continue;
1034			quotaoff(td, mp, i);
1035		}
1036		/*
1037		 * Here we fall through to vflush again to ensure
1038		 * that we have gotten rid of all the system vnodes.
1039		 */
1040	}
1041#endif
1042	ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_flushfiles");
1043	if (ump->um_devvp->v_vflag & VV_COPYONWRITE) {
1044		if ((error = vflush(mp, 0, SKIPSYSTEM | flags, td)) != 0)
1045			return (error);
1046		ffs_snapshot_unmount(mp);
1047		flags |= FORCECLOSE;
1048		/*
1049		 * Here we fall through to vflush again to ensure
1050		 * that we have gotten rid of all the system vnodes.
1051		 */
1052	}
1053        /*
1054	 * Flush all the files.
1055	 */
1056	if ((error = vflush(mp, 0, flags, td)) != 0)
1057		return (error);
1058	/*
1059	 * Flush filesystem metadata.
1060	 */
1061	vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY, td);
1062	error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td);
1063	VOP_UNLOCK(ump->um_devvp, 0, td);
1064	return (error);
1065}
1066
1067/*
1068 * Get filesystem statistics.
1069 */
1070static int
1071ffs_statfs(mp, sbp, td)
1072	struct mount *mp;
1073	struct statfs *sbp;
1074	struct thread *td;
1075{
1076	struct ufsmount *ump;
1077	struct fs *fs;
1078
1079	ump = VFSTOUFS(mp);
1080	fs = ump->um_fs;
1081	if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC)
1082		panic("ffs_statfs");
1083	sbp->f_version = STATFS_VERSION;
1084	sbp->f_bsize = fs->fs_fsize;
1085	sbp->f_iosize = fs->fs_bsize;
1086	sbp->f_blocks = fs->fs_dsize;
1087	UFS_LOCK(ump);
1088	sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag +
1089	    fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks);
1090	sbp->f_bavail = freespace(fs, fs->fs_minfree) +
1091	    dbtofsb(fs, fs->fs_pendingblocks);
1092	sbp->f_files =  fs->fs_ncg * fs->fs_ipg - ROOTINO;
1093	sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes;
1094	UFS_UNLOCK(ump);
1095	sbp->f_namemax = NAME_MAX;
1096	return (0);
1097}
1098
1099/*
1100 * Go through the disk queues to initiate sandbagged IO;
1101 * go through the inodes to write those that have been modified;
1102 * initiate the writing of the super block if it has been modified.
1103 *
1104 * Note: we are always called with the filesystem marked `MPBUSY'.
1105 */
1106static int
1107ffs_sync(mp, waitfor, td)
1108	struct mount *mp;
1109	int waitfor;
1110	struct thread *td;
1111{
1112	struct vnode *mvp, *vp, *devvp;
1113	struct inode *ip;
1114	struct ufsmount *ump = VFSTOUFS(mp);
1115	struct fs *fs;
1116	int error, count, wait, lockreq, allerror = 0;
1117	int suspend;
1118	int suspended;
1119	int secondary_writes;
1120	int secondary_accwrites;
1121	int softdep_deps;
1122	int softdep_accdeps;
1123	struct bufobj *bo;
1124
1125	fs = ump->um_fs;
1126	if (fs->fs_fmod != 0 && fs->fs_ronly != 0) {		/* XXX */
1127		printf("fs = %s\n", fs->fs_fsmnt);
1128		panic("ffs_sync: rofs mod");
1129	}
1130	/*
1131	 * Write back each (modified) inode.
1132	 */
1133	wait = 0;
1134	suspend = 0;
1135	suspended = 0;
1136	lockreq = LK_EXCLUSIVE | LK_NOWAIT;
1137	if (waitfor == MNT_SUSPEND) {
1138		suspend = 1;
1139		waitfor = MNT_WAIT;
1140	}
1141	if (waitfor == MNT_WAIT) {
1142		wait = 1;
1143		lockreq = LK_EXCLUSIVE;
1144	}
1145	lockreq |= LK_INTERLOCK | LK_SLEEPFAIL;
1146	MNT_ILOCK(mp);
1147loop:
1148	/* Grab snapshot of secondary write counts */
1149	secondary_writes = mp->mnt_secondary_writes;
1150	secondary_accwrites = mp->mnt_secondary_accwrites;
1151
1152	/* Grab snapshot of softdep dependency counts */
1153	MNT_IUNLOCK(mp);
1154	softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps);
1155	MNT_ILOCK(mp);
1156
1157	MNT_VNODE_FOREACH(vp, mp, mvp) {
1158		/*
1159		 * Depend on the mntvnode_slock to keep things stable enough
1160		 * for a quick test.  Since there might be hundreds of
1161		 * thousands of vnodes, we cannot afford even a subroutine
1162		 * call unless there's a good chance that we have work to do.
1163		 */
1164		VI_LOCK(vp);
1165		if (vp->v_iflag & VI_DOOMED) {
1166			VI_UNLOCK(vp);
1167			continue;
1168		}
1169		ip = VTOI(vp);
1170		if (vp->v_type == VNON || ((ip->i_flag &
1171		    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
1172		    vp->v_bufobj.bo_dirty.bv_cnt == 0)) {
1173			VI_UNLOCK(vp);
1174			continue;
1175		}
1176		MNT_IUNLOCK(mp);
1177		if ((error = vget(vp, lockreq, td)) != 0) {
1178			MNT_ILOCK(mp);
1179			if (error == ENOENT || error == ENOLCK) {
1180				MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
1181				goto loop;
1182			}
1183			continue;
1184		}
1185		if ((error = ffs_syncvnode(vp, waitfor)) != 0)
1186			allerror = error;
1187		vput(vp);
1188		MNT_ILOCK(mp);
1189	}
1190	MNT_IUNLOCK(mp);
1191	/*
1192	 * Force stale filesystem control information to be flushed.
1193	 */
1194	if (waitfor == MNT_WAIT) {
1195		if ((error = softdep_flushworklist(ump->um_mountp, &count, td)))
1196			allerror = error;
1197		/* Flushed work items may create new vnodes to clean */
1198		if (allerror == 0 && count) {
1199			MNT_ILOCK(mp);
1200			goto loop;
1201		}
1202	}
1203#ifdef QUOTA
1204	qsync(mp);
1205#endif
1206	devvp = ump->um_devvp;
1207	VI_LOCK(devvp);
1208	bo = &devvp->v_bufobj;
1209	if (waitfor != MNT_LAZY &&
1210	    (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) {
1211		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, td);
1212		if ((error = VOP_FSYNC(devvp, waitfor, td)) != 0)
1213			allerror = error;
1214		VOP_UNLOCK(devvp, 0, td);
1215		if (allerror == 0 && waitfor == MNT_WAIT) {
1216			MNT_ILOCK(mp);
1217			goto loop;
1218		}
1219	} else if (suspend != 0) {
1220		if (softdep_check_suspend(mp,
1221					  devvp,
1222					  softdep_deps,
1223					  softdep_accdeps,
1224					  secondary_writes,
1225					  secondary_accwrites) != 0)
1226			goto loop;	/* More work needed */
1227		mtx_assert(MNT_MTX(mp), MA_OWNED);
1228		mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED;
1229		MNT_IUNLOCK(mp);
1230		suspended = 1;
1231	} else
1232		VI_UNLOCK(devvp);
1233	/*
1234	 * Write back modified superblock.
1235	 */
1236	if (fs->fs_fmod != 0 &&
1237	    (error = ffs_sbupdate(ump, waitfor, suspended)) != 0)
1238		allerror = error;
1239	return (allerror);
1240}
1241
1242int
1243ffs_vget(mp, ino, flags, vpp)
1244	struct mount *mp;
1245	ino_t ino;
1246	int flags;
1247	struct vnode **vpp;
1248{
1249	struct fs *fs;
1250	struct inode *ip;
1251	struct ufsmount *ump;
1252	struct buf *bp;
1253	struct vnode *vp;
1254	struct cdev *dev;
1255	int error;
1256
1257	error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL);
1258	if (error || *vpp != NULL)
1259		return (error);
1260
1261	/*
1262	 * We must promote to an exclusive lock for vnode creation.  This
1263	 * can happen if lookup is passed LOCKSHARED.
1264 	 */
1265	if ((flags & LK_TYPE_MASK) == LK_SHARED) {
1266		flags &= ~LK_TYPE_MASK;
1267		flags |= LK_EXCLUSIVE;
1268	}
1269
1270	/*
1271	 * We do not lock vnode creation as it is believed to be too
1272	 * expensive for such rare case as simultaneous creation of vnode
1273	 * for same ino by different processes. We just allow them to race
1274	 * and check later to decide who wins. Let the race begin!
1275	 */
1276
1277	ump = VFSTOUFS(mp);
1278	dev = ump->um_dev;
1279	fs = ump->um_fs;
1280
1281	/*
1282	 * If this MALLOC() is performed after the getnewvnode()
1283	 * it might block, leaving a vnode with a NULL v_data to be
1284	 * found by ffs_sync() if a sync happens to fire right then,
1285	 * which will cause a panic because ffs_sync() blindly
1286	 * dereferences vp->v_data (as well it should).
1287	 */
1288	ip = uma_zalloc(uma_inode, M_WAITOK | M_ZERO);
1289
1290	/* Allocate a new vnode/inode. */
1291	if (fs->fs_magic == FS_UFS1_MAGIC)
1292		error = getnewvnode("ufs", mp, &ffs_vnodeops1, &vp);
1293	else
1294		error = getnewvnode("ufs", mp, &ffs_vnodeops2, &vp);
1295	if (error) {
1296		*vpp = NULL;
1297		uma_zfree(uma_inode, ip);
1298		return (error);
1299	}
1300	/*
1301	 * FFS supports recursive and shared locking.
1302	 */
1303	vp->v_vnlock->lk_flags |= LK_CANRECURSE;
1304	vp->v_vnlock->lk_flags &= ~LK_NOSHARE;
1305	vp->v_data = ip;
1306	vp->v_bufobj.bo_bsize = fs->fs_bsize;
1307	ip->i_vnode = vp;
1308	ip->i_ump = ump;
1309	ip->i_fs = fs;
1310	ip->i_dev = dev;
1311	ip->i_number = ino;
1312#ifdef QUOTA
1313	{
1314		int i;
1315		for (i = 0; i < MAXQUOTAS; i++)
1316			ip->i_dquot[i] = NODQUOT;
1317	}
1318#endif
1319
1320	error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL);
1321	if (error || *vpp != NULL)
1322		return (error);
1323
1324	/* Read in the disk contents for the inode, copy into the inode. */
1325	error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)),
1326	    (int)fs->fs_bsize, NOCRED, &bp);
1327	if (error) {
1328		/*
1329		 * The inode does not contain anything useful, so it would
1330		 * be misleading to leave it on its hash chain. With mode
1331		 * still zero, it will be unlinked and returned to the free
1332		 * list by vput().
1333		 */
1334		brelse(bp);
1335		vput(vp);
1336		*vpp = NULL;
1337		return (error);
1338	}
1339	if (ip->i_ump->um_fstype == UFS1)
1340		ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK);
1341	else
1342		ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK);
1343	ffs_load_inode(bp, ip, fs, ino);
1344	if (DOINGSOFTDEP(vp))
1345		softdep_load_inodeblock(ip);
1346	else
1347		ip->i_effnlink = ip->i_nlink;
1348	bqrelse(bp);
1349
1350	/*
1351	 * Initialize the vnode from the inode, check for aliases.
1352	 * Note that the underlying vnode may have changed.
1353	 */
1354	if (ip->i_ump->um_fstype == UFS1)
1355		error = ufs_vinit(mp, &ffs_fifoops1, &vp);
1356	else
1357		error = ufs_vinit(mp, &ffs_fifoops2, &vp);
1358	if (error) {
1359		vput(vp);
1360		*vpp = NULL;
1361		return (error);
1362	}
1363
1364	/*
1365	 * Finish inode initialization.
1366	 */
1367
1368	/*
1369	 * Set up a generation number for this inode if it does not
1370	 * already have one. This should only happen on old filesystems.
1371	 */
1372	if (ip->i_gen == 0) {
1373		ip->i_gen = arc4random() / 2 + 1;
1374		if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
1375			ip->i_flag |= IN_MODIFIED;
1376			DIP_SET(ip, i_gen, ip->i_gen);
1377		}
1378	}
1379	/*
1380	 * Ensure that uid and gid are correct. This is a temporary
1381	 * fix until fsck has been changed to do the update.
1382	 */
1383	if (fs->fs_magic == FS_UFS1_MAGIC &&		/* XXX */
1384	    fs->fs_old_inodefmt < FS_44INODEFMT) {	/* XXX */
1385		ip->i_uid = ip->i_din1->di_ouid;	/* XXX */
1386		ip->i_gid = ip->i_din1->di_ogid;	/* XXX */
1387	}						/* XXX */
1388
1389#ifdef MAC
1390	if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) {
1391		/*
1392		 * If this vnode is already allocated, and we're running
1393		 * multi-label, attempt to perform a label association
1394		 * from the extended attributes on the inode.
1395		 */
1396		error = mac_associate_vnode_extattr(mp, vp);
1397		if (error) {
1398			/* ufs_inactive will release ip->i_devvp ref. */
1399			vput(vp);
1400			*vpp = NULL;
1401			return (error);
1402		}
1403	}
1404#endif
1405
1406	*vpp = vp;
1407	return (0);
1408}
1409
1410/*
1411 * File handle to vnode
1412 *
1413 * Have to be really careful about stale file handles:
1414 * - check that the inode number is valid
1415 * - call ffs_vget() to get the locked inode
1416 * - check for an unallocated inode (i_mode == 0)
1417 * - check that the given client host has export rights and return
1418 *   those rights via. exflagsp and credanonp
1419 */
1420static int
1421ffs_fhtovp(mp, fhp, vpp)
1422	struct mount *mp;
1423	struct fid *fhp;
1424	struct vnode **vpp;
1425{
1426	struct ufid *ufhp;
1427	struct fs *fs;
1428
1429	ufhp = (struct ufid *)fhp;
1430	fs = VFSTOUFS(mp)->um_fs;
1431	if (ufhp->ufid_ino < ROOTINO ||
1432	    ufhp->ufid_ino >= fs->fs_ncg * fs->fs_ipg)
1433		return (ESTALE);
1434	return (ufs_fhtovp(mp, ufhp, vpp));
1435}
1436
1437/*
1438 * Vnode pointer to File handle
1439 */
1440/* ARGSUSED */
1441static int
1442ffs_vptofh(vp, fhp)
1443	struct vnode *vp;
1444	struct fid *fhp;
1445{
1446	struct inode *ip;
1447	struct ufid *ufhp;
1448
1449	ip = VTOI(vp);
1450	ufhp = (struct ufid *)fhp;
1451	ufhp->ufid_len = sizeof(struct ufid);
1452	ufhp->ufid_ino = ip->i_number;
1453	ufhp->ufid_gen = ip->i_gen;
1454	return (0);
1455}
1456
/*
 * Initialize the filesystem: set up soft updates first, then run the
 * generic UFS initialization and return its result.
 */
static int
ffs_init(struct vfsconf *vfsp)
{
	int error;

	softdep_initialize();
	error = ufs_init(vfsp);
	return (error);
}
1468
/*
 * Undo the work of ffs_init(), in reverse order: the generic UFS
 * teardown runs first and its result is returned after soft updates
 * have been shut down.
 */
static int
ffs_uninit(struct vfsconf *vfsp)
{
	int error;

	error = ufs_uninit(vfsp);
	softdep_uninitialize();
	return (error);
}
1482
1483/*
1484 * Write a superblock and associated information back to disk.
1485 */
1486static int
1487ffs_sbupdate(mp, waitfor, suspended)
1488	struct ufsmount *mp;
1489	int waitfor;
1490	int suspended;
1491{
1492	struct fs *fs = mp->um_fs;
1493	struct buf *sbbp;
1494	struct buf *bp;
1495	int blks;
1496	void *space;
1497	int i, size, error, allerror = 0;
1498
1499	if (fs->fs_ronly == 1 &&
1500	    (mp->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) !=
1501	    (MNT_RDONLY | MNT_UPDATE))
1502		panic("ffs_sbupdate: write read-only filesystem");
1503	/*
1504	 * We use the superblock's buf to serialize calls to ffs_sbupdate().
1505	 */
1506	sbbp = getblk(mp->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize,
1507	    0, 0, 0);
1508	/*
1509	 * First write back the summary information.
1510	 */
1511	blks = howmany(fs->fs_cssize, fs->fs_fsize);
1512	space = fs->fs_csp;
1513	for (i = 0; i < blks; i += fs->fs_frag) {
1514		size = fs->fs_bsize;
1515		if (i + fs->fs_frag > blks)
1516			size = (blks - i) * fs->fs_fsize;
1517		bp = getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i),
1518		    size, 0, 0, 0);
1519		bcopy(space, bp->b_data, (u_int)size);
1520		space = (char *)space + size;
1521		if (suspended)
1522			bp->b_flags |= B_VALIDSUSPWRT;
1523		if (waitfor != MNT_WAIT)
1524			bawrite(bp);
1525		else if ((error = bwrite(bp)) != 0)
1526			allerror = error;
1527	}
1528	/*
1529	 * Now write back the superblock itself. If any errors occurred
1530	 * up to this point, then fail so that the superblock avoids
1531	 * being written out as clean.
1532	 */
1533	if (allerror) {
1534		brelse(sbbp);
1535		return (allerror);
1536	}
1537	bp = sbbp;
1538	if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 &&
1539	    (fs->fs_flags & FS_FLAGS_UPDATED) == 0) {
1540		printf("%s: correcting fs_sblockloc from %jd to %d\n",
1541		    fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1);
1542		fs->fs_sblockloc = SBLOCK_UFS1;
1543	}
1544	if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 &&
1545	    (fs->fs_flags & FS_FLAGS_UPDATED) == 0) {
1546		printf("%s: correcting fs_sblockloc from %jd to %d\n",
1547		    fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2);
1548		fs->fs_sblockloc = SBLOCK_UFS2;
1549	}
1550	fs->fs_fmod = 0;
1551	fs->fs_time = time_second;
1552	bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
1553	ffs_oldfscompat_write((struct fs *)bp->b_data, mp);
1554	if (suspended)
1555		bp->b_flags |= B_VALIDSUSPWRT;
1556	if (waitfor != MNT_WAIT)
1557		bawrite(bp);
1558	else if ((error = bwrite(bp)) != 0)
1559		allerror = error;
1560	return (allerror);
1561}
1562
/*
 * Extended attribute control: forwarded to the UFS implementation when
 * the kernel is built with UFS_EXTATTR, and to the VFS default handler
 * otherwise.  The callee's return value is passed through.
 */
static int
ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp,
	int attrnamespace, const char *attrname, struct thread *td)
{
	int error;

#ifdef UFS_EXTATTR
	error = ufs_extattrctl(mp, cmd, filename_vp, attrnamespace,
	    attrname, td);
#else
	error = vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace,
	    attrname, td);
#endif
	return (error);
}
1576
1577static void
1578ffs_ifree(struct ufsmount *ump, struct inode *ip)
1579{
1580
1581	if (ump->um_fstype == UFS1 && ip->i_din1 != NULL)
1582		uma_zfree(uma_ufs1, ip->i_din1);
1583	else if (ip->i_din2 != NULL)
1584		uma_zfree(uma_ufs2, ip->i_din2);
1585	uma_zfree(uma_inode, ip);
1586}
1587
/*
 * Switch consulted by ffs_bufwrite(): a background copy is only made
 * for a BX_BKGRDWRITE buffer while this is non-zero (the initial value
 * is 1).  Registered read/write under the _debug sysctl node.
 */
static int dobkgrdwrite = 1;
SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0,
    "Do background writes (honoring the BV_BKGRDWRITE flag)?");
1591
1592/*
1593 * Complete a background write started from bwrite.
1594 */
1595static void
1596ffs_backgroundwritedone(struct buf *bp)
1597{
1598	struct bufobj *bufobj;
1599	struct buf *origbp;
1600
1601	/*
1602	 * Find the original buffer that we are writing.
1603	 */
1604	bufobj = bp->b_bufobj;
1605	BO_LOCK(bufobj);
1606	if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL)
1607		panic("backgroundwritedone: lost buffer");
1608	/* Grab an extra reference to be dropped by the bufdone() below. */
1609	bufobj_wrefl(bufobj);
1610	BO_UNLOCK(bufobj);
1611	/*
1612	 * Process dependencies then return any unfinished ones.
1613	 */
1614	if (LIST_FIRST(&bp->b_dep) != NULL)
1615		buf_complete(bp);
1616#ifdef SOFTUPDATES
1617	if (LIST_FIRST(&bp->b_dep) != NULL)
1618		softdep_move_dependencies(bp, origbp);
1619#endif
1620	/*
1621	 * This buffer is marked B_NOCACHE so when it is released
1622	 * by biodone it will be tossed.
1623	 */
1624	bp->b_flags |= B_NOCACHE;
1625	bp->b_flags &= ~B_CACHE;
1626	bufdone(bp);
1627	BO_LOCK(bufobj);
1628	/*
1629	 * Clear the BV_BKGRDINPROG flag in the original buffer
1630	 * and awaken it if it is waiting for the write to complete.
1631	 * If BV_BKGRDINPROG is not set in the original buffer it must
1632	 * have been released and re-instantiated - which is not legal.
1633	 */
1634	KASSERT((origbp->b_vflags & BV_BKGRDINPROG),
1635	    ("backgroundwritedone: lost buffer2"));
1636	origbp->b_vflags &= ~BV_BKGRDINPROG;
1637	if (origbp->b_vflags & BV_BKGRDWAIT) {
1638		origbp->b_vflags &= ~BV_BKGRDWAIT;
1639		wakeup(&origbp->b_xflags);
1640	}
1641	BO_UNLOCK(bufobj);
1642}
1643
1644
1645/*
1646 * Write, release buffer on completion.  (Done by iodone
1647 * if async).  Do not bother writing anything if the buffer
1648 * is invalid.
1649 *
1650 * Note that we set B_CACHE here, indicating that buffer is
1651 * fully valid and thus cacheable.  This is true even of NFS
1652 * now so we set it generally.  This could be set either here
1653 * or in biodone() since the I/O is synchronous.  We put it
1654 * here.
1655 */
1656static int
1657ffs_bufwrite(struct buf *bp)
1658{
1659	int oldflags, s;
1660	struct buf *newbp;
1661
1662	CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1663	if (bp->b_flags & B_INVAL) {
1664		brelse(bp);
1665		return (0);
1666	}
1667
1668	oldflags = bp->b_flags;
1669
1670	if (BUF_REFCNT(bp) == 0)
1671		panic("bufwrite: buffer is not busy???");
1672	s = splbio();
1673	/*
1674	 * If a background write is already in progress, delay
1675	 * writing this block if it is asynchronous. Otherwise
1676	 * wait for the background write to complete.
1677	 */
1678	BO_LOCK(bp->b_bufobj);
1679	if (bp->b_vflags & BV_BKGRDINPROG) {
1680		if (bp->b_flags & B_ASYNC) {
1681			BO_UNLOCK(bp->b_bufobj);
1682			splx(s);
1683			bdwrite(bp);
1684			return (0);
1685		}
1686		bp->b_vflags |= BV_BKGRDWAIT;
1687		msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj), PRIBIO, "bwrbg", 0);
1688		if (bp->b_vflags & BV_BKGRDINPROG)
1689			panic("bufwrite: still writing");
1690	}
1691	BO_UNLOCK(bp->b_bufobj);
1692
1693	/* Mark the buffer clean */
1694	bundirty(bp);
1695
1696	/*
1697	 * If this buffer is marked for background writing and we
1698	 * do not have to wait for it, make a copy and write the
1699	 * copy so as to leave this buffer ready for further use.
1700	 *
1701	 * This optimization eats a lot of memory.  If we have a page
1702	 * or buffer shortfall we can't do it.
1703	 */
1704	if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) &&
1705	    (bp->b_flags & B_ASYNC) &&
1706	    !vm_page_count_severe() &&
1707	    !buf_dirty_count_severe()) {
1708		KASSERT(bp->b_iodone == NULL,
1709		    ("bufwrite: needs chained iodone (%p)", bp->b_iodone));
1710
1711		/* get a new block */
1712		newbp = geteblk(bp->b_bufsize);
1713
1714		/*
1715		 * set it to be identical to the old block.  We have to
1716		 * set b_lblkno and BKGRDMARKER before calling bgetvp()
1717		 * to avoid confusing the splay tree and gbincore().
1718		 */
1719		memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
1720		newbp->b_lblkno = bp->b_lblkno;
1721		newbp->b_xflags |= BX_BKGRDMARKER;
1722		BO_LOCK(bp->b_bufobj);
1723		bp->b_vflags |= BV_BKGRDINPROG;
1724		bgetvp(bp->b_vp, newbp);
1725		BO_UNLOCK(bp->b_bufobj);
1726		newbp->b_bufobj = &bp->b_vp->v_bufobj;
1727		newbp->b_blkno = bp->b_blkno;
1728		newbp->b_offset = bp->b_offset;
1729		newbp->b_iodone = ffs_backgroundwritedone;
1730		newbp->b_flags |= B_ASYNC;
1731		newbp->b_flags &= ~B_INVAL;
1732
1733#ifdef SOFTUPDATES
1734		/* move over the dependencies */
1735		if (LIST_FIRST(&bp->b_dep) != NULL)
1736			softdep_move_dependencies(bp, newbp);
1737#endif
1738
1739		/*
1740		 * Initiate write on the copy, release the original to
1741		 * the B_LOCKED queue so that it cannot go away until
1742		 * the background write completes. If not locked it could go
1743		 * away and then be reconstituted while it was being written.
1744		 * If the reconstituted buffer were written, we could end up
1745		 * with two background copies being written at the same time.
1746		 */
1747		bqrelse(bp);
1748		bp = newbp;
1749	}
1750
1751	/* Let the normal bufwrite do the rest for us */
1752	return (bufwrite(bp));
1753}
1754
1755
1756static void
1757ffs_geom_strategy(struct bufobj *bo, struct buf *bp)
1758{
1759	struct vnode *vp;
1760	int error;
1761
1762	vp = bo->__bo_vnode;
1763	if (bp->b_iocmd == BIO_WRITE) {
1764#ifdef SOFTUPDATES
1765		if (LIST_FIRST(&bp->b_dep) != NULL)
1766			buf_start(bp);
1767#endif
1768		if ((bp->b_flags & B_VALIDSUSPWRT) == 0 &&
1769		    bp->b_vp != NULL && bp->b_vp->v_mount != NULL &&
1770		    (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0)
1771			panic("ffs_geom_strategy: bad I/O");
1772		bp->b_flags &= ~B_VALIDSUSPWRT;
1773		if ((vp->v_vflag & VV_COPYONWRITE) &&
1774		    vp->v_rdev->si_snapdata != NULL &&
1775		    (error = (ffs_copyonwrite)(vp, bp)) != 0 &&
1776		    error != EOPNOTSUPP) {
1777			bp->b_error = error;
1778			bp->b_ioflags |= BIO_ERROR;
1779			bufdone(bp);
1780			return;
1781		}
1782	}
1783	g_vfs_strategy(bo, bp);
1784}
1785