ffs_vfsops.c revision 200796
1/*-
2 * Copyright (c) 1989, 1991, 1993, 1994
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	@(#)ffs_vfsops.c	8.31 (Berkeley) 5/20/95
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_vfsops.c 200796 2009-12-21 19:39:10Z trasz $");
34
35#include "opt_quota.h"
36#include "opt_ufs.h"
37#include "opt_ffs.h"
38#include "opt_ddb.h"
39
40#include <sys/param.h>
41#include <sys/systm.h>
42#include <sys/namei.h>
43#include <sys/priv.h>
44#include <sys/proc.h>
45#include <sys/kernel.h>
46#include <sys/vnode.h>
47#include <sys/mount.h>
48#include <sys/bio.h>
49#include <sys/buf.h>
50#include <sys/conf.h>
51#include <sys/fcntl.h>
52#include <sys/malloc.h>
53#include <sys/mutex.h>
54
55#include <security/mac/mac_framework.h>
56
57#include <ufs/ufs/extattr.h>
58#include <ufs/ufs/gjournal.h>
59#include <ufs/ufs/quota.h>
60#include <ufs/ufs/ufsmount.h>
61#include <ufs/ufs/inode.h>
62#include <ufs/ufs/ufs_extern.h>
63
64#include <ufs/ffs/fs.h>
65#include <ufs/ffs/ffs_extern.h>
66
67#include <vm/vm.h>
68#include <vm/uma.h>
69#include <vm/vm_page.h>
70
71#include <geom/geom.h>
72#include <geom/geom_vfs.h>
73
74#include <ddb/ddb.h>
75
76static uma_zone_t uma_inode, uma_ufs1, uma_ufs2;
77
78static int	ffs_reload(struct mount *, struct thread *);
79static int	ffs_mountfs(struct vnode *, struct mount *, struct thread *);
80static void	ffs_oldfscompat_read(struct fs *, struct ufsmount *,
81		    ufs2_daddr_t);
82static void	ffs_oldfscompat_write(struct fs *, struct ufsmount *);
83static void	ffs_ifree(struct ufsmount *ump, struct inode *ip);
84static vfs_init_t ffs_init;
85static vfs_uninit_t ffs_uninit;
86static vfs_extattrctl_t ffs_extattrctl;
87static vfs_cmount_t ffs_cmount;
88static vfs_unmount_t ffs_unmount;
89static vfs_mount_t ffs_mount;
90static vfs_statfs_t ffs_statfs;
91static vfs_fhtovp_t ffs_fhtovp;
92static vfs_sync_t ffs_sync;
93
/*
 * VFS operations vector for FFS.  Operations not overridden here
 * (vfs_quotactl, vfs_root) are supplied by the generic UFS layer.
 */
static struct vfsops ufs_vfsops = {
	.vfs_extattrctl =	ffs_extattrctl,
	.vfs_fhtovp =		ffs_fhtovp,
	.vfs_init =		ffs_init,
	.vfs_mount =		ffs_mount,
	.vfs_cmount =		ffs_cmount,
	.vfs_quotactl =		ufs_quotactl,
	.vfs_root =		ufs_root,
	.vfs_statfs =		ffs_statfs,
	.vfs_sync =		ffs_sync,
	.vfs_uninit =		ffs_uninit,
	.vfs_unmount =		ffs_unmount,
	.vfs_vget =		ffs_vget,
	.vfs_susp_clean =	process_deferred_inactive,
};

/* Register the filesystem with the kernel under the name "ufs". */
VFS_SET(ufs_vfsops, ufs, 0);
MODULE_VERSION(ufs, 1);
112
static b_strategy_t ffs_geom_strategy;
static b_write_t ffs_bufwrite;

/*
 * Buffer operations for the device vnode backing an FFS mount;
 * installed on devvp's bufobj in ffs_mountfs().  With snapshots
 * compiled out, the generic bdflush is used instead of FFS's own.
 */
static struct buf_ops ffs_ops = {
	.bop_name =	"FFS",
	.bop_write =	ffs_bufwrite,
	.bop_strategy =	ffs_geom_strategy,
	.bop_sync =	bufsync,
#ifdef NO_FFS_SNAPSHOT
	.bop_bdflush =	bufbdflush,
#else
	.bop_bdflush =	ffs_bdflush,
#endif
};
127
/*
 * Mount options accepted by ffs_mount(); vfs_filteropt() rejects any
 * option not on this NULL-terminated list.
 */
static const char *ffs_opts[] = { "acls", "async", "noatime", "noclusterr",
    "noclusterw", "noexec", "export", "force", "from", "multilabel",
    "snapshot", "nosuid", "suiddir", "nosymfollow", "sync",
    "union", "nfsv4acls", NULL };
132
/*
 * VFS_MOUNT() entry point for FFS.
 *
 * Handles both fresh mounts and MNT_UPDATE requests.  For updates it
 * covers the r/w -> r/o downgrade (flushing and suspending the
 * filesystem first), MNT_RELOAD, the r/o -> r/w upgrade (permission
 * and cleanliness checks, GEOM write access, softdep start), and
 * snapshot creation.  For fresh mounts it resolves the "from" device
 * path and hands off to ffs_mountfs().
 *
 * Returns 0 on success or an errno value.
 */
static int
ffs_mount(struct mount *mp)
{
	struct vnode *devvp;
	struct thread *td;
	struct ufsmount *ump = 0;
	struct fs *fs;
	int error, flags;
	u_int mntorflags, mntandnotflags;
	accmode_t accmode;
	struct nameidata ndp;
	char *fspec;

	td = curthread;
	/* Reject any option not on the ffs_opts whitelist. */
	if (vfs_filteropt(mp->mnt_optnew, ffs_opts))
		return (EINVAL);
	/* Lazily create the inode/dinode UMA zones on the first mount. */
	if (uma_inode == NULL) {
		uma_inode = uma_zcreate("FFS inode",
		    sizeof(struct inode), NULL, NULL, NULL, NULL,
		    UMA_ALIGN_PTR, 0);
		uma_ufs1 = uma_zcreate("FFS1 dinode",
		    sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL,
		    UMA_ALIGN_PTR, 0);
		uma_ufs2 = uma_zcreate("FFS2 dinode",
		    sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL,
		    UMA_ALIGN_PTR, 0);
	}

	fspec = vfs_getopts(mp->mnt_optnew, "from", &error);
	if (error)
		return (error);

	/*
	 * Translate string options into mnt_flag bits.  Note that
	 * mntandnotflags is never set in this revision, so the
	 * clear-mask applied below is always zero.
	 */
	mntorflags = 0;
	mntandnotflags = 0;
	if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0)
		mntorflags |= MNT_ACLS;

	if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0) {
		mntorflags |= MNT_SNAPSHOT;
		/*
		 * Once we have set the MNT_SNAPSHOT flag, do not
		 * persist "snapshot" in the options list.
		 */
		vfs_deleteopt(mp->mnt_optnew, "snapshot");
		vfs_deleteopt(mp->mnt_opt, "snapshot");
	}

	if (vfs_getopt(mp->mnt_optnew, "nfsv4acls", NULL, NULL) == 0) {
		if (mntorflags & MNT_ACLS) {
			printf("WARNING: \"acls\" and \"nfsv4acls\" "
			    "options are mutually exclusive\n");
			return (EINVAL);
		}
		mntorflags |= MNT_NFS4ACLS;
	}

	MNT_ILOCK(mp);
	mp->mnt_flag = (mp->mnt_flag | mntorflags) & ~mntandnotflags;
	MNT_IUNLOCK(mp);
	/*
	 * If updating, check whether changing from read-only to
	 * read/write; if there is no device name, that's all we do.
	 */
	if (mp->mnt_flag & MNT_UPDATE) {
		ump = VFSTOUFS(mp);
		fs = ump->um_fs;
		devvp = ump->um_devvp;
		if (fs->fs_ronly == 0 &&
		    vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
			/*
			 * Downgrade r/w -> r/o.
			 * Flush any dirty data and suspend filesystem.
			 */
			if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
				return (error);
			/*
			 * Loop until we win the race and observe
			 * MNTK_SUSPENDED while holding the suspension.
			 */
			for (;;) {
				vn_finished_write(mp);
				if ((error = vfs_write_suspend(mp)) != 0)
					return (error);
				MNT_ILOCK(mp);
				if (mp->mnt_kern_flag & MNTK_SUSPENDED) {
					/*
					 * Allow the secondary writes
					 * to proceed.
					 */
					mp->mnt_kern_flag &= ~(MNTK_SUSPENDED |
					    MNTK_SUSPEND2);
					wakeup(&mp->mnt_flag);
					MNT_IUNLOCK(mp);
					/*
					 * Allow the curthread to
					 * ignore the suspension to
					 * synchronize on-disk state.
					 */
					td->td_pflags |= TDP_IGNSUSP;
					break;
				}
				MNT_IUNLOCK(mp);
				vn_start_write(NULL, &mp, V_WAIT);
			}
			/*
			 * Check for and optionally get rid of files open
			 * for writing.
			 */
			flags = WRITECLOSE;
			if (mp->mnt_flag & MNT_FORCE)
				flags |= FORCECLOSE;
			if (mp->mnt_flag & MNT_SOFTDEP) {
				error = softdep_flushfiles(mp, flags, td);
			} else {
				error = ffs_flushfiles(mp, flags, td);
			}
			if (error) {
				vfs_write_resume(mp);
				return (error);
			}
			/*
			 * Pending block/inode counts should be zero after a
			 * full flush; report and clear any leftovers.
			 */
			if (fs->fs_pendingblocks != 0 ||
			    fs->fs_pendinginodes != 0) {
				printf("%s: %s: blocks %jd files %d\n",
				    fs->fs_fsmnt, "update error",
				    (intmax_t)fs->fs_pendingblocks,
				    fs->fs_pendinginodes);
				fs->fs_pendingblocks = 0;
				fs->fs_pendinginodes = 0;
			}
			if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
				fs->fs_clean = 1;
			if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
				/* Roll back the clean marking on failure. */
				fs->fs_ronly = 0;
				fs->fs_clean = 0;
				vfs_write_resume(mp);
				return (error);
			}
			/* Drop our GEOM write-access count. */
			DROP_GIANT();
			g_topology_lock();
			g_access(ump->um_cp, 0, -1, 0);
			g_topology_unlock();
			PICKUP_GIANT();
			fs->fs_ronly = 1;
			MNT_ILOCK(mp);
			mp->mnt_flag |= MNT_RDONLY;
			MNT_IUNLOCK(mp);
			/*
			 * Allow the writers to note that filesystem
			 * is ro now.
			 */
			vfs_write_resume(mp);
		}
		if ((mp->mnt_flag & MNT_RELOAD) &&
		    (error = ffs_reload(mp, td)) != 0)
			return (error);
		if (fs->fs_ronly &&
		    !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
			/*
			 * Upgrade r/o -> r/w.
			 * If upgrade to read-write by non-root, then verify
			 * that user has necessary permissions on the device.
			 */
			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
			error = VOP_ACCESS(devvp, VREAD | VWRITE,
			    td->td_ucred, td);
			if (error)
				error = priv_check(td, PRIV_VFS_MOUNT_PERM);
			if (error) {
				VOP_UNLOCK(devvp, 0);
				return (error);
			}
			VOP_UNLOCK(devvp, 0);
			fs->fs_flags &= ~FS_UNCLEAN;
			/*
			 * Refuse the upgrade if the filesystem is dirty,
			 * unless forced or soft updates can cope.
			 */
			if (fs->fs_clean == 0) {
				fs->fs_flags |= FS_UNCLEAN;
				if ((mp->mnt_flag & MNT_FORCE) ||
				    ((fs->fs_flags & FS_NEEDSFSCK) == 0 &&
				     (fs->fs_flags & FS_DOSOFTDEP))) {
					printf("WARNING: %s was not %s\n",
					   fs->fs_fsmnt, "properly dismounted");
				} else {
					printf(
"WARNING: R/W mount of %s denied.  Filesystem is not clean - run fsck\n",
					    fs->fs_fsmnt);
					return (EPERM);
				}
			}
			DROP_GIANT();
			g_topology_lock();
			/*
			 * If we're the root device, we may not have an E count
			 * yet, get it now.
			 */
			if (ump->um_cp->ace == 0)
				error = g_access(ump->um_cp, 0, 1, 1);
			else
				error = g_access(ump->um_cp, 0, 1, 0);
			g_topology_unlock();
			PICKUP_GIANT();
			if (error)
				return (error);
			if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
				return (error);
			fs->fs_ronly = 0;
			MNT_ILOCK(mp);
			mp->mnt_flag &= ~MNT_RDONLY;
			MNT_IUNLOCK(mp);
			fs->fs_clean = 0;
			if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
				vn_finished_write(mp);
				return (error);
			}
			/* check to see if we need to start softdep */
			if ((fs->fs_flags & FS_DOSOFTDEP) &&
			    (error = softdep_mount(devvp, mp, fs, td->td_ucred))){
				vn_finished_write(mp);
				return (error);
			}
			/* Re-attach any snapshots recorded in the superblock. */
			if (fs->fs_snapinum[0] != 0)
				ffs_snapshot_mount(mp);
			vn_finished_write(mp);
		}
		/*
		 * Soft updates is incompatible with "async",
		 * so if we are doing softupdates stop the user
		 * from setting the async flag in an update.
		 * Softdep_mount() clears it in an initial mount
		 * or ro->rw remount.
		 */
		if (mp->mnt_flag & MNT_SOFTDEP) {
			/* XXX: Reset too late ? */
			MNT_ILOCK(mp);
			mp->mnt_flag &= ~MNT_ASYNC;
			MNT_IUNLOCK(mp);
		}
		/*
		 * Keep MNT_ACLS flag if it is stored in superblock.
		 */
		if ((fs->fs_flags & FS_ACLS) != 0) {
			/* XXX: Set too late ? */
			MNT_ILOCK(mp);
			mp->mnt_flag |= MNT_ACLS;
			MNT_IUNLOCK(mp);
		}

		if ((fs->fs_flags & FS_NFS4ACLS) != 0) {
			/* XXX: Set too late ? */
			MNT_ILOCK(mp);
			mp->mnt_flag |= MNT_NFS4ACLS;
			MNT_IUNLOCK(mp);
		}

		/*
		 * If this is a snapshot request, take the snapshot.
		 */
		if (mp->mnt_flag & MNT_SNAPSHOT)
			return (ffs_snapshot(mp, fspec));
	}

	/*
	 * Not an update, or updating the name: look up the name
	 * and verify that it refers to a sensible disk device.
	 */
	NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td);
	if ((error = namei(&ndp)) != 0)
		return (error);
	NDFREE(&ndp, NDF_ONLY_PNBUF);
	devvp = ndp.ni_vp;
	if (!vn_isdisk(devvp, &error)) {
		vput(devvp);
		return (error);
	}

	/*
	 * If mount by non-root, then verify that user has necessary
	 * permissions on the device.
	 */
	accmode = VREAD;
	if ((mp->mnt_flag & MNT_RDONLY) == 0)
		accmode |= VWRITE;
	error = VOP_ACCESS(devvp, accmode, td->td_ucred, td);
	if (error)
		error = priv_check(td, PRIV_VFS_MOUNT_PERM);
	if (error) {
		vput(devvp);
		return (error);
	}

	if (mp->mnt_flag & MNT_UPDATE) {
		/*
		 * Update only
		 *
		 * If it's not the same vnode, or at least the same device
		 * then it's not correct.
		 */

		if (devvp->v_rdev != ump->um_devvp->v_rdev)
			error = EINVAL;	/* needs translation */
		vput(devvp);
		if (error)
			return (error);
	} else {
		/*
		 * New mount
		 *
		 * We need the name for the mount point (also used for
		 * "last mounted on") copied in. If an error occurs,
		 * the mount point is discarded by the upper level code.
		 * Note that vfs_mount() populates f_mntonname for us.
		 */
		if ((error = ffs_mountfs(devvp, mp, td)) != 0) {
			vrele(devvp);
			return (error);
		}
	}
	/* Record the "mounted from" device name in mnt_stat. */
	vfs_mountedfrom(mp, fspec);
	return (0);
}
445
446/*
447 * Compatibility with old mount system call.
448 */
449
450static int
451ffs_cmount(struct mntarg *ma, void *data, int flags)
452{
453	struct ufs_args args;
454	int error;
455
456	if (data == NULL)
457		return (EINVAL);
458	error = copyin(data, &args, sizeof args);
459	if (error)
460		return (error);
461
462	ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN);
463	ma = mount_arg(ma, "export", &args.export, sizeof args.export);
464	error = kernel_mount(ma, flags);
465
466	return (error);
467}
468
469/*
470 * Reload all incore data for a filesystem (used after running fsck on
471 * the root filesystem and finding things to fix). The filesystem must
472 * be mounted read-only.
473 *
474 * Things to do to update the mount:
475 *	1) invalidate all cached meta-data.
476 *	2) re-read superblock from disk.
477 *	3) re-read summary information from disk.
478 *	4) invalidate all inactive vnodes.
479 *	5) invalidate all cached file data.
480 *	6) re-read inode data for all active vnodes.
481 */
static int
ffs_reload(struct mount *mp, struct thread *td)
{
	struct vnode *vp, *mvp, *devvp;
	struct inode *ip;
	void *space;
	struct buf *bp;
	struct fs *fs, *newfs;
	struct ufsmount *ump;
	ufs2_daddr_t sblockloc;
	int i, blks, size, error;
	int32_t *lp;

	/* Reload is only meaningful (and safe) on a read-only mount. */
	if ((mp->mnt_flag & MNT_RDONLY) == 0)
		return (EINVAL);
	ump = VFSTOUFS(mp);
	/*
	 * Step 1: invalidate all cached meta-data.
	 */
	devvp = VFSTOUFS(mp)->um_devvp;
	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
	if (vinvalbuf(devvp, 0, 0, 0) != 0)
		panic("ffs_reload: dirty1");
	VOP_UNLOCK(devvp, 0);

	/*
	 * Step 2: re-read superblock from disk.
	 */
	fs = VFSTOUFS(mp)->um_fs;
	if ((error = bread(devvp, btodb(fs->fs_sblockloc), fs->fs_sbsize,
	    NOCRED, &bp)) != 0)
		return (error);
	newfs = (struct fs *)bp->b_data;
	/* Basic sanity checks before accepting the new superblock. */
	if ((newfs->fs_magic != FS_UFS1_MAGIC &&
	     newfs->fs_magic != FS_UFS2_MAGIC) ||
	    newfs->fs_bsize > MAXBSIZE ||
	    newfs->fs_bsize < sizeof(struct fs)) {
			brelse(bp);
			return (EIO);		/* XXX needs translation */
	}
	/*
	 * Copy pointer fields back into superblock before copying in	XXX
	 * new superblock. These should really be in the ufsmount.	XXX
	 * Note that important parameters (eg fs_ncg) are unchanged.
	 */
	newfs->fs_csp = fs->fs_csp;
	newfs->fs_maxcluster = fs->fs_maxcluster;
	newfs->fs_contigdirs = fs->fs_contigdirs;
	newfs->fs_active = fs->fs_active;
	/* The file system is still read-only. */
	newfs->fs_ronly = 1;
	sblockloc = fs->fs_sblockloc;
	bcopy(newfs, fs, (u_int)fs->fs_sbsize);
	brelse(bp);
	mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen;
	ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc);
	UFS_LOCK(ump);
	/* fsck should have resolved all pending block/inode counts. */
	if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
		printf("%s: reload pending error: blocks %jd files %d\n",
		    fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
		    fs->fs_pendinginodes);
		fs->fs_pendingblocks = 0;
		fs->fs_pendinginodes = 0;
	}
	UFS_UNLOCK(ump);

	/*
	 * Step 3: re-read summary information from disk.
	 */
	blks = howmany(fs->fs_cssize, fs->fs_fsize);
	space = fs->fs_csp;
	for (i = 0; i < blks; i += fs->fs_frag) {
		size = fs->fs_bsize;
		if (i + fs->fs_frag > blks)
			size = (blks - i) * fs->fs_fsize;
		error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size,
		    NOCRED, &bp);
		if (error)
			return (error);
		bcopy(bp->b_data, space, (u_int)size);
		space = (char *)space + size;
		brelse(bp);
	}
	/*
	 * We no longer know anything about clusters per cylinder group.
	 */
	if (fs->fs_contigsumsize > 0) {
		lp = fs->fs_maxcluster;
		for (i = 0; i < fs->fs_ncg; i++)
			*lp++ = fs->fs_contigsumsize;
	}

loop:
	MNT_ILOCK(mp);
	MNT_VNODE_FOREACH(vp, mp, mvp) {
		VI_LOCK(vp);
		if (vp->v_iflag & VI_DOOMED) {
			VI_UNLOCK(vp);
			continue;
		}
		MNT_IUNLOCK(mp);
		/*
		 * Step 4: invalidate all cached file data.
		 */
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
			/* vget failure: restart the whole vnode scan. */
			MNT_VNODE_FOREACH_ABORT(mp, mvp);
			goto loop;
		}
		if (vinvalbuf(vp, 0, 0, 0))
			panic("ffs_reload: dirty2");
		/*
		 * Step 5: re-read inode data for all active vnodes.
		 */
		ip = VTOI(vp);
		error =
		    bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
		    (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			VOP_UNLOCK(vp, 0);
			vrele(vp);
			MNT_VNODE_FOREACH_ABORT(mp, mvp);
			return (error);
		}
		ffs_load_inode(bp, ip, fs, ip->i_number);
		ip->i_effnlink = ip->i_nlink;
		brelse(bp);
		VOP_UNLOCK(vp, 0);
		vrele(vp);
		MNT_ILOCK(mp);
	}
	MNT_IUNLOCK(mp);
	return (0);
}
615
616/*
617 * Possible superblock locations ordered from most to least likely.
618 */
619static int sblock_try[] = SBLOCKSEARCH;
620
621/*
622 * Common code for mount and mountroot
623 */
/*
 * Common code for mount and mountroot: open the device through GEOM,
 * locate and validate the superblock, build the ufsmount, read the
 * cylinder-group summary information, and initialize the mount.
 * On failure, all partially-acquired resources are released at "out".
 */
static int
ffs_mountfs(devvp, mp, td)
	struct vnode *devvp;
	struct mount *mp;
	struct thread *td;
{
	struct ufsmount *ump;
	struct buf *bp;
	struct fs *fs;
	struct cdev *dev;
	void *space;
	ufs2_daddr_t sblockloc;
	int error, i, blks, size, ronly;
	int32_t *lp;
	struct ucred *cred;
	struct g_consumer *cp;
	struct mount *nmp;

	bp = NULL;
	ump = NULL;
	cred = td ? td->td_ucred : NOCRED;
	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;

	dev = devvp->v_rdev;
	dev_ref(dev);
	DROP_GIANT();
	g_topology_lock();
	/* Acquire GEOM access: r=1 always, w=1 only for r/w mounts. */
	error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1);

	/*
	 * If we are a root mount, drop the E flag so fsck can do its magic.
	 * We will pick it up again when we remount R/W.
	 */
	if (error == 0 && ronly && (mp->mnt_flag & MNT_ROOTFS))
		error = g_access(cp, 0, 0, -1);
	g_topology_unlock();
	PICKUP_GIANT();
	VOP_UNLOCK(devvp, 0);
	if (error)
		goto out;
	if (devvp->v_rdev->si_iosize_max != 0)
		mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
	if (mp->mnt_iosize_max > MAXPHYS)
		mp->mnt_iosize_max = MAXPHYS;

	/* Route device buffer I/O through the FFS buf_ops vector. */
	devvp->v_bufobj.bo_private = cp;
	devvp->v_bufobj.bo_ops = &ffs_ops;

	fs = NULL;
	sblockloc = 0;
	/*
	 * Try reading the superblock in each of its possible locations.
	 */
	for (i = 0; sblock_try[i] != -1; i++) {
		if ((SBLOCKSIZE % cp->provider->sectorsize) != 0) {
			error = EINVAL;
			vfs_mount_error(mp,
			    "Invalid sectorsize %d for superblock size %d",
			    cp->provider->sectorsize, SBLOCKSIZE);
			goto out;
		}
		if ((error = bread(devvp, btodb(sblock_try[i]), SBLOCKSIZE,
		    cred, &bp)) != 0)
			goto out;
		fs = (struct fs *)bp->b_data;
		sblockloc = sblock_try[i];
		/*
		 * Accept a UFS1 superblock anywhere, but a UFS2 one only
		 * at its recorded location (or if fs_flags were never
		 * updated), plus basic block-size sanity checks.
		 */
		if ((fs->fs_magic == FS_UFS1_MAGIC ||
		     (fs->fs_magic == FS_UFS2_MAGIC &&
		      (fs->fs_sblockloc == sblockloc ||
		       (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0))) &&
		    fs->fs_bsize <= MAXBSIZE &&
		    fs->fs_bsize >= sizeof(struct fs))
			break;
		brelse(bp);
		bp = NULL;
	}
	if (sblock_try[i] == -1) {
		error = EINVAL;		/* XXX needs translation */
		goto out;
	}
	fs->fs_fmod = 0;
	fs->fs_flags &= ~FS_INDEXDIRS;	/* no support for directory indicies */
	fs->fs_flags &= ~FS_UNCLEAN;
	/*
	 * A dirty filesystem is only mountable read-only, forced, or
	 * when soft updates can handle the inconsistencies.
	 */
	if (fs->fs_clean == 0) {
		fs->fs_flags |= FS_UNCLEAN;
		if (ronly || (mp->mnt_flag & MNT_FORCE) ||
		    ((fs->fs_flags & FS_NEEDSFSCK) == 0 &&
		     (fs->fs_flags & FS_DOSOFTDEP))) {
			printf(
"WARNING: %s was not properly dismounted\n",
			    fs->fs_fsmnt);
		} else {
			printf(
"WARNING: R/W mount of %s denied.  Filesystem is not clean - run fsck\n",
			    fs->fs_fsmnt);
			error = EPERM;
			goto out;
		}
		if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) &&
		    (mp->mnt_flag & MNT_FORCE)) {
			printf("%s: lost blocks %jd files %d\n", fs->fs_fsmnt,
			    (intmax_t)fs->fs_pendingblocks,
			    fs->fs_pendinginodes);
			fs->fs_pendingblocks = 0;
			fs->fs_pendinginodes = 0;
		}
	}
	if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
		printf("%s: mount pending error: blocks %jd files %d\n",
		    fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
		    fs->fs_pendinginodes);
		fs->fs_pendingblocks = 0;
		fs->fs_pendinginodes = 0;
	}
	if ((fs->fs_flags & FS_GJOURNAL) != 0) {
#ifdef UFS_GJOURNAL
		/*
		 * Get journal provider name.
		 */
		size = 1024;
		mp->mnt_gjprovider = malloc(size, M_UFSMNT, M_WAITOK);
		if (g_io_getattr("GJOURNAL::provider", cp, &size,
		    mp->mnt_gjprovider) == 0) {
			/* Shrink the buffer to the actual name length. */
			mp->mnt_gjprovider = realloc(mp->mnt_gjprovider, size,
			    M_UFSMNT, M_WAITOK);
			MNT_ILOCK(mp);
			mp->mnt_flag |= MNT_GJOURNAL;
			MNT_IUNLOCK(mp);
		} else {
			printf(
"WARNING: %s: GJOURNAL flag on fs but no gjournal provider below\n",
			    mp->mnt_stat.f_mntonname);
			free(mp->mnt_gjprovider, M_UFSMNT);
			mp->mnt_gjprovider = NULL;
		}
#else
		printf(
"WARNING: %s: GJOURNAL flag on fs but no UFS_GJOURNAL support\n",
		    mp->mnt_stat.f_mntonname);
#endif
	} else {
		mp->mnt_gjprovider = NULL;
	}
	/* Build the ufsmount and select UFS1/UFS2 method pointers. */
	ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO);
	ump->um_cp = cp;
	ump->um_bo = &devvp->v_bufobj;
	ump->um_fs = malloc((u_long)fs->fs_sbsize, M_UFSMNT, M_WAITOK);
	if (fs->fs_magic == FS_UFS1_MAGIC) {
		ump->um_fstype = UFS1;
		ump->um_balloc = ffs_balloc_ufs1;
	} else {
		ump->um_fstype = UFS2;
		ump->um_balloc = ffs_balloc_ufs2;
	}
	ump->um_blkatoff = ffs_blkatoff;
	ump->um_truncate = ffs_truncate;
	ump->um_update = ffs_update;
	ump->um_valloc = ffs_valloc;
	ump->um_vfree = ffs_vfree;
	ump->um_ifree = ffs_ifree;
	ump->um_rdonly = ffs_rdonly;
	mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF);
	/* Keep a private in-core copy of the superblock. */
	bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize);
	if (fs->fs_sbsize < SBLOCKSIZE)
		bp->b_flags |= B_INVAL | B_NOCACHE;
	brelse(bp);
	bp = NULL;
	fs = ump->um_fs;
	ffs_oldfscompat_read(fs, ump, sblockloc);
	fs->fs_ronly = ronly;
	/*
	 * Allocate one contiguous buffer for the cs summary array,
	 * the per-cg max-cluster table, and the contigdirs table.
	 */
	size = fs->fs_cssize;
	blks = howmany(size, fs->fs_fsize);
	if (fs->fs_contigsumsize > 0)
		size += fs->fs_ncg * sizeof(int32_t);
	size += fs->fs_ncg * sizeof(u_int8_t);
	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
	fs->fs_csp = space;
	for (i = 0; i < blks; i += fs->fs_frag) {
		size = fs->fs_bsize;
		if (i + fs->fs_frag > blks)
			size = (blks - i) * fs->fs_fsize;
		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size,
		    cred, &bp)) != 0) {
			free(fs->fs_csp, M_UFSMNT);
			goto out;
		}
		bcopy(bp->b_data, space, (u_int)size);
		space = (char *)space + size;
		brelse(bp);
		bp = NULL;
	}
	if (fs->fs_contigsumsize > 0) {
		fs->fs_maxcluster = lp = space;
		for (i = 0; i < fs->fs_ncg; i++)
			*lp++ = fs->fs_contigsumsize;
		space = lp;
	}
	size = fs->fs_ncg * sizeof(u_int8_t);
	fs->fs_contigdirs = (u_int8_t *)space;
	bzero(fs->fs_contigdirs, size);
	fs->fs_active = NULL;
	mp->mnt_data = ump;
	mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0];
	mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1];
	nmp = NULL;
	/* Generate a new fsid if unset or already used by another mount. */
	if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 ||
	    (nmp = vfs_getvfs(&mp->mnt_stat.f_fsid))) {
		if (nmp)
			vfs_rel(nmp);
		vfs_getnewfsid(mp);
	}
	mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen;
	MNT_ILOCK(mp);
	mp->mnt_flag |= MNT_LOCAL;
	MNT_IUNLOCK(mp);
	if ((fs->fs_flags & FS_MULTILABEL) != 0) {
#ifdef MAC
		MNT_ILOCK(mp);
		mp->mnt_flag |= MNT_MULTILABEL;
		MNT_IUNLOCK(mp);
#else
		printf(
"WARNING: %s: multilabel flag on fs but no MAC support\n",
		    mp->mnt_stat.f_mntonname);
#endif
	}
	/* POSIX.1e and NFSv4 ACLs are mutually exclusive; on-disk flag wins. */
	if ((fs->fs_flags & FS_ACLS) != 0) {
#ifdef UFS_ACL
		MNT_ILOCK(mp);

		if (mp->mnt_flag & MNT_NFS4ACLS)
			printf("WARNING: ACLs flag on fs conflicts with "
			    "\"nfsv4acls\" mount option; option ignored\n");
		mp->mnt_flag &= ~MNT_NFS4ACLS;
		mp->mnt_flag |= MNT_ACLS;

		MNT_IUNLOCK(mp);
#else
		printf(
"WARNING: %s: ACLs flag on fs but no ACLs support\n",
		    mp->mnt_stat.f_mntonname);
#endif
	}
	if ((fs->fs_flags & FS_NFS4ACLS) != 0) {
#ifdef UFS_ACL
		MNT_ILOCK(mp);

		if (mp->mnt_flag & MNT_ACLS)
			printf("WARNING: NFSv4 ACLs flag on fs conflicts with "
			    "\"acls\" mount option; option ignored\n");
		mp->mnt_flag &= ~MNT_ACLS;
		mp->mnt_flag |= MNT_NFS4ACLS;

		MNT_IUNLOCK(mp);
#else
		printf(
"WARNING: %s: NFSv4 ACLs flag on fs but no ACLs support\n",
		    mp->mnt_stat.f_mntonname);
#endif
	}

	ump->um_mountp = mp;
	ump->um_dev = dev;
	ump->um_devvp = devvp;
	ump->um_nindir = fs->fs_nindir;
	ump->um_bptrtodb = fs->fs_fsbtodb;
	ump->um_seqinc = fs->fs_frag;
	for (i = 0; i < MAXQUOTAS; i++)
		ump->um_quotas[i] = NULLVP;
#ifdef UFS_EXTATTR
	ufs_extattr_uepm_init(&ump->um_extattr);
#endif
	/*
	 * Set FS local "last mounted on" information (NULL pad)
	 */
	bzero(fs->fs_fsmnt, MAXMNTLEN);
	strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN);

	if( mp->mnt_flag & MNT_ROOTFS) {
		/*
		 * Root mount; update timestamp in mount structure.
		 * this will be used by the common root mount code
		 * to update the system clock.
		 */
		mp->mnt_time = fs->fs_time;
	}

	if (ronly == 0) {
		/* R/W mount: start softdep, mount snapshots, mark dirty. */
		if ((fs->fs_flags & FS_DOSOFTDEP) &&
		    (error = softdep_mount(devvp, mp, fs, cred)) != 0) {
			free(fs->fs_csp, M_UFSMNT);
			goto out;
		}
		if (fs->fs_snapinum[0] != 0)
			ffs_snapshot_mount(mp);
		fs->fs_fmod = 1;
		fs->fs_clean = 0;
		(void) ffs_sbupdate(ump, MNT_WAIT, 0);
	}
	/*
	 * Initialize filesystem stat information in mount struct.
	 */
	MNT_ILOCK(mp);
	mp->mnt_kern_flag |= MNTK_MPSAFE | MNTK_LOOKUP_SHARED |
	    MNTK_EXTENDED_SHARED;
	MNT_IUNLOCK(mp);
#ifdef UFS_EXTATTR
#ifdef UFS_EXTATTR_AUTOSTART
	/*
	 *
	 * Auto-starting does the following:
	 *	- check for /.attribute in the fs, and extattr_start if so
	 *	- for each file in .attribute, enable that file with
	 * 	  an attribute of the same name.
	 * Not clear how to report errors -- probably eat them.
	 * This would all happen while the filesystem was busy/not
	 * available, so would effectively be "atomic".
	 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	(void) ufs_extattr_autostart(mp, td);
#endif /* !UFS_EXTATTR_AUTOSTART */
#endif /* !UFS_EXTATTR */
	return (0);
out:
	/* Error unwind: release buffer, GEOM consumer, ufsmount, dev ref. */
	if (bp)
		brelse(bp);
	if (cp != NULL) {
		DROP_GIANT();
		g_topology_lock();
		g_vfs_close(cp);
		g_topology_unlock();
		PICKUP_GIANT();
	}
	if (ump) {
		mtx_destroy(UFS_MTX(ump));
		if (mp->mnt_gjprovider != NULL) {
			free(mp->mnt_gjprovider, M_UFSMNT);
			mp->mnt_gjprovider = NULL;
		}
		free(ump->um_fs, M_UFSMNT);
		free(ump, M_UFSMNT);
		mp->mnt_data = NULL;
	}
	dev_rel(dev);
	return (error);
}
970
#include <sys/sysctl.h>
/*
 * Debug knob (debug.bigcgs): when non-zero, ffs_oldfscompat_read()
 * inflates fs_cgsize to a full block; ffs_oldfscompat_write() restores it.
 */
static int bigcgs = 0;
SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0, "");
974
975/*
976 * Sanity checks for loading old filesystem superblocks.
977 * See ffs_oldfscompat_write below for unwound actions.
978 *
979 * XXX - Parts get retired eventually.
980 * Unfortunately new bits get added.
981 */
982static void
983ffs_oldfscompat_read(fs, ump, sblockloc)
984	struct fs *fs;
985	struct ufsmount *ump;
986	ufs2_daddr_t sblockloc;
987{
988	off_t maxfilesize;
989
990	/*
991	 * If not yet done, update fs_flags location and value of fs_sblockloc.
992	 */
993	if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) {
994		fs->fs_flags = fs->fs_old_flags;
995		fs->fs_old_flags |= FS_FLAGS_UPDATED;
996		fs->fs_sblockloc = sblockloc;
997	}
998	/*
999	 * If not yet done, update UFS1 superblock with new wider fields.
1000	 */
1001	if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) {
1002		fs->fs_maxbsize = fs->fs_bsize;
1003		fs->fs_time = fs->fs_old_time;
1004		fs->fs_size = fs->fs_old_size;
1005		fs->fs_dsize = fs->fs_old_dsize;
1006		fs->fs_csaddr = fs->fs_old_csaddr;
1007		fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir;
1008		fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree;
1009		fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree;
1010		fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree;
1011	}
1012	if (fs->fs_magic == FS_UFS1_MAGIC &&
1013	    fs->fs_old_inodefmt < FS_44INODEFMT) {
1014		fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1;
1015		fs->fs_qbmask = ~fs->fs_bmask;
1016		fs->fs_qfmask = ~fs->fs_fmask;
1017	}
1018	if (fs->fs_magic == FS_UFS1_MAGIC) {
1019		ump->um_savedmaxfilesize = fs->fs_maxfilesize;
1020		maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1;
1021		if (fs->fs_maxfilesize > maxfilesize)
1022			fs->fs_maxfilesize = maxfilesize;
1023	}
1024	/* Compatibility for old filesystems */
1025	if (fs->fs_avgfilesize <= 0)
1026		fs->fs_avgfilesize = AVFILESIZ;
1027	if (fs->fs_avgfpdir <= 0)
1028		fs->fs_avgfpdir = AFPDIR;
1029	if (bigcgs) {
1030		fs->fs_save_cgsize = fs->fs_cgsize;
1031		fs->fs_cgsize = fs->fs_bsize;
1032	}
1033}
1034
1035/*
1036 * Unwinding superblock updates for old filesystems.
1037 * See ffs_oldfscompat_read above for details.
1038 *
1039 * XXX - Parts get retired eventually.
1040 * Unfortunately new bits get added.
1041 */
1042static void
1043ffs_oldfscompat_write(fs, ump)
1044	struct fs *fs;
1045	struct ufsmount *ump;
1046{
1047
1048	/*
1049	 * Copy back UFS2 updated fields that UFS1 inspects.
1050	 */
1051	if (fs->fs_magic == FS_UFS1_MAGIC) {
1052		fs->fs_old_time = fs->fs_time;
1053		fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir;
1054		fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree;
1055		fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree;
1056		fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree;
1057		fs->fs_maxfilesize = ump->um_savedmaxfilesize;
1058	}
1059	if (bigcgs) {
1060		fs->fs_cgsize = fs->fs_save_cgsize;
1061		fs->fs_save_cgsize = 0;
1062	}
1063}
1064
1065/*
1066 * unmount system call
1067 */
1068static int
1069ffs_unmount(mp, mntflags)
1070	struct mount *mp;
1071	int mntflags;
1072{
1073	struct thread *td;
1074	struct ufsmount *ump = VFSTOUFS(mp);
1075	struct fs *fs;
1076	int error, flags, susp;
1077#ifdef UFS_EXTATTR
1078	int e_restart;
1079#endif
1080
1081	flags = 0;
1082	td = curthread;
1083	fs = ump->um_fs;
1084	if (mntflags & MNT_FORCE) {
1085		flags |= FORCECLOSE;
1086		susp = fs->fs_ronly != 0;
1087	} else
1088		susp = 0;
1089#ifdef UFS_EXTATTR
1090	if ((error = ufs_extattr_stop(mp, td))) {
1091		if (error != EOPNOTSUPP)
1092			printf("ffs_unmount: ufs_extattr_stop returned %d\n",
1093			    error);
1094		e_restart = 0;
1095	} else {
1096		ufs_extattr_uepm_destroy(&ump->um_extattr);
1097		e_restart = 1;
1098	}
1099#endif
1100	if (susp) {
1101		/*
1102		 * dounmount already called vn_start_write().
1103		 */
1104		for (;;) {
1105			vn_finished_write(mp);
1106			if ((error = vfs_write_suspend(mp)) != 0)
1107				return (error);
1108			MNT_ILOCK(mp);
1109			if (mp->mnt_kern_flag & MNTK_SUSPENDED) {
1110				mp->mnt_kern_flag &= ~(MNTK_SUSPENDED |
1111				    MNTK_SUSPEND2);
1112				wakeup(&mp->mnt_flag);
1113				MNT_IUNLOCK(mp);
1114				td->td_pflags |= TDP_IGNSUSP;
1115				break;
1116			}
1117			MNT_IUNLOCK(mp);
1118			vn_start_write(NULL, &mp, V_WAIT);
1119		}
1120	}
1121	if (mp->mnt_flag & MNT_SOFTDEP)
1122		error = softdep_flushfiles(mp, flags, td);
1123	else
1124		error = ffs_flushfiles(mp, flags, td);
1125	if (error != 0 && error != ENXIO)
1126		goto fail;
1127
1128	UFS_LOCK(ump);
1129	if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
1130		printf("%s: unmount pending error: blocks %jd files %d\n",
1131		    fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
1132		    fs->fs_pendinginodes);
1133		fs->fs_pendingblocks = 0;
1134		fs->fs_pendinginodes = 0;
1135	}
1136	UFS_UNLOCK(ump);
1137	if (fs->fs_ronly == 0) {
1138		fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1;
1139		error = ffs_sbupdate(ump, MNT_WAIT, 0);
1140		if (error && error != ENXIO) {
1141			fs->fs_clean = 0;
1142			goto fail;
1143		}
1144	}
1145	if (susp) {
1146		vfs_write_resume(mp);
1147		vn_start_write(NULL, &mp, V_WAIT);
1148	}
1149	DROP_GIANT();
1150	g_topology_lock();
1151	g_vfs_close(ump->um_cp);
1152	g_topology_unlock();
1153	PICKUP_GIANT();
1154	vrele(ump->um_devvp);
1155	dev_rel(ump->um_dev);
1156	mtx_destroy(UFS_MTX(ump));
1157	if (mp->mnt_gjprovider != NULL) {
1158		free(mp->mnt_gjprovider, M_UFSMNT);
1159		mp->mnt_gjprovider = NULL;
1160	}
1161	free(fs->fs_csp, M_UFSMNT);
1162	free(fs, M_UFSMNT);
1163	free(ump, M_UFSMNT);
1164	mp->mnt_data = NULL;
1165	MNT_ILOCK(mp);
1166	mp->mnt_flag &= ~MNT_LOCAL;
1167	MNT_IUNLOCK(mp);
1168	return (error);
1169
1170fail:
1171	if (susp) {
1172		vfs_write_resume(mp);
1173		vn_start_write(NULL, &mp, V_WAIT);
1174	}
1175#ifdef UFS_EXTATTR
1176	if (e_restart) {
1177		ufs_extattr_uepm_init(&ump->um_extattr);
1178#ifdef UFS_EXTATTR_AUTOSTART
1179		(void) ufs_extattr_autostart(mp, td);
1180#endif
1181	}
1182#endif
1183
1184	return (error);
1185}
1186
1187/*
1188 * Flush out all the files in a filesystem.
1189 */
1190int
1191ffs_flushfiles(mp, flags, td)
1192	struct mount *mp;
1193	int flags;
1194	struct thread *td;
1195{
1196	struct ufsmount *ump;
1197	int error;
1198
1199	ump = VFSTOUFS(mp);
1200#ifdef QUOTA
1201	if (mp->mnt_flag & MNT_QUOTA) {
1202		int i;
1203		error = vflush(mp, 0, SKIPSYSTEM|flags, td);
1204		if (error)
1205			return (error);
1206		for (i = 0; i < MAXQUOTAS; i++) {
1207			quotaoff(td, mp, i);
1208		}
1209		/*
1210		 * Here we fall through to vflush again to ensure
1211		 * that we have gotten rid of all the system vnodes.
1212		 */
1213	}
1214#endif
1215	ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_flushfiles");
1216	if (ump->um_devvp->v_vflag & VV_COPYONWRITE) {
1217		if ((error = vflush(mp, 0, SKIPSYSTEM | flags, td)) != 0)
1218			return (error);
1219		ffs_snapshot_unmount(mp);
1220		flags |= FORCECLOSE;
1221		/*
1222		 * Here we fall through to vflush again to ensure
1223		 * that we have gotten rid of all the system vnodes.
1224		 */
1225	}
1226        /*
1227	 * Flush all the files.
1228	 */
1229	if ((error = vflush(mp, 0, flags, td)) != 0)
1230		return (error);
1231	/*
1232	 * Flush filesystem metadata.
1233	 */
1234	vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
1235	error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td);
1236	VOP_UNLOCK(ump->um_devvp, 0);
1237	return (error);
1238}
1239
1240/*
1241 * Get filesystem statistics.
1242 */
1243static int
1244ffs_statfs(mp, sbp)
1245	struct mount *mp;
1246	struct statfs *sbp;
1247{
1248	struct ufsmount *ump;
1249	struct fs *fs;
1250
1251	ump = VFSTOUFS(mp);
1252	fs = ump->um_fs;
1253	if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC)
1254		panic("ffs_statfs");
1255	sbp->f_version = STATFS_VERSION;
1256	sbp->f_bsize = fs->fs_fsize;
1257	sbp->f_iosize = fs->fs_bsize;
1258	sbp->f_blocks = fs->fs_dsize;
1259	UFS_LOCK(ump);
1260	sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag +
1261	    fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks);
1262	sbp->f_bavail = freespace(fs, fs->fs_minfree) +
1263	    dbtofsb(fs, fs->fs_pendingblocks);
1264	sbp->f_files =  fs->fs_ncg * fs->fs_ipg - ROOTINO;
1265	sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes;
1266	UFS_UNLOCK(ump);
1267	sbp->f_namemax = NAME_MAX;
1268	return (0);
1269}
1270
1271/*
1272 * Go through the disk queues to initiate sandbagged IO;
1273 * go through the inodes to write those that have been modified;
1274 * initiate the writing of the super block if it has been modified.
1275 *
1276 * Note: we are always called with the filesystem marked `MPBUSY'.
1277 */
1278static int
1279ffs_sync(mp, waitfor)
1280	struct mount *mp;
1281	int waitfor;
1282{
1283	struct vnode *mvp, *vp, *devvp;
1284	struct thread *td;
1285	struct inode *ip;
1286	struct ufsmount *ump = VFSTOUFS(mp);
1287	struct fs *fs;
1288	int error, count, wait, lockreq, allerror = 0;
1289	int suspend;
1290	int suspended;
1291	int secondary_writes;
1292	int secondary_accwrites;
1293	int softdep_deps;
1294	int softdep_accdeps;
1295	struct bufobj *bo;
1296
1297	td = curthread;
1298	fs = ump->um_fs;
1299	if (fs->fs_fmod != 0 && fs->fs_ronly != 0) {		/* XXX */
1300		printf("fs = %s\n", fs->fs_fsmnt);
1301		panic("ffs_sync: rofs mod");
1302	}
1303	/*
1304	 * Write back each (modified) inode.
1305	 */
1306	wait = 0;
1307	suspend = 0;
1308	suspended = 0;
1309	lockreq = LK_EXCLUSIVE | LK_NOWAIT;
1310	if (waitfor == MNT_SUSPEND) {
1311		suspend = 1;
1312		waitfor = MNT_WAIT;
1313	}
1314	if (waitfor == MNT_WAIT) {
1315		wait = 1;
1316		lockreq = LK_EXCLUSIVE;
1317	}
1318	lockreq |= LK_INTERLOCK | LK_SLEEPFAIL;
1319	MNT_ILOCK(mp);
1320loop:
1321	/* Grab snapshot of secondary write counts */
1322	secondary_writes = mp->mnt_secondary_writes;
1323	secondary_accwrites = mp->mnt_secondary_accwrites;
1324
1325	/* Grab snapshot of softdep dependency counts */
1326	MNT_IUNLOCK(mp);
1327	softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps);
1328	MNT_ILOCK(mp);
1329
1330	MNT_VNODE_FOREACH(vp, mp, mvp) {
1331		/*
1332		 * Depend on the mntvnode_slock to keep things stable enough
1333		 * for a quick test.  Since there might be hundreds of
1334		 * thousands of vnodes, we cannot afford even a subroutine
1335		 * call unless there's a good chance that we have work to do.
1336		 */
1337		VI_LOCK(vp);
1338		if (vp->v_iflag & VI_DOOMED) {
1339			VI_UNLOCK(vp);
1340			continue;
1341		}
1342		ip = VTOI(vp);
1343		if (vp->v_type == VNON || ((ip->i_flag &
1344		    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
1345		    vp->v_bufobj.bo_dirty.bv_cnt == 0)) {
1346			VI_UNLOCK(vp);
1347			continue;
1348		}
1349		MNT_IUNLOCK(mp);
1350		if ((error = vget(vp, lockreq, td)) != 0) {
1351			MNT_ILOCK(mp);
1352			if (error == ENOENT || error == ENOLCK) {
1353				MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
1354				goto loop;
1355			}
1356			continue;
1357		}
1358		if ((error = ffs_syncvnode(vp, waitfor)) != 0)
1359			allerror = error;
1360		vput(vp);
1361		MNT_ILOCK(mp);
1362	}
1363	MNT_IUNLOCK(mp);
1364	/*
1365	 * Force stale filesystem control information to be flushed.
1366	 */
1367	if (waitfor == MNT_WAIT) {
1368		if ((error = softdep_flushworklist(ump->um_mountp, &count, td)))
1369			allerror = error;
1370		/* Flushed work items may create new vnodes to clean */
1371		if (allerror == 0 && count) {
1372			MNT_ILOCK(mp);
1373			goto loop;
1374		}
1375	}
1376#ifdef QUOTA
1377	qsync(mp);
1378#endif
1379	devvp = ump->um_devvp;
1380	bo = &devvp->v_bufobj;
1381	BO_LOCK(bo);
1382	if (waitfor != MNT_LAZY &&
1383	    (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) {
1384		BO_UNLOCK(bo);
1385		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1386		if ((error = VOP_FSYNC(devvp, waitfor, td)) != 0)
1387			allerror = error;
1388		VOP_UNLOCK(devvp, 0);
1389		if (allerror == 0 && waitfor == MNT_WAIT) {
1390			MNT_ILOCK(mp);
1391			goto loop;
1392		}
1393	} else if (suspend != 0) {
1394		if (softdep_check_suspend(mp,
1395					  devvp,
1396					  softdep_deps,
1397					  softdep_accdeps,
1398					  secondary_writes,
1399					  secondary_accwrites) != 0)
1400			goto loop;	/* More work needed */
1401		mtx_assert(MNT_MTX(mp), MA_OWNED);
1402		mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED;
1403		MNT_IUNLOCK(mp);
1404		suspended = 1;
1405	} else
1406		BO_UNLOCK(bo);
1407	/*
1408	 * Write back modified superblock.
1409	 */
1410	if (fs->fs_fmod != 0 &&
1411	    (error = ffs_sbupdate(ump, waitfor, suspended)) != 0)
1412		allerror = error;
1413	return (allerror);
1414}
1415
1416int
1417ffs_vget(mp, ino, flags, vpp)
1418	struct mount *mp;
1419	ino_t ino;
1420	int flags;
1421	struct vnode **vpp;
1422{
1423	return (ffs_vgetf(mp, ino, flags, vpp, 0));
1424}
1425
/*
 * Fetch inode `ino' on mount `mp', returning a locked, referenced
 * vnode in *vpp.  ffs_flags may contain FFSV_FORCEINSMQ, which sets
 * VV_FORCEINSMQ around the insmntque() call (presumably to allow
 * insertion while an unmount is in progress -- confirm against
 * insmntque()'s checks).
 */
int
ffs_vgetf(mp, ino, flags, vpp, ffs_flags)
	struct mount *mp;
	ino_t ino;
	int flags;
	struct vnode **vpp;
	int ffs_flags;
{
	struct fs *fs;
	struct inode *ip;
	struct ufsmount *ump;
	struct buf *bp;
	struct vnode *vp;
	struct cdev *dev;
	int error;

	/* Fast path: the vnode may already be in the vfs hash. */
	error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL);
	if (error || *vpp != NULL)
		return (error);

	/*
	 * We must promote to an exclusive lock for vnode creation.  This
	 * can happen if lookup is passed LOCKSHARED.
	 */
	if ((flags & LK_TYPE_MASK) == LK_SHARED) {
		flags &= ~LK_TYPE_MASK;
		flags |= LK_EXCLUSIVE;
	}

	/*
	 * We do not lock vnode creation as it is believed to be too
	 * expensive for such rare case as simultaneous creation of vnode
	 * for same ino by different processes. We just allow them to race
	 * and check later to decide who wins. Let the race begin!
	 */

	ump = VFSTOUFS(mp);
	dev = ump->um_dev;
	fs = ump->um_fs;

	/*
	 * If this malloc() is performed after the getnewvnode()
	 * it might block, leaving a vnode with a NULL v_data to be
	 * found by ffs_sync() if a sync happens to fire right then,
	 * which will cause a panic because ffs_sync() blindly
	 * dereferences vp->v_data (as well it should).
	 */
	ip = uma_zalloc(uma_inode, M_WAITOK | M_ZERO);

	/* Allocate a new vnode/inode. */
	if (fs->fs_magic == FS_UFS1_MAGIC)
		error = getnewvnode("ufs", mp, &ffs_vnodeops1, &vp);
	else
		error = getnewvnode("ufs", mp, &ffs_vnodeops2, &vp);
	if (error) {
		*vpp = NULL;
		uma_zfree(uma_inode, ip);
		return (error);
	}
	/*
	 * FFS supports recursive locking.
	 */
	VN_LOCK_AREC(vp);
	vp->v_data = ip;
	vp->v_bufobj.bo_bsize = fs->fs_bsize;
	ip->i_vnode = vp;
	ip->i_ump = ump;
	ip->i_fs = fs;
	ip->i_dev = dev;
	ip->i_number = ino;
	ip->i_ea_refs = 0;
#ifdef QUOTA
	{
		int i;
		for (i = 0; i < MAXQUOTAS; i++)
			ip->i_dquot[i] = NODQUOT;
	}
#endif

	/* Hold the vnode lock across hash insertion and disk read. */
	lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
	if (ffs_flags & FFSV_FORCEINSMQ)
		vp->v_vflag |= VV_FORCEINSMQ;
	error = insmntque(vp, mp);
	if (error != 0) {
		/*
		 * NOTE(review): insmntque() is expected to dispose of
		 * the vnode on failure; verify that ffs_reclaim cannot
		 * also free ip (v_data was set above) before the
		 * uma_zfree() here, which would be a double free.
		 */
		uma_zfree(uma_inode, ip);
		*vpp = NULL;
		return (error);
	}
	vp->v_vflag &= ~VV_FORCEINSMQ;
	/* Losing the creation race returns the winner's vnode in *vpp. */
	error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL);
	if (error || *vpp != NULL)
		return (error);

	/* Read in the disk contents for the inode, copy into the inode. */
	error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)),
	    (int)fs->fs_bsize, NOCRED, &bp);
	if (error) {
		/*
		 * The inode does not contain anything useful, so it would
		 * be misleading to leave it on its hash chain. With mode
		 * still zero, it will be unlinked and returned to the free
		 * list by vput().
		 */
		brelse(bp);
		vput(vp);
		*vpp = NULL;
		return (error);
	}
	/* Allocate the dinode from the zone matching the fs format. */
	if (ip->i_ump->um_fstype == UFS1)
		ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK);
	else
		ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK);
	ffs_load_inode(bp, ip, fs, ino);
	if (DOINGSOFTDEP(vp))
		softdep_load_inodeblock(ip);
	else
		ip->i_effnlink = ip->i_nlink;
	bqrelse(bp);

	/*
	 * Initialize the vnode from the inode, check for aliases.
	 * Note that the underlying vnode may have changed.
	 */
	if (ip->i_ump->um_fstype == UFS1)
		error = ufs_vinit(mp, &ffs_fifoops1, &vp);
	else
		error = ufs_vinit(mp, &ffs_fifoops2, &vp);
	if (error) {
		vput(vp);
		*vpp = NULL;
		return (error);
	}

	/*
	 * Finish inode initialization.
	 */
	if (vp->v_type != VFIFO) {
		/* FFS supports shared locking for all files except fifos. */
		VN_LOCK_ASHARE(vp);
	}

	/*
	 * Set up a generation number for this inode if it does not
	 * already have one. This should only happen on old filesystems.
	 */
	if (ip->i_gen == 0) {
		ip->i_gen = arc4random() / 2 + 1;
		if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
			ip->i_flag |= IN_MODIFIED;
			DIP_SET(ip, i_gen, ip->i_gen);
		}
	}
	/*
	 * Ensure that uid and gid are correct. This is a temporary
	 * fix until fsck has been changed to do the update.
	 */
	if (fs->fs_magic == FS_UFS1_MAGIC &&		/* XXX */
	    fs->fs_old_inodefmt < FS_44INODEFMT) {	/* XXX */
		ip->i_uid = ip->i_din1->di_ouid;	/* XXX */
		ip->i_gid = ip->i_din1->di_ogid;	/* XXX */
	}						/* XXX */

#ifdef MAC
	if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) {
		/*
		 * If this vnode is already allocated, and we're running
		 * multi-label, attempt to perform a label association
		 * from the extended attributes on the inode.
		 */
		error = mac_vnode_associate_extattr(mp, vp);
		if (error) {
			/* ufs_inactive will release ip->i_devvp ref. */
			vput(vp);
			*vpp = NULL;
			return (error);
		}
	}
#endif

	*vpp = vp;
	return (0);
}
1608
1609/*
1610 * File handle to vnode
1611 *
1612 * Have to be really careful about stale file handles:
1613 * - check that the inode number is valid
1614 * - call ffs_vget() to get the locked inode
1615 * - check for an unallocated inode (i_mode == 0)
1616 * - check that the given client host has export rights and return
1617 *   those rights via. exflagsp and credanonp
1618 */
1619static int
1620ffs_fhtovp(mp, fhp, vpp)
1621	struct mount *mp;
1622	struct fid *fhp;
1623	struct vnode **vpp;
1624{
1625	struct ufid *ufhp;
1626	struct fs *fs;
1627
1628	ufhp = (struct ufid *)fhp;
1629	fs = VFSTOUFS(mp)->um_fs;
1630	if (ufhp->ufid_ino < ROOTINO ||
1631	    ufhp->ufid_ino >= fs->fs_ncg * fs->fs_ipg)
1632		return (ESTALE);
1633	return (ufs_fhtovp(mp, ufhp, vpp));
1634}
1635
1636/*
1637 * Initialize the filesystem.
1638 */
1639static int
1640ffs_init(vfsp)
1641	struct vfsconf *vfsp;
1642{
1643
1644	softdep_initialize();
1645	return (ufs_init(vfsp));
1646}
1647
1648/*
1649 * Undo the work of ffs_init().
1650 */
1651static int
1652ffs_uninit(vfsp)
1653	struct vfsconf *vfsp;
1654{
1655	int ret;
1656
1657	ret = ufs_uninit(vfsp);
1658	softdep_uninitialize();
1659	return (ret);
1660}
1661
1662/*
1663 * Write a superblock and associated information back to disk.
1664 */
1665int
1666ffs_sbupdate(mp, waitfor, suspended)
1667	struct ufsmount *mp;
1668	int waitfor;
1669	int suspended;
1670{
1671	struct fs *fs = mp->um_fs;
1672	struct buf *sbbp;
1673	struct buf *bp;
1674	int blks;
1675	void *space;
1676	int i, size, error, allerror = 0;
1677
1678	if (fs->fs_ronly == 1 &&
1679	    (mp->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) !=
1680	    (MNT_RDONLY | MNT_UPDATE))
1681		panic("ffs_sbupdate: write read-only filesystem");
1682	/*
1683	 * We use the superblock's buf to serialize calls to ffs_sbupdate().
1684	 */
1685	sbbp = getblk(mp->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize,
1686	    0, 0, 0);
1687	/*
1688	 * First write back the summary information.
1689	 */
1690	blks = howmany(fs->fs_cssize, fs->fs_fsize);
1691	space = fs->fs_csp;
1692	for (i = 0; i < blks; i += fs->fs_frag) {
1693		size = fs->fs_bsize;
1694		if (i + fs->fs_frag > blks)
1695			size = (blks - i) * fs->fs_fsize;
1696		bp = getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i),
1697		    size, 0, 0, 0);
1698		bcopy(space, bp->b_data, (u_int)size);
1699		space = (char *)space + size;
1700		if (suspended)
1701			bp->b_flags |= B_VALIDSUSPWRT;
1702		if (waitfor != MNT_WAIT)
1703			bawrite(bp);
1704		else if ((error = bwrite(bp)) != 0)
1705			allerror = error;
1706	}
1707	/*
1708	 * Now write back the superblock itself. If any errors occurred
1709	 * up to this point, then fail so that the superblock avoids
1710	 * being written out as clean.
1711	 */
1712	if (allerror) {
1713		brelse(sbbp);
1714		return (allerror);
1715	}
1716	bp = sbbp;
1717	if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 &&
1718	    (fs->fs_flags & FS_FLAGS_UPDATED) == 0) {
1719		printf("%s: correcting fs_sblockloc from %jd to %d\n",
1720		    fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1);
1721		fs->fs_sblockloc = SBLOCK_UFS1;
1722	}
1723	if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 &&
1724	    (fs->fs_flags & FS_FLAGS_UPDATED) == 0) {
1725		printf("%s: correcting fs_sblockloc from %jd to %d\n",
1726		    fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2);
1727		fs->fs_sblockloc = SBLOCK_UFS2;
1728	}
1729	fs->fs_fmod = 0;
1730	fs->fs_time = time_second;
1731	bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
1732	ffs_oldfscompat_write((struct fs *)bp->b_data, mp);
1733	if (suspended)
1734		bp->b_flags |= B_VALIDSUSPWRT;
1735	if (waitfor != MNT_WAIT)
1736		bawrite(bp);
1737	else if ((error = bwrite(bp)) != 0)
1738		allerror = error;
1739	return (allerror);
1740}
1741
/*
 * Control extended attributes: dispatch to the UFS extattr
 * implementation when it is compiled in, otherwise to the generic
 * VFS handler.
 */
static int
ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp,
	int attrnamespace, const char *attrname)
{

#ifdef UFS_EXTATTR
	return (ufs_extattrctl(mp, cmd, filename_vp, attrnamespace,
	    attrname));
#else
	return (vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace,
	    attrname));
#endif
}
1755
/*
 * Return an inode's dinode storage and the inode itself to their
 * UMA zones.  Note the deliberate fallback: even on a UFS1 mount,
 * if i_din1 is NULL a non-NULL i_din2 is still freed.
 */
static void
ffs_ifree(struct ufsmount *ump, struct inode *ip)
{

	if (ump->um_fstype == UFS1 && ip->i_din1 != NULL)
		uma_zfree(uma_ufs1, ip->i_din1);
	else if (ip->i_din2 != NULL)
		uma_zfree(uma_ufs2, ip->i_din2);
	uma_zfree(uma_inode, ip);
}
1766
/* Tunable: enable the background-write optimization in ffs_bufwrite(). */
static int dobkgrdwrite = 1;
SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0,
    "Do background writes (honoring the BV_BKGRDWRITE flag)?");
1770
1771/*
1772 * Complete a background write started from bwrite.
1773 */
1774static void
1775ffs_backgroundwritedone(struct buf *bp)
1776{
1777	struct bufobj *bufobj;
1778	struct buf *origbp;
1779
1780	/*
1781	 * Find the original buffer that we are writing.
1782	 */
1783	bufobj = bp->b_bufobj;
1784	BO_LOCK(bufobj);
1785	if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL)
1786		panic("backgroundwritedone: lost buffer");
1787	/* Grab an extra reference to be dropped by the bufdone() below. */
1788	bufobj_wrefl(bufobj);
1789	BO_UNLOCK(bufobj);
1790	/*
1791	 * Process dependencies then return any unfinished ones.
1792	 */
1793	if (!LIST_EMPTY(&bp->b_dep))
1794		buf_complete(bp);
1795#ifdef SOFTUPDATES
1796	if (!LIST_EMPTY(&bp->b_dep))
1797		softdep_move_dependencies(bp, origbp);
1798#endif
1799	/*
1800	 * This buffer is marked B_NOCACHE so when it is released
1801	 * by biodone it will be tossed.
1802	 */
1803	bp->b_flags |= B_NOCACHE;
1804	bp->b_flags &= ~B_CACHE;
1805	bufdone(bp);
1806	BO_LOCK(bufobj);
1807	/*
1808	 * Clear the BV_BKGRDINPROG flag in the original buffer
1809	 * and awaken it if it is waiting for the write to complete.
1810	 * If BV_BKGRDINPROG is not set in the original buffer it must
1811	 * have been released and re-instantiated - which is not legal.
1812	 */
1813	KASSERT((origbp->b_vflags & BV_BKGRDINPROG),
1814	    ("backgroundwritedone: lost buffer2"));
1815	origbp->b_vflags &= ~BV_BKGRDINPROG;
1816	if (origbp->b_vflags & BV_BKGRDWAIT) {
1817		origbp->b_vflags &= ~BV_BKGRDWAIT;
1818		wakeup(&origbp->b_xflags);
1819	}
1820	BO_UNLOCK(bufobj);
1821}
1822
1823
1824/*
1825 * Write, release buffer on completion.  (Done by iodone
1826 * if async).  Do not bother writing anything if the buffer
1827 * is invalid.
1828 *
1829 * Note that we set B_CACHE here, indicating that buffer is
1830 * fully valid and thus cacheable.  This is true even of NFS
1831 * now so we set it generally.  This could be set either here
1832 * or in biodone() since the I/O is synchronous.  We put it
1833 * here.
1834 */
1835static int
1836ffs_bufwrite(struct buf *bp)
1837{
1838	int oldflags, s;
1839	struct buf *newbp;
1840
1841	CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1842	if (bp->b_flags & B_INVAL) {
1843		brelse(bp);
1844		return (0);
1845	}
1846
1847	oldflags = bp->b_flags;
1848
1849	if (!BUF_ISLOCKED(bp))
1850		panic("bufwrite: buffer is not busy???");
1851	s = splbio();
1852	/*
1853	 * If a background write is already in progress, delay
1854	 * writing this block if it is asynchronous. Otherwise
1855	 * wait for the background write to complete.
1856	 */
1857	BO_LOCK(bp->b_bufobj);
1858	if (bp->b_vflags & BV_BKGRDINPROG) {
1859		if (bp->b_flags & B_ASYNC) {
1860			BO_UNLOCK(bp->b_bufobj);
1861			splx(s);
1862			bdwrite(bp);
1863			return (0);
1864		}
1865		bp->b_vflags |= BV_BKGRDWAIT;
1866		msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj), PRIBIO, "bwrbg", 0);
1867		if (bp->b_vflags & BV_BKGRDINPROG)
1868			panic("bufwrite: still writing");
1869	}
1870	BO_UNLOCK(bp->b_bufobj);
1871
1872	/* Mark the buffer clean */
1873	bundirty(bp);
1874
1875	/*
1876	 * If this buffer is marked for background writing and we
1877	 * do not have to wait for it, make a copy and write the
1878	 * copy so as to leave this buffer ready for further use.
1879	 *
1880	 * This optimization eats a lot of memory.  If we have a page
1881	 * or buffer shortfall we can't do it.
1882	 */
1883	if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) &&
1884	    (bp->b_flags & B_ASYNC) &&
1885	    !vm_page_count_severe() &&
1886	    !buf_dirty_count_severe()) {
1887		KASSERT(bp->b_iodone == NULL,
1888		    ("bufwrite: needs chained iodone (%p)", bp->b_iodone));
1889
1890		/* get a new block */
1891		newbp = geteblk(bp->b_bufsize, GB_NOWAIT_BD);
1892		if (newbp == NULL)
1893			goto normal_write;
1894
1895		/*
1896		 * set it to be identical to the old block.  We have to
1897		 * set b_lblkno and BKGRDMARKER before calling bgetvp()
1898		 * to avoid confusing the splay tree and gbincore().
1899		 */
1900		memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
1901		newbp->b_lblkno = bp->b_lblkno;
1902		newbp->b_xflags |= BX_BKGRDMARKER;
1903		BO_LOCK(bp->b_bufobj);
1904		bp->b_vflags |= BV_BKGRDINPROG;
1905		bgetvp(bp->b_vp, newbp);
1906		BO_UNLOCK(bp->b_bufobj);
1907		newbp->b_bufobj = &bp->b_vp->v_bufobj;
1908		newbp->b_blkno = bp->b_blkno;
1909		newbp->b_offset = bp->b_offset;
1910		newbp->b_iodone = ffs_backgroundwritedone;
1911		newbp->b_flags |= B_ASYNC;
1912		newbp->b_flags &= ~B_INVAL;
1913
1914#ifdef SOFTUPDATES
1915		/* move over the dependencies */
1916		if (!LIST_EMPTY(&bp->b_dep))
1917			softdep_move_dependencies(bp, newbp);
1918#endif
1919
1920		/*
1921		 * Initiate write on the copy, release the original to
1922		 * the B_LOCKED queue so that it cannot go away until
1923		 * the background write completes. If not locked it could go
1924		 * away and then be reconstituted while it was being written.
1925		 * If the reconstituted buffer were written, we could end up
1926		 * with two background copies being written at the same time.
1927		 */
1928		bqrelse(bp);
1929		bp = newbp;
1930	}
1931
1932	/* Let the normal bufwrite do the rest for us */
1933normal_write:
1934	return (bufwrite(bp));
1935}
1936
1937
/*
 * I/O strategy routine for the GEOM-backed device vnode.  For writes:
 * enforce the suspension invariant (no unflagged writes while the
 * mount is MNTK_SUSPENDED), give the snapshot code a chance to copy
 * blocks on write, and start softdep processing, before handing the
 * buffer to GEOM.
 */
static void
ffs_geom_strategy(struct bufobj *bo, struct buf *bp)
{
	struct vnode *vp;
	int error;
	struct buf *tbp;

	vp = bo->__bo_vnode;
	if (bp->b_iocmd == BIO_WRITE) {
		if ((bp->b_flags & B_VALIDSUSPWRT) == 0 &&
		    bp->b_vp != NULL && bp->b_vp->v_mount != NULL &&
		    (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0)
			panic("ffs_geom_strategy: bad I/O");
		bp->b_flags &= ~B_VALIDSUSPWRT;
		if ((vp->v_vflag & VV_COPYONWRITE) &&
		    vp->v_rdev->si_snapdata != NULL) {
			if ((bp->b_flags & B_CLUSTER) != 0) {
				/*
				 * Run copy-on-write over each member of
				 * the cluster individually; an error
				 * fails the whole cluster buffer.
				 */
				runningbufwakeup(bp);
				TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head,
					      b_cluster.cluster_entry) {
					error = ffs_copyonwrite(vp, tbp);
					if (error != 0 &&
					    error != EOPNOTSUPP) {
						bp->b_error = error;
						bp->b_ioflags |= BIO_ERROR;
						bufdone(bp);
						return;
					}
				}
				/* Reinstate accounting dropped above. */
				bp->b_runningbufspace = bp->b_bufsize;
				atomic_add_long(&runningbufspace,
					       bp->b_runningbufspace);
			} else {
				error = ffs_copyonwrite(vp, bp);
				if (error != 0 && error != EOPNOTSUPP) {
					bp->b_error = error;
					bp->b_ioflags |= BIO_ERROR;
					bufdone(bp);
					return;
				}
			}
		}
#ifdef SOFTUPDATES
		if ((bp->b_flags & B_CLUSTER) != 0) {
			TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head,
				      b_cluster.cluster_entry) {
				if (!LIST_EMPTY(&tbp->b_dep))
					buf_start(tbp);
			}
		} else {
			if (!LIST_EMPTY(&bp->b_dep))
				buf_start(bp);
		}

#endif
	}
	g_vfs_strategy(bo, bp);
}
1996
1997#ifdef	DDB
1998
/* Dump one ufsmount's vitals (mount point, softdep counters) to DDB. */
static void
db_print_ffs(struct ufsmount *ump)
{
	db_printf("mp %p %s devvp %p fs %p su_wl %d su_wl_in %d su_deps %d "
		  "su_req %d\n",
	    ump->um_mountp, ump->um_mountp->mnt_stat.f_mntonname,
	    ump->um_devvp, ump->um_fs, ump->softdep_on_worklist,
	    ump->softdep_on_worklist_inprogress, ump->softdep_deps,
	    ump->softdep_req);
}
2009
/*
 * DDB "show ffs [addr]" command: with an address, dump that one
 * struct mount as a UFS mount; without, dump every mounted UFS
 * filesystem on the system.
 */
DB_SHOW_COMMAND(ffs, db_show_ffs)
{
	struct mount *mp;
	struct ufsmount *ump;

	if (have_addr) {
		ump = VFSTOUFS((struct mount *)addr);
		db_print_ffs(ump);
		return;
	}

	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (!strcmp(mp->mnt_stat.f_fstypename, ufs_vfsconf.vfc_name))
			db_print_ffs(VFSTOUFS(mp));
	}
}
2026
2027#endif	/* DDB */
2028