ffs_vfsops.c revision 210172
1/*-
2 * Copyright (c) 1989, 1991, 1993, 1994
3 *	The Regents of the University of California.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 *    may be used to endorse or promote products derived from this software
15 *    without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 *	@(#)ffs_vfsops.c	8.31 (Berkeley) 5/20/95
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/ufs/ffs/ffs_vfsops.c 210172 2010-07-16 19:52:03Z jhb $");
34
35#include "opt_quota.h"
36#include "opt_ufs.h"
37#include "opt_ffs.h"
38#include "opt_ddb.h"
39
40#include <sys/param.h>
41#include <sys/systm.h>
42#include <sys/namei.h>
43#include <sys/priv.h>
44#include <sys/proc.h>
45#include <sys/kernel.h>
46#include <sys/vnode.h>
47#include <sys/mount.h>
48#include <sys/bio.h>
49#include <sys/buf.h>
50#include <sys/conf.h>
51#include <sys/fcntl.h>
52#include <sys/malloc.h>
53#include <sys/mutex.h>
54
55#include <security/mac/mac_framework.h>
56
57#include <ufs/ufs/extattr.h>
58#include <ufs/ufs/gjournal.h>
59#include <ufs/ufs/quota.h>
60#include <ufs/ufs/ufsmount.h>
61#include <ufs/ufs/inode.h>
62#include <ufs/ufs/ufs_extern.h>
63
64#include <ufs/ffs/fs.h>
65#include <ufs/ffs/ffs_extern.h>
66
67#include <vm/vm.h>
68#include <vm/uma.h>
69#include <vm/vm_page.h>
70
71#include <geom/geom.h>
72#include <geom/geom_vfs.h>
73
74#include <ddb/ddb.h>
75
76static uma_zone_t uma_inode, uma_ufs1, uma_ufs2;
77
78static int	ffs_reload(struct mount *, struct thread *);
79static int	ffs_mountfs(struct vnode *, struct mount *, struct thread *);
80static void	ffs_oldfscompat_read(struct fs *, struct ufsmount *,
81		    ufs2_daddr_t);
82static void	ffs_ifree(struct ufsmount *ump, struct inode *ip);
83static vfs_init_t ffs_init;
84static vfs_uninit_t ffs_uninit;
85static vfs_extattrctl_t ffs_extattrctl;
86static vfs_cmount_t ffs_cmount;
87static vfs_unmount_t ffs_unmount;
88static vfs_mount_t ffs_mount;
89static vfs_statfs_t ffs_statfs;
90static vfs_fhtovp_t ffs_fhtovp;
91static vfs_sync_t ffs_sync;
92
93static struct vfsops ufs_vfsops = {
94	.vfs_extattrctl =	ffs_extattrctl,
95	.vfs_fhtovp =		ffs_fhtovp,
96	.vfs_init =		ffs_init,
97	.vfs_mount =		ffs_mount,
98	.vfs_cmount =		ffs_cmount,
99	.vfs_quotactl =		ufs_quotactl,
100	.vfs_root =		ufs_root,
101	.vfs_statfs =		ffs_statfs,
102	.vfs_sync =		ffs_sync,
103	.vfs_uninit =		ffs_uninit,
104	.vfs_unmount =		ffs_unmount,
105	.vfs_vget =		ffs_vget,
106	.vfs_susp_clean =	process_deferred_inactive,
107};
108
109VFS_SET(ufs_vfsops, ufs, 0);
110MODULE_VERSION(ufs, 1);
111
112static b_strategy_t ffs_geom_strategy;
113static b_write_t ffs_bufwrite;
114
115static struct buf_ops ffs_ops = {
116	.bop_name =	"FFS",
117	.bop_write =	ffs_bufwrite,
118	.bop_strategy =	ffs_geom_strategy,
119	.bop_sync =	bufsync,
120#ifdef NO_FFS_SNAPSHOT
121	.bop_bdflush =	bufbdflush,
122#else
123	.bop_bdflush =	ffs_bdflush,
124#endif
125};
126
/*
 * Mount options accepted by ffs_mount(); the list is NULL-terminated and
 * is handed to vfs_filteropt().
 *
 * Note that userquota and groupquota options are not currently used
 * by UFS/FFS code and generally mount(8) does not pass those options
 * from userland, but they can be passed by loader(8) via
 * vfs.root.mountfrom.options.
 */
static const char *ffs_opts[] = {
	"acls",
	"async",
	"noatime",
	"noclusterr",
	"noclusterw",
	"noexec",
	"export",
	"force",
	"from",
	"groupquota",
	"multilabel",
	"nfsv4acls",
	"snapshot",
	"nosuid",
	"suiddir",
	"nosymfollow",
	"sync",
	"union",
	"userquota",
	NULL
};
137
138static int
139ffs_mount(struct mount *mp)
140{
141	struct vnode *devvp;
142	struct thread *td;
143	struct ufsmount *ump = 0;
144	struct fs *fs;
145	int error, flags;
146	u_int mntorflags;
147	accmode_t accmode;
148	struct nameidata ndp;
149	char *fspec;
150
151	td = curthread;
152	if (vfs_filteropt(mp->mnt_optnew, ffs_opts))
153		return (EINVAL);
154	if (uma_inode == NULL) {
155		uma_inode = uma_zcreate("FFS inode",
156		    sizeof(struct inode), NULL, NULL, NULL, NULL,
157		    UMA_ALIGN_PTR, 0);
158		uma_ufs1 = uma_zcreate("FFS1 dinode",
159		    sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL,
160		    UMA_ALIGN_PTR, 0);
161		uma_ufs2 = uma_zcreate("FFS2 dinode",
162		    sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL,
163		    UMA_ALIGN_PTR, 0);
164	}
165
166	vfs_deleteopt(mp->mnt_optnew, "groupquota");
167	vfs_deleteopt(mp->mnt_optnew, "userquota");
168
169	fspec = vfs_getopts(mp->mnt_optnew, "from", &error);
170	if (error)
171		return (error);
172
173	mntorflags = 0;
174	if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0)
175		mntorflags |= MNT_ACLS;
176
177	if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0) {
178		mntorflags |= MNT_SNAPSHOT;
179		/*
180		 * Once we have set the MNT_SNAPSHOT flag, do not
181		 * persist "snapshot" in the options list.
182		 */
183		vfs_deleteopt(mp->mnt_optnew, "snapshot");
184		vfs_deleteopt(mp->mnt_opt, "snapshot");
185	}
186
187	if (vfs_getopt(mp->mnt_optnew, "nfsv4acls", NULL, NULL) == 0) {
188		if (mntorflags & MNT_ACLS) {
189			printf("WARNING: \"acls\" and \"nfsv4acls\" "
190			    "options are mutually exclusive\n");
191			return (EINVAL);
192		}
193		mntorflags |= MNT_NFS4ACLS;
194	}
195
196	MNT_ILOCK(mp);
197	mp->mnt_flag |= mntorflags;
198	MNT_IUNLOCK(mp);
199	/*
200	 * If updating, check whether changing from read-only to
201	 * read/write; if there is no device name, that's all we do.
202	 */
203	if (mp->mnt_flag & MNT_UPDATE) {
204		ump = VFSTOUFS(mp);
205		fs = ump->um_fs;
206		devvp = ump->um_devvp;
207		if (fs->fs_ronly == 0 &&
208		    vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
209			/*
210			 * Flush any dirty data and suspend filesystem.
211			 */
212			if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
213				return (error);
214			for (;;) {
215				vn_finished_write(mp);
216				if ((error = vfs_write_suspend(mp)) != 0)
217					return (error);
218				MNT_ILOCK(mp);
219				if (mp->mnt_kern_flag & MNTK_SUSPENDED) {
220					/*
221					 * Allow the secondary writes
222					 * to proceed.
223					 */
224					mp->mnt_kern_flag &= ~(MNTK_SUSPENDED |
225					    MNTK_SUSPEND2);
226					wakeup(&mp->mnt_flag);
227					MNT_IUNLOCK(mp);
228					/*
229					 * Allow the curthread to
230					 * ignore the suspension to
231					 * synchronize on-disk state.
232					 */
233					td->td_pflags |= TDP_IGNSUSP;
234					break;
235				}
236				MNT_IUNLOCK(mp);
237				vn_start_write(NULL, &mp, V_WAIT);
238			}
239			/*
240			 * Check for and optionally get rid of files open
241			 * for writing.
242			 */
243			flags = WRITECLOSE;
244			if (mp->mnt_flag & MNT_FORCE)
245				flags |= FORCECLOSE;
246			if (mp->mnt_flag & MNT_SOFTDEP) {
247				error = softdep_flushfiles(mp, flags, td);
248			} else {
249				error = ffs_flushfiles(mp, flags, td);
250			}
251			if (error) {
252				vfs_write_resume(mp);
253				return (error);
254			}
255			if (fs->fs_pendingblocks != 0 ||
256			    fs->fs_pendinginodes != 0) {
257				printf("%s: %s: blocks %jd files %d\n",
258				    fs->fs_fsmnt, "update error",
259				    (intmax_t)fs->fs_pendingblocks,
260				    fs->fs_pendinginodes);
261				fs->fs_pendingblocks = 0;
262				fs->fs_pendinginodes = 0;
263			}
264			if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
265				fs->fs_clean = 1;
266			if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
267				fs->fs_ronly = 0;
268				fs->fs_clean = 0;
269				vfs_write_resume(mp);
270				return (error);
271			}
272			DROP_GIANT();
273			g_topology_lock();
274			g_access(ump->um_cp, 0, -1, 0);
275			g_topology_unlock();
276			PICKUP_GIANT();
277			fs->fs_ronly = 1;
278			MNT_ILOCK(mp);
279			mp->mnt_flag |= MNT_RDONLY;
280			MNT_IUNLOCK(mp);
281			/*
282			 * Allow the writers to note that filesystem
283			 * is ro now.
284			 */
285			vfs_write_resume(mp);
286		}
287		if ((mp->mnt_flag & MNT_RELOAD) &&
288		    (error = ffs_reload(mp, td)) != 0)
289			return (error);
290		if (fs->fs_ronly &&
291		    !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
292			/*
293			 * If upgrade to read-write by non-root, then verify
294			 * that user has necessary permissions on the device.
295			 */
296			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
297			error = VOP_ACCESS(devvp, VREAD | VWRITE,
298			    td->td_ucred, td);
299			if (error)
300				error = priv_check(td, PRIV_VFS_MOUNT_PERM);
301			if (error) {
302				VOP_UNLOCK(devvp, 0);
303				return (error);
304			}
305			VOP_UNLOCK(devvp, 0);
306			fs->fs_flags &= ~FS_UNCLEAN;
307			if (fs->fs_clean == 0) {
308				fs->fs_flags |= FS_UNCLEAN;
309				if ((mp->mnt_flag & MNT_FORCE) ||
310				    ((fs->fs_flags &
311				     (FS_SUJ | FS_NEEDSFSCK)) == 0 &&
312				     (fs->fs_flags & FS_DOSOFTDEP))) {
313					printf("WARNING: %s was not %s\n",
314					   fs->fs_fsmnt, "properly dismounted");
315				} else {
316					printf(
317"WARNING: R/W mount of %s denied.  Filesystem is not clean - run fsck\n",
318					    fs->fs_fsmnt);
319					if (fs->fs_flags & FS_SUJ)
320						printf(
321"WARNING: Forced mount will invalidated journal contents\n");
322					return (EPERM);
323				}
324			}
325			DROP_GIANT();
326			g_topology_lock();
327			/*
328			 * If we're the root device, we may not have an E count
329			 * yet, get it now.
330			 */
331			if (ump->um_cp->ace == 0)
332				error = g_access(ump->um_cp, 0, 1, 1);
333			else
334				error = g_access(ump->um_cp, 0, 1, 0);
335			g_topology_unlock();
336			PICKUP_GIANT();
337			if (error)
338				return (error);
339			if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0)
340				return (error);
341			fs->fs_ronly = 0;
342			MNT_ILOCK(mp);
343			mp->mnt_flag &= ~MNT_RDONLY;
344			MNT_IUNLOCK(mp);
345			fs->fs_mtime = time_second;
346			/* check to see if we need to start softdep */
347			if ((fs->fs_flags & FS_DOSOFTDEP) &&
348			    (error = softdep_mount(devvp, mp, fs, td->td_ucred))){
349				vn_finished_write(mp);
350				return (error);
351			}
352			fs->fs_clean = 0;
353			if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) {
354				vn_finished_write(mp);
355				return (error);
356			}
357			if (fs->fs_snapinum[0] != 0)
358				ffs_snapshot_mount(mp);
359			vn_finished_write(mp);
360		}
361		/*
362		 * Soft updates is incompatible with "async",
363		 * so if we are doing softupdates stop the user
364		 * from setting the async flag in an update.
365		 * Softdep_mount() clears it in an initial mount
366		 * or ro->rw remount.
367		 */
368		if (mp->mnt_flag & MNT_SOFTDEP) {
369			/* XXX: Reset too late ? */
370			MNT_ILOCK(mp);
371			mp->mnt_flag &= ~MNT_ASYNC;
372			MNT_IUNLOCK(mp);
373		}
374		/*
375		 * Keep MNT_ACLS flag if it is stored in superblock.
376		 */
377		if ((fs->fs_flags & FS_ACLS) != 0) {
378			/* XXX: Set too late ? */
379			MNT_ILOCK(mp);
380			mp->mnt_flag |= MNT_ACLS;
381			MNT_IUNLOCK(mp);
382		}
383
384		if ((fs->fs_flags & FS_NFS4ACLS) != 0) {
385			/* XXX: Set too late ? */
386			MNT_ILOCK(mp);
387			mp->mnt_flag |= MNT_NFS4ACLS;
388			MNT_IUNLOCK(mp);
389		}
390
391		/*
392		 * If this is a snapshot request, take the snapshot.
393		 */
394		if (mp->mnt_flag & MNT_SNAPSHOT)
395			return (ffs_snapshot(mp, fspec));
396	}
397
398	/*
399	 * Not an update, or updating the name: look up the name
400	 * and verify that it refers to a sensible disk device.
401	 */
402	NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td);
403	if ((error = namei(&ndp)) != 0)
404		return (error);
405	NDFREE(&ndp, NDF_ONLY_PNBUF);
406	devvp = ndp.ni_vp;
407	if (!vn_isdisk(devvp, &error)) {
408		vput(devvp);
409		return (error);
410	}
411
412	/*
413	 * If mount by non-root, then verify that user has necessary
414	 * permissions on the device.
415	 */
416	accmode = VREAD;
417	if ((mp->mnt_flag & MNT_RDONLY) == 0)
418		accmode |= VWRITE;
419	error = VOP_ACCESS(devvp, accmode, td->td_ucred, td);
420	if (error)
421		error = priv_check(td, PRIV_VFS_MOUNT_PERM);
422	if (error) {
423		vput(devvp);
424		return (error);
425	}
426
427	if (mp->mnt_flag & MNT_UPDATE) {
428		/*
429		 * Update only
430		 *
431		 * If it's not the same vnode, or at least the same device
432		 * then it's not correct.
433		 */
434
435		if (devvp->v_rdev != ump->um_devvp->v_rdev)
436			error = EINVAL;	/* needs translation */
437		vput(devvp);
438		if (error)
439			return (error);
440	} else {
441		/*
442		 * New mount
443		 *
444		 * We need the name for the mount point (also used for
445		 * "last mounted on") copied in. If an error occurs,
446		 * the mount point is discarded by the upper level code.
447		 * Note that vfs_mount() populates f_mntonname for us.
448		 */
449		if ((error = ffs_mountfs(devvp, mp, td)) != 0) {
450			vrele(devvp);
451			return (error);
452		}
453	}
454	vfs_mountedfrom(mp, fspec);
455	return (0);
456}
457
458/*
459 * Compatibility with old mount system call.
460 */
461
462static int
463ffs_cmount(struct mntarg *ma, void *data, int flags)
464{
465	struct ufs_args args;
466	int error;
467
468	if (data == NULL)
469		return (EINVAL);
470	error = copyin(data, &args, sizeof args);
471	if (error)
472		return (error);
473
474	ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN);
475	ma = mount_arg(ma, "export", &args.export, sizeof args.export);
476	error = kernel_mount(ma, flags);
477
478	return (error);
479}
480
481/*
482 * Reload all incore data for a filesystem (used after running fsck on
483 * the root filesystem and finding things to fix). The filesystem must
484 * be mounted read-only.
485 *
486 * Things to do to update the mount:
487 *	1) invalidate all cached meta-data.
488 *	2) re-read superblock from disk.
489 *	3) re-read summary information from disk.
490 *	4) invalidate all inactive vnodes.
491 *	5) invalidate all cached file data.
492 *	6) re-read inode data for all active vnodes.
493 */
494static int
495ffs_reload(struct mount *mp, struct thread *td)
496{
497	struct vnode *vp, *mvp, *devvp;
498	struct inode *ip;
499	void *space;
500	struct buf *bp;
501	struct fs *fs, *newfs;
502	struct ufsmount *ump;
503	ufs2_daddr_t sblockloc;
504	int i, blks, size, error;
505	int32_t *lp;
506
507	if ((mp->mnt_flag & MNT_RDONLY) == 0)
508		return (EINVAL);
509	ump = VFSTOUFS(mp);
510	/*
511	 * Step 1: invalidate all cached meta-data.
512	 */
513	devvp = VFSTOUFS(mp)->um_devvp;
514	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
515	if (vinvalbuf(devvp, 0, 0, 0) != 0)
516		panic("ffs_reload: dirty1");
517	VOP_UNLOCK(devvp, 0);
518
519	/*
520	 * Step 2: re-read superblock from disk.
521	 */
522	fs = VFSTOUFS(mp)->um_fs;
523	if ((error = bread(devvp, btodb(fs->fs_sblockloc), fs->fs_sbsize,
524	    NOCRED, &bp)) != 0)
525		return (error);
526	newfs = (struct fs *)bp->b_data;
527	if ((newfs->fs_magic != FS_UFS1_MAGIC &&
528	     newfs->fs_magic != FS_UFS2_MAGIC) ||
529	    newfs->fs_bsize > MAXBSIZE ||
530	    newfs->fs_bsize < sizeof(struct fs)) {
531			brelse(bp);
532			return (EIO);		/* XXX needs translation */
533	}
534	/*
535	 * Copy pointer fields back into superblock before copying in	XXX
536	 * new superblock. These should really be in the ufsmount.	XXX
537	 * Note that important parameters (eg fs_ncg) are unchanged.
538	 */
539	newfs->fs_csp = fs->fs_csp;
540	newfs->fs_maxcluster = fs->fs_maxcluster;
541	newfs->fs_contigdirs = fs->fs_contigdirs;
542	newfs->fs_active = fs->fs_active;
543	/* The file system is still read-only. */
544	newfs->fs_ronly = 1;
545	sblockloc = fs->fs_sblockloc;
546	bcopy(newfs, fs, (u_int)fs->fs_sbsize);
547	brelse(bp);
548	mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen;
549	ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc);
550	UFS_LOCK(ump);
551	if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
552		printf("%s: reload pending error: blocks %jd files %d\n",
553		    fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
554		    fs->fs_pendinginodes);
555		fs->fs_pendingblocks = 0;
556		fs->fs_pendinginodes = 0;
557	}
558	UFS_UNLOCK(ump);
559
560	/*
561	 * Step 3: re-read summary information from disk.
562	 */
563	blks = howmany(fs->fs_cssize, fs->fs_fsize);
564	space = fs->fs_csp;
565	for (i = 0; i < blks; i += fs->fs_frag) {
566		size = fs->fs_bsize;
567		if (i + fs->fs_frag > blks)
568			size = (blks - i) * fs->fs_fsize;
569		error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size,
570		    NOCRED, &bp);
571		if (error)
572			return (error);
573		bcopy(bp->b_data, space, (u_int)size);
574		space = (char *)space + size;
575		brelse(bp);
576	}
577	/*
578	 * We no longer know anything about clusters per cylinder group.
579	 */
580	if (fs->fs_contigsumsize > 0) {
581		lp = fs->fs_maxcluster;
582		for (i = 0; i < fs->fs_ncg; i++)
583			*lp++ = fs->fs_contigsumsize;
584	}
585
586loop:
587	MNT_ILOCK(mp);
588	MNT_VNODE_FOREACH(vp, mp, mvp) {
589		VI_LOCK(vp);
590		if (vp->v_iflag & VI_DOOMED) {
591			VI_UNLOCK(vp);
592			continue;
593		}
594		MNT_IUNLOCK(mp);
595		/*
596		 * Step 4: invalidate all cached file data.
597		 */
598		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
599			MNT_VNODE_FOREACH_ABORT(mp, mvp);
600			goto loop;
601		}
602		if (vinvalbuf(vp, 0, 0, 0))
603			panic("ffs_reload: dirty2");
604		/*
605		 * Step 5: re-read inode data for all active vnodes.
606		 */
607		ip = VTOI(vp);
608		error =
609		    bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
610		    (int)fs->fs_bsize, NOCRED, &bp);
611		if (error) {
612			VOP_UNLOCK(vp, 0);
613			vrele(vp);
614			MNT_VNODE_FOREACH_ABORT(mp, mvp);
615			return (error);
616		}
617		ffs_load_inode(bp, ip, fs, ip->i_number);
618		ip->i_effnlink = ip->i_nlink;
619		brelse(bp);
620		VOP_UNLOCK(vp, 0);
621		vrele(vp);
622		MNT_ILOCK(mp);
623	}
624	MNT_IUNLOCK(mp);
625	return (0);
626}
627
/*
 * Possible superblock locations ordered from most to least likely.
 * The list ends with -1; ffs_mountfs() tries each entry in turn until it
 * finds an acceptable superblock or reaches that terminator.
 */
static int sblock_try[] = SBLOCKSEARCH;
632
633/*
634 * Common code for mount and mountroot
635 */
636static int
637ffs_mountfs(devvp, mp, td)
638	struct vnode *devvp;
639	struct mount *mp;
640	struct thread *td;
641{
642	struct ufsmount *ump;
643	struct buf *bp;
644	struct fs *fs;
645	struct cdev *dev;
646	void *space;
647	ufs2_daddr_t sblockloc;
648	int error, i, blks, size, ronly;
649	int32_t *lp;
650	struct ucred *cred;
651	struct g_consumer *cp;
652	struct mount *nmp;
653
654	bp = NULL;
655	ump = NULL;
656	cred = td ? td->td_ucred : NOCRED;
657	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
658
659	dev = devvp->v_rdev;
660	dev_ref(dev);
661	DROP_GIANT();
662	g_topology_lock();
663	error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1);
664
665	/*
666	 * If we are a root mount, drop the E flag so fsck can do its magic.
667	 * We will pick it up again when we remount R/W.
668	 */
669	if (error == 0 && ronly && (mp->mnt_flag & MNT_ROOTFS))
670		error = g_access(cp, 0, 0, -1);
671	g_topology_unlock();
672	PICKUP_GIANT();
673	VOP_UNLOCK(devvp, 0);
674	if (error)
675		goto out;
676	if (devvp->v_rdev->si_iosize_max != 0)
677		mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max;
678	if (mp->mnt_iosize_max > MAXPHYS)
679		mp->mnt_iosize_max = MAXPHYS;
680
681	devvp->v_bufobj.bo_ops = &ffs_ops;
682
683	fs = NULL;
684	sblockloc = 0;
685	/*
686	 * Try reading the superblock in each of its possible locations.
687	 */
688	for (i = 0; sblock_try[i] != -1; i++) {
689		if ((SBLOCKSIZE % cp->provider->sectorsize) != 0) {
690			error = EINVAL;
691			vfs_mount_error(mp,
692			    "Invalid sectorsize %d for superblock size %d",
693			    cp->provider->sectorsize, SBLOCKSIZE);
694			goto out;
695		}
696		if ((error = bread(devvp, btodb(sblock_try[i]), SBLOCKSIZE,
697		    cred, &bp)) != 0)
698			goto out;
699		fs = (struct fs *)bp->b_data;
700		sblockloc = sblock_try[i];
701		if ((fs->fs_magic == FS_UFS1_MAGIC ||
702		     (fs->fs_magic == FS_UFS2_MAGIC &&
703		      (fs->fs_sblockloc == sblockloc ||
704		       (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0))) &&
705		    fs->fs_bsize <= MAXBSIZE &&
706		    fs->fs_bsize >= sizeof(struct fs))
707			break;
708		brelse(bp);
709		bp = NULL;
710	}
711	if (sblock_try[i] == -1) {
712		error = EINVAL;		/* XXX needs translation */
713		goto out;
714	}
715	fs->fs_fmod = 0;
716	fs->fs_flags &= ~FS_INDEXDIRS;	/* no support for directory indicies */
717	fs->fs_flags &= ~FS_UNCLEAN;
718	if (fs->fs_clean == 0) {
719		fs->fs_flags |= FS_UNCLEAN;
720		if (ronly || (mp->mnt_flag & MNT_FORCE) ||
721		    ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 &&
722		     (fs->fs_flags & FS_DOSOFTDEP))) {
723			printf(
724"WARNING: %s was not properly dismounted\n",
725			    fs->fs_fsmnt);
726		} else {
727			printf(
728"WARNING: R/W mount of %s denied.  Filesystem is not clean - run fsck\n",
729			    fs->fs_fsmnt);
730			if (fs->fs_flags & FS_SUJ)
731				printf(
732"WARNING: Forced mount will invalidated journal contents\n");
733			error = EPERM;
734			goto out;
735		}
736		if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) &&
737		    (mp->mnt_flag & MNT_FORCE)) {
738			printf("%s: lost blocks %jd files %d\n", fs->fs_fsmnt,
739			    (intmax_t)fs->fs_pendingblocks,
740			    fs->fs_pendinginodes);
741			fs->fs_pendingblocks = 0;
742			fs->fs_pendinginodes = 0;
743		}
744	}
745	if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
746		printf("%s: mount pending error: blocks %jd files %d\n",
747		    fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
748		    fs->fs_pendinginodes);
749		fs->fs_pendingblocks = 0;
750		fs->fs_pendinginodes = 0;
751	}
752	if ((fs->fs_flags & FS_GJOURNAL) != 0) {
753#ifdef UFS_GJOURNAL
754		/*
755		 * Get journal provider name.
756		 */
757		size = 1024;
758		mp->mnt_gjprovider = malloc(size, M_UFSMNT, M_WAITOK);
759		if (g_io_getattr("GJOURNAL::provider", cp, &size,
760		    mp->mnt_gjprovider) == 0) {
761			mp->mnt_gjprovider = realloc(mp->mnt_gjprovider, size,
762			    M_UFSMNT, M_WAITOK);
763			MNT_ILOCK(mp);
764			mp->mnt_flag |= MNT_GJOURNAL;
765			MNT_IUNLOCK(mp);
766		} else {
767			printf(
768"WARNING: %s: GJOURNAL flag on fs but no gjournal provider below\n",
769			    mp->mnt_stat.f_mntonname);
770			free(mp->mnt_gjprovider, M_UFSMNT);
771			mp->mnt_gjprovider = NULL;
772		}
773#else
774		printf(
775"WARNING: %s: GJOURNAL flag on fs but no UFS_GJOURNAL support\n",
776		    mp->mnt_stat.f_mntonname);
777#endif
778	} else {
779		mp->mnt_gjprovider = NULL;
780	}
781	ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO);
782	ump->um_cp = cp;
783	ump->um_bo = &devvp->v_bufobj;
784	ump->um_fs = malloc((u_long)fs->fs_sbsize, M_UFSMNT, M_WAITOK);
785	if (fs->fs_magic == FS_UFS1_MAGIC) {
786		ump->um_fstype = UFS1;
787		ump->um_balloc = ffs_balloc_ufs1;
788	} else {
789		ump->um_fstype = UFS2;
790		ump->um_balloc = ffs_balloc_ufs2;
791	}
792	ump->um_blkatoff = ffs_blkatoff;
793	ump->um_truncate = ffs_truncate;
794	ump->um_update = ffs_update;
795	ump->um_valloc = ffs_valloc;
796	ump->um_vfree = ffs_vfree;
797	ump->um_ifree = ffs_ifree;
798	ump->um_rdonly = ffs_rdonly;
799	mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF);
800	bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize);
801	if (fs->fs_sbsize < SBLOCKSIZE)
802		bp->b_flags |= B_INVAL | B_NOCACHE;
803	brelse(bp);
804	bp = NULL;
805	fs = ump->um_fs;
806	ffs_oldfscompat_read(fs, ump, sblockloc);
807	fs->fs_ronly = ronly;
808	size = fs->fs_cssize;
809	blks = howmany(size, fs->fs_fsize);
810	if (fs->fs_contigsumsize > 0)
811		size += fs->fs_ncg * sizeof(int32_t);
812	size += fs->fs_ncg * sizeof(u_int8_t);
813	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
814	fs->fs_csp = space;
815	for (i = 0; i < blks; i += fs->fs_frag) {
816		size = fs->fs_bsize;
817		if (i + fs->fs_frag > blks)
818			size = (blks - i) * fs->fs_fsize;
819		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size,
820		    cred, &bp)) != 0) {
821			free(fs->fs_csp, M_UFSMNT);
822			goto out;
823		}
824		bcopy(bp->b_data, space, (u_int)size);
825		space = (char *)space + size;
826		brelse(bp);
827		bp = NULL;
828	}
829	if (fs->fs_contigsumsize > 0) {
830		fs->fs_maxcluster = lp = space;
831		for (i = 0; i < fs->fs_ncg; i++)
832			*lp++ = fs->fs_contigsumsize;
833		space = lp;
834	}
835	size = fs->fs_ncg * sizeof(u_int8_t);
836	fs->fs_contigdirs = (u_int8_t *)space;
837	bzero(fs->fs_contigdirs, size);
838	fs->fs_active = NULL;
839	mp->mnt_data = ump;
840	mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0];
841	mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1];
842	nmp = NULL;
843	if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 ||
844	    (nmp = vfs_getvfs(&mp->mnt_stat.f_fsid))) {
845		if (nmp)
846			vfs_rel(nmp);
847		vfs_getnewfsid(mp);
848	}
849	mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen;
850	MNT_ILOCK(mp);
851	mp->mnt_flag |= MNT_LOCAL;
852	MNT_IUNLOCK(mp);
853	if ((fs->fs_flags & FS_MULTILABEL) != 0) {
854#ifdef MAC
855		MNT_ILOCK(mp);
856		mp->mnt_flag |= MNT_MULTILABEL;
857		MNT_IUNLOCK(mp);
858#else
859		printf(
860"WARNING: %s: multilabel flag on fs but no MAC support\n",
861		    mp->mnt_stat.f_mntonname);
862#endif
863	}
864	if ((fs->fs_flags & FS_ACLS) != 0) {
865#ifdef UFS_ACL
866		MNT_ILOCK(mp);
867
868		if (mp->mnt_flag & MNT_NFS4ACLS)
869			printf("WARNING: ACLs flag on fs conflicts with "
870			    "\"nfsv4acls\" mount option; option ignored\n");
871		mp->mnt_flag &= ~MNT_NFS4ACLS;
872		mp->mnt_flag |= MNT_ACLS;
873
874		MNT_IUNLOCK(mp);
875#else
876		printf(
877"WARNING: %s: ACLs flag on fs but no ACLs support\n",
878		    mp->mnt_stat.f_mntonname);
879#endif
880	}
881	if ((fs->fs_flags & FS_NFS4ACLS) != 0) {
882#ifdef UFS_ACL
883		MNT_ILOCK(mp);
884
885		if (mp->mnt_flag & MNT_ACLS)
886			printf("WARNING: NFSv4 ACLs flag on fs conflicts with "
887			    "\"acls\" mount option; option ignored\n");
888		mp->mnt_flag &= ~MNT_ACLS;
889		mp->mnt_flag |= MNT_NFS4ACLS;
890
891		MNT_IUNLOCK(mp);
892#else
893		printf(
894"WARNING: %s: NFSv4 ACLs flag on fs but no ACLs support\n",
895		    mp->mnt_stat.f_mntonname);
896#endif
897	}
898
899	ump->um_mountp = mp;
900	ump->um_dev = dev;
901	ump->um_devvp = devvp;
902	ump->um_nindir = fs->fs_nindir;
903	ump->um_bptrtodb = fs->fs_fsbtodb;
904	ump->um_seqinc = fs->fs_frag;
905	for (i = 0; i < MAXQUOTAS; i++)
906		ump->um_quotas[i] = NULLVP;
907#ifdef UFS_EXTATTR
908	ufs_extattr_uepm_init(&ump->um_extattr);
909#endif
910	/*
911	 * Set FS local "last mounted on" information (NULL pad)
912	 */
913	bzero(fs->fs_fsmnt, MAXMNTLEN);
914	strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN);
915	mp->mnt_stat.f_iosize = fs->fs_bsize;
916
917	if( mp->mnt_flag & MNT_ROOTFS) {
918		/*
919		 * Root mount; update timestamp in mount structure.
920		 * this will be used by the common root mount code
921		 * to update the system clock.
922		 */
923		mp->mnt_time = fs->fs_time;
924	}
925
926	if (ronly == 0) {
927		fs->fs_mtime = time_second;
928		if ((fs->fs_flags & FS_DOSOFTDEP) &&
929		    (error = softdep_mount(devvp, mp, fs, cred)) != 0) {
930			free(fs->fs_csp, M_UFSMNT);
931			goto out;
932		}
933		if (fs->fs_snapinum[0] != 0)
934			ffs_snapshot_mount(mp);
935		fs->fs_fmod = 1;
936		fs->fs_clean = 0;
937		(void) ffs_sbupdate(ump, MNT_WAIT, 0);
938	}
939	/*
940	 * Initialize filesystem stat information in mount struct.
941	 */
942	MNT_ILOCK(mp);
943	mp->mnt_kern_flag |= MNTK_MPSAFE | MNTK_LOOKUP_SHARED |
944	    MNTK_EXTENDED_SHARED;
945	MNT_IUNLOCK(mp);
946#ifdef UFS_EXTATTR
947#ifdef UFS_EXTATTR_AUTOSTART
948	/*
949	 *
950	 * Auto-starting does the following:
951	 *	- check for /.attribute in the fs, and extattr_start if so
952	 *	- for each file in .attribute, enable that file with
953	 * 	  an attribute of the same name.
954	 * Not clear how to report errors -- probably eat them.
955	 * This would all happen while the filesystem was busy/not
956	 * available, so would effectively be "atomic".
957	 */
958	(void) ufs_extattr_autostart(mp, td);
959#endif /* !UFS_EXTATTR_AUTOSTART */
960#endif /* !UFS_EXTATTR */
961	return (0);
962out:
963	if (bp)
964		brelse(bp);
965	if (cp != NULL) {
966		DROP_GIANT();
967		g_topology_lock();
968		g_vfs_close(cp);
969		g_topology_unlock();
970		PICKUP_GIANT();
971	}
972	if (ump) {
973		mtx_destroy(UFS_MTX(ump));
974		if (mp->mnt_gjprovider != NULL) {
975			free(mp->mnt_gjprovider, M_UFSMNT);
976			mp->mnt_gjprovider = NULL;
977		}
978		free(ump->um_fs, M_UFSMNT);
979		free(ump, M_UFSMNT);
980		mp->mnt_data = NULL;
981	}
982	dev_rel(dev);
983	return (error);
984}
985
#include <sys/sysctl.h>
/*
 * Debugging knob, exported read-write as "bigcgs" under the _debug sysctl
 * node.  While it is non-zero, ffs_oldfscompat_read() saves fs_cgsize in
 * fs_save_cgsize and replaces it with fs_bsize, and
 * ffs_oldfscompat_write() restores the saved value.
 */
static int bigcgs = 0;
SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0, "");
989
990/*
991 * Sanity checks for loading old filesystem superblocks.
992 * See ffs_oldfscompat_write below for unwound actions.
993 *
994 * XXX - Parts get retired eventually.
995 * Unfortunately new bits get added.
996 */
997static void
998ffs_oldfscompat_read(fs, ump, sblockloc)
999	struct fs *fs;
1000	struct ufsmount *ump;
1001	ufs2_daddr_t sblockloc;
1002{
1003	off_t maxfilesize;
1004
1005	/*
1006	 * If not yet done, update fs_flags location and value of fs_sblockloc.
1007	 */
1008	if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) {
1009		fs->fs_flags = fs->fs_old_flags;
1010		fs->fs_old_flags |= FS_FLAGS_UPDATED;
1011		fs->fs_sblockloc = sblockloc;
1012	}
1013	/*
1014	 * If not yet done, update UFS1 superblock with new wider fields.
1015	 */
1016	if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) {
1017		fs->fs_maxbsize = fs->fs_bsize;
1018		fs->fs_time = fs->fs_old_time;
1019		fs->fs_size = fs->fs_old_size;
1020		fs->fs_dsize = fs->fs_old_dsize;
1021		fs->fs_csaddr = fs->fs_old_csaddr;
1022		fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir;
1023		fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree;
1024		fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree;
1025		fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree;
1026	}
1027	if (fs->fs_magic == FS_UFS1_MAGIC &&
1028	    fs->fs_old_inodefmt < FS_44INODEFMT) {
1029		fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1;
1030		fs->fs_qbmask = ~fs->fs_bmask;
1031		fs->fs_qfmask = ~fs->fs_fmask;
1032	}
1033	if (fs->fs_magic == FS_UFS1_MAGIC) {
1034		ump->um_savedmaxfilesize = fs->fs_maxfilesize;
1035		maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1;
1036		if (fs->fs_maxfilesize > maxfilesize)
1037			fs->fs_maxfilesize = maxfilesize;
1038	}
1039	/* Compatibility for old filesystems */
1040	if (fs->fs_avgfilesize <= 0)
1041		fs->fs_avgfilesize = AVFILESIZ;
1042	if (fs->fs_avgfpdir <= 0)
1043		fs->fs_avgfpdir = AFPDIR;
1044	if (bigcgs) {
1045		fs->fs_save_cgsize = fs->fs_cgsize;
1046		fs->fs_cgsize = fs->fs_bsize;
1047	}
1048}
1049
1050/*
1051 * Unwinding superblock updates for old filesystems.
1052 * See ffs_oldfscompat_read above for details.
1053 *
1054 * XXX - Parts get retired eventually.
1055 * Unfortunately new bits get added.
1056 */
1057void
1058ffs_oldfscompat_write(fs, ump)
1059	struct fs *fs;
1060	struct ufsmount *ump;
1061{
1062
1063	/*
1064	 * Copy back UFS2 updated fields that UFS1 inspects.
1065	 */
1066	if (fs->fs_magic == FS_UFS1_MAGIC) {
1067		fs->fs_old_time = fs->fs_time;
1068		fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir;
1069		fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree;
1070		fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree;
1071		fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree;
1072		fs->fs_maxfilesize = ump->um_savedmaxfilesize;
1073	}
1074	if (bigcgs) {
1075		fs->fs_cgsize = fs->fs_save_cgsize;
1076		fs->fs_save_cgsize = 0;
1077	}
1078}
1079
1080/*
1081 * unmount system call
1082 */
1083static int
1084ffs_unmount(mp, mntflags)
1085	struct mount *mp;
1086	int mntflags;
1087{
1088	struct thread *td;
1089	struct ufsmount *ump = VFSTOUFS(mp);
1090	struct fs *fs;
1091	int error, flags, susp;
1092#ifdef UFS_EXTATTR
1093	int e_restart;
1094#endif
1095
1096	flags = 0;
1097	td = curthread;
1098	fs = ump->um_fs;
1099	if (mntflags & MNT_FORCE) {
1100		flags |= FORCECLOSE;
1101		susp = fs->fs_ronly != 0;
1102	} else
1103		susp = 0;
1104#ifdef UFS_EXTATTR
1105	if ((error = ufs_extattr_stop(mp, td))) {
1106		if (error != EOPNOTSUPP)
1107			printf("ffs_unmount: ufs_extattr_stop returned %d\n",
1108			    error);
1109		e_restart = 0;
1110	} else {
1111		ufs_extattr_uepm_destroy(&ump->um_extattr);
1112		e_restart = 1;
1113	}
1114#endif
1115	if (susp) {
1116		/*
1117		 * dounmount already called vn_start_write().
1118		 */
1119		for (;;) {
1120			vn_finished_write(mp);
1121			if ((error = vfs_write_suspend(mp)) != 0)
1122				return (error);
1123			MNT_ILOCK(mp);
1124			if (mp->mnt_kern_flag & MNTK_SUSPENDED) {
1125				mp->mnt_kern_flag &= ~(MNTK_SUSPENDED |
1126				    MNTK_SUSPEND2);
1127				wakeup(&mp->mnt_flag);
1128				MNT_IUNLOCK(mp);
1129				td->td_pflags |= TDP_IGNSUSP;
1130				break;
1131			}
1132			MNT_IUNLOCK(mp);
1133			vn_start_write(NULL, &mp, V_WAIT);
1134		}
1135	}
1136	if (mp->mnt_flag & MNT_SOFTDEP)
1137		error = softdep_flushfiles(mp, flags, td);
1138	else
1139		error = ffs_flushfiles(mp, flags, td);
1140	if (error != 0 && error != ENXIO)
1141		goto fail;
1142
1143	UFS_LOCK(ump);
1144	if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
1145		printf("%s: unmount pending error: blocks %jd files %d\n",
1146		    fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks,
1147		    fs->fs_pendinginodes);
1148		fs->fs_pendingblocks = 0;
1149		fs->fs_pendinginodes = 0;
1150	}
1151	UFS_UNLOCK(ump);
1152	softdep_unmount(mp);
1153	if (fs->fs_ronly == 0) {
1154		fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1;
1155		error = ffs_sbupdate(ump, MNT_WAIT, 0);
1156		if (error && error != ENXIO) {
1157			fs->fs_clean = 0;
1158			goto fail;
1159		}
1160	}
1161	if (susp) {
1162		vfs_write_resume(mp);
1163		vn_start_write(NULL, &mp, V_WAIT);
1164	}
1165	DROP_GIANT();
1166	g_topology_lock();
1167	g_vfs_close(ump->um_cp);
1168	g_topology_unlock();
1169	PICKUP_GIANT();
1170	vrele(ump->um_devvp);
1171	dev_rel(ump->um_dev);
1172	mtx_destroy(UFS_MTX(ump));
1173	if (mp->mnt_gjprovider != NULL) {
1174		free(mp->mnt_gjprovider, M_UFSMNT);
1175		mp->mnt_gjprovider = NULL;
1176	}
1177	free(fs->fs_csp, M_UFSMNT);
1178	free(fs, M_UFSMNT);
1179	free(ump, M_UFSMNT);
1180	mp->mnt_data = NULL;
1181	MNT_ILOCK(mp);
1182	mp->mnt_flag &= ~MNT_LOCAL;
1183	MNT_IUNLOCK(mp);
1184	return (error);
1185
1186fail:
1187	if (susp) {
1188		vfs_write_resume(mp);
1189		vn_start_write(NULL, &mp, V_WAIT);
1190	}
1191#ifdef UFS_EXTATTR
1192	if (e_restart) {
1193		ufs_extattr_uepm_init(&ump->um_extattr);
1194#ifdef UFS_EXTATTR_AUTOSTART
1195		(void) ufs_extattr_autostart(mp, td);
1196#endif
1197	}
1198#endif
1199
1200	return (error);
1201}
1202
1203/*
1204 * Flush out all the files in a filesystem.
1205 */
1206int
1207ffs_flushfiles(mp, flags, td)
1208	struct mount *mp;
1209	int flags;
1210	struct thread *td;
1211{
1212	struct ufsmount *ump;
1213	int error;
1214
1215	ump = VFSTOUFS(mp);
1216#ifdef QUOTA
1217	if (mp->mnt_flag & MNT_QUOTA) {
1218		int i;
1219		error = vflush(mp, 0, SKIPSYSTEM|flags, td);
1220		if (error)
1221			return (error);
1222		for (i = 0; i < MAXQUOTAS; i++) {
1223			quotaoff(td, mp, i);
1224		}
1225		/*
1226		 * Here we fall through to vflush again to ensure
1227		 * that we have gotten rid of all the system vnodes.
1228		 */
1229	}
1230#endif
1231	ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_flushfiles");
1232	if (ump->um_devvp->v_vflag & VV_COPYONWRITE) {
1233		if ((error = vflush(mp, 0, SKIPSYSTEM | flags, td)) != 0)
1234			return (error);
1235		ffs_snapshot_unmount(mp);
1236		flags |= FORCECLOSE;
1237		/*
1238		 * Here we fall through to vflush again to ensure
1239		 * that we have gotten rid of all the system vnodes.
1240		 */
1241	}
1242        /*
1243	 * Flush all the files.
1244	 */
1245	if ((error = vflush(mp, 0, flags, td)) != 0)
1246		return (error);
1247	/*
1248	 * Flush filesystem metadata.
1249	 */
1250	vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
1251	error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td);
1252	VOP_UNLOCK(ump->um_devvp, 0);
1253	return (error);
1254}
1255
1256/*
1257 * Get filesystem statistics.
1258 */
1259static int
1260ffs_statfs(mp, sbp)
1261	struct mount *mp;
1262	struct statfs *sbp;
1263{
1264	struct ufsmount *ump;
1265	struct fs *fs;
1266
1267	ump = VFSTOUFS(mp);
1268	fs = ump->um_fs;
1269	if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC)
1270		panic("ffs_statfs");
1271	sbp->f_version = STATFS_VERSION;
1272	sbp->f_bsize = fs->fs_fsize;
1273	sbp->f_iosize = fs->fs_bsize;
1274	sbp->f_blocks = fs->fs_dsize;
1275	UFS_LOCK(ump);
1276	sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag +
1277	    fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks);
1278	sbp->f_bavail = freespace(fs, fs->fs_minfree) +
1279	    dbtofsb(fs, fs->fs_pendingblocks);
1280	sbp->f_files =  fs->fs_ncg * fs->fs_ipg - ROOTINO;
1281	sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes;
1282	UFS_UNLOCK(ump);
1283	sbp->f_namemax = NAME_MAX;
1284	return (0);
1285}
1286
1287/*
1288 * Go through the disk queues to initiate sandbagged IO;
1289 * go through the inodes to write those that have been modified;
1290 * initiate the writing of the super block if it has been modified.
1291 *
1292 * Note: we are always called with the filesystem marked `MPBUSY'.
 *
 * waitfor selects the mode: MNT_WAIT makes the pass synchronous with
 * blocking vnode locks; MNT_SUSPEND is handled as MNT_WAIT and, once
 * nothing is left to flush, additionally marks the mount suspended; any
 * other value uses non-blocking (LK_NOWAIT) vnode locks.
 *
 * Returns 0, or the most recent error recorded while syncing vnodes,
 * the soft updates worklist, the device vnode or the superblock.
1293 */
1294static int
1295ffs_sync(mp, waitfor)
1296	struct mount *mp;
1297	int waitfor;
1298{
1299	struct vnode *mvp, *vp, *devvp;
1300	struct thread *td;
1301	struct inode *ip;
1302	struct ufsmount *ump = VFSTOUFS(mp);
1303	struct fs *fs;
1304	int error, count, wait, lockreq, allerror = 0;
1305	int suspend;
1306	int suspended;
1307	int secondary_writes;
1308	int secondary_accwrites;
1309	int softdep_deps;
1310	int softdep_accdeps;
1311	struct bufobj *bo;
1312
1313	td = curthread;
1314	fs = ump->um_fs;
	/* A read-only filesystem must never have a modified superblock. */
1315	if (fs->fs_fmod != 0 && fs->fs_ronly != 0) {		/* XXX */
1316		printf("fs = %s\n", fs->fs_fsmnt);
1317		panic("ffs_sync: rofs mod");
1318	}
1319	/*
1320	 * Write back each (modified) inode.
1321	 */
1322	wait = 0;
1323	suspend = 0;
1324	suspended = 0;
1325	lockreq = LK_EXCLUSIVE | LK_NOWAIT;
	/* A suspend request is a synchronous sync plus the suspend step. */
1326	if (waitfor == MNT_SUSPEND) {
1327		suspend = 1;
1328		waitfor = MNT_WAIT;
1329	}
1330	if (waitfor == MNT_WAIT) {
1331		wait = 1;
1332		lockreq = LK_EXCLUSIVE;
1333	}
	/* vget() is entered with the vnode interlock held (see VI_LOCK below). */
1334	lockreq |= LK_INTERLOCK | LK_SLEEPFAIL;
1335	MNT_ILOCK(mp);
	/* Every jump to `loop' arrives with the mount interlock held. */
1336loop:
1337	/* Grab snapshot of secondary write counts */
1338	secondary_writes = mp->mnt_secondary_writes;
1339	secondary_accwrites = mp->mnt_secondary_accwrites;
1340
1341	/* Grab snapshot of softdep dependency counts */
1342	MNT_IUNLOCK(mp);
1343	softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps);
1344	MNT_ILOCK(mp);
1345
1346	MNT_VNODE_FOREACH(vp, mp, mvp) {
1347		/*
1348		 * Depend on the mntvnode_slock to keep things stable enough
1349		 * for a quick test.  Since there might be hundreds of
1350		 * thousands of vnodes, we cannot afford even a subroutine
1351		 * call unless there's a good chance that we have work to do.
1352		 */
1353		VI_LOCK(vp);
1354		if (vp->v_iflag & VI_DOOMED) {
1355			VI_UNLOCK(vp);
1356			continue;
1357		}
1358		ip = VTOI(vp);
		/* Skip vnodes with no pending inode flags and no dirty buffers. */
1359		if (vp->v_type == VNON || ((ip->i_flag &
1360		    (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 &&
1361		    vp->v_bufobj.bo_dirty.bv_cnt == 0)) {
1362			VI_UNLOCK(vp);
1363			continue;
1364		}
1365		MNT_IUNLOCK(mp);
1366		if ((error = vget(vp, lockreq, td)) != 0) {
1367			MNT_ILOCK(mp);
			/* On ENOENT/ENOLCK abandon this scan and start over. */
1368			if (error == ENOENT || error == ENOLCK) {
1369				MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
1370				goto loop;
1371			}
1372			continue;
1373		}
1374		if ((error = ffs_syncvnode(vp, waitfor)) != 0)
1375			allerror = error;
1376		vput(vp);
1377		MNT_ILOCK(mp);
1378	}
1379	MNT_IUNLOCK(mp);
1380	/*
1381	 * Force stale filesystem control information to be flushed.
1382	 */
1383	if (waitfor == MNT_WAIT) {
1384		if ((error = softdep_flushworklist(ump->um_mountp, &count, td)))
1385			allerror = error;
1386		/* Flushed work items may create new vnodes to clean */
1387		if (allerror == 0 && count) {
1388			MNT_ILOCK(mp);
1389			goto loop;
1390		}
1391	}
1392#ifdef QUOTA
1393	qsync(mp);
1394#endif
1395	devvp = ump->um_devvp;
1396	bo = &devvp->v_bufobj;
1397	BO_LOCK(bo);
	/*
	 * Three cases, each responsible for dropping the bufobj lock:
	 * device I/O outstanding (fsync it and, when waiting, rescan),
	 * suspend requested with the device idle, or nothing to do.
	 */
1398	if (waitfor != MNT_LAZY &&
1399	    (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) {
1400		BO_UNLOCK(bo);
1401		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1402		if ((error = VOP_FSYNC(devvp, waitfor, td)) != 0)
1403			allerror = error;
1404		VOP_UNLOCK(devvp, 0);
1405		if (allerror == 0 && waitfor == MNT_WAIT) {
1406			MNT_ILOCK(mp);
1407			goto loop;
1408		}
1409	} else if (suspend != 0) {
		/*
		 * Called with the bufobj lock still held and given the
		 * counts snapshotted at `loop'.
		 * NOTE(review): presumably it drops the bufobj lock and
		 * returns with the mount interlock held in both outcomes
		 * (the assert below and the `loop' entry condition rely
		 * on that) -- confirm against softdep_check_suspend().
		 */
1410		if (softdep_check_suspend(mp,
1411					  devvp,
1412					  softdep_deps,
1413					  softdep_accdeps,
1414					  secondary_writes,
1415					  secondary_accwrites) != 0)
1416			goto loop;	/* More work needed */
1417		mtx_assert(MNT_MTX(mp), MA_OWNED);
1418		mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED;
1419		MNT_IUNLOCK(mp);
1420		suspended = 1;
1421	} else
1422		BO_UNLOCK(bo);
1423	/*
1424	 * Write back modified superblock.
1425	 */
1426	if (fs->fs_fmod != 0 &&
1427	    (error = ffs_sbupdate(ump, waitfor, suspended)) != 0)
1428		allerror = error;
1429	return (allerror);
1430}
1431
1432int
1433ffs_vget(mp, ino, flags, vpp)
1434	struct mount *mp;
1435	ino_t ino;
1436	int flags;
1437	struct vnode **vpp;
1438{
1439	return (ffs_vgetf(mp, ino, flags, vpp, 0));
1440}
1441
/*
 * Return the vnode for inode `ino' on mount `mp', building it from the
 * on-disk inode when it is not already present in the vfs hash.
 *
 * flags is the lock request handed to the vfs hash calls; a shared
 * request is promoted to exclusive when a new vnode has to be created.
 * ffs_flags carries FFSV_* modifiers; FFSV_FORCEINSMQ sets VV_FORCEINSMQ
 * on the vnode around the insmntque() call.
 *
 * Returns 0 with *vpp set on success.  The explicit failure paths below
 * return an errno value with *vpp set to NULL; the vfs_hash_get() and
 * vfs_hash_insert() paths return whatever those calls stored in *vpp.
 */
1442int
1443ffs_vgetf(mp, ino, flags, vpp, ffs_flags)
1444	struct mount *mp;
1445	ino_t ino;
1446	int flags;
1447	struct vnode **vpp;
1448	int ffs_flags;
1449{
1450	struct fs *fs;
1451	struct inode *ip;
1452	struct ufsmount *ump;
1453	struct buf *bp;
1454	struct vnode *vp;
1455	struct cdev *dev;
1456	int error;
1457
	/* Fast path: done if the lookup failed or produced a vnode. */
1458	error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL);
1459	if (error || *vpp != NULL)
1460		return (error);
1461
1462	/*
1463	 * We must promote to an exclusive lock for vnode creation.  This
1464	 * can happen if lookup is passed LOCKSHARED.
1465	 */
1466	if ((flags & LK_TYPE_MASK) == LK_SHARED) {
1467		flags &= ~LK_TYPE_MASK;
1468		flags |= LK_EXCLUSIVE;
1469	}
1470
1471	/*
1472	 * We do not lock vnode creation as it is believed to be too
1473	 * expensive for such rare case as simultaneous creation of vnode
1474	 * for same ino by different processes. We just allow them to race
1475	 * and check later to decide who wins. Let the race begin!
1476	 */
1477
1478	ump = VFSTOUFS(mp);
1479	dev = ump->um_dev;
1480	fs = ump->um_fs;
1481
1482	/*
1483	 * If this malloc() is performed after the getnewvnode()
1484	 * it might block, leaving a vnode with a NULL v_data to be
1485	 * found by ffs_sync() if a sync happens to fire right then,
1486	 * which will cause a panic because ffs_sync() blindly
1487	 * dereferences vp->v_data (as well it should).
1488	 */
1489	ip = uma_zalloc(uma_inode, M_WAITOK | M_ZERO);
1490
1491	/* Allocate a new vnode/inode. */
1492	if (fs->fs_magic == FS_UFS1_MAGIC)
1493		error = getnewvnode("ufs", mp, &ffs_vnodeops1, &vp);
1494	else
1495		error = getnewvnode("ufs", mp, &ffs_vnodeops2, &vp);
1496	if (error) {
1497		*vpp = NULL;
1498		uma_zfree(uma_inode, ip);
1499		return (error);
1500	}
1501	/*
1502	 * FFS supports recursive locking.
1503	 */
1504	VN_LOCK_AREC(vp);
	/* Cross-link vnode and inode before the vnode becomes visible. */
1505	vp->v_data = ip;
1506	vp->v_bufobj.bo_bsize = fs->fs_bsize;
1507	ip->i_vnode = vp;
1508	ip->i_ump = ump;
1509	ip->i_fs = fs;
1510	ip->i_dev = dev;
1511	ip->i_number = ino;
1512	ip->i_ea_refs = 0;
1513#ifdef QUOTA
1514	{
1515		int i;
1516		for (i = 0; i < MAXQUOTAS; i++)
1517			ip->i_dquot[i] = NODQUOT;
1518	}
1519#endif
1520
	/* Lock the new vnode exclusively before putting it on the mount list. */
1521	lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
1522	if (ffs_flags & FFSV_FORCEINSMQ)
1523		vp->v_vflag |= VV_FORCEINSMQ;
1524	error = insmntque(vp, mp);
1525	if (error != 0) {
		/*
		 * Only the inode is freed here.
		 * NOTE(review): presumably insmntque() has already disposed
		 * of vp on failure -- confirm.
		 */
1526		uma_zfree(uma_inode, ip);
1527		*vpp = NULL;
1528		return (error);
1529	}
1530	vp->v_vflag &= ~VV_FORCEINSMQ;
	/*
	 * Publish the vnode in the hash.  A non-NULL *vpp afterwards
	 * means another vnode for this inode is being returned instead.
	 * NOTE(review): presumably vfs_hash_insert() disposes of vp in
	 * both early-return cases -- confirm.
	 */
1531	error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL);
1532	if (error || *vpp != NULL)
1533		return (error);
1534
1535	/* Read in the disk contents for the inode, copy into the inode. */
1536	error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)),
1537	    (int)fs->fs_bsize, NOCRED, &bp);
1538	if (error) {
1539		/*
1540		 * The inode does not contain anything useful, so it would
1541		 * be misleading to leave it on its hash chain. With mode
1542		 * still zero, it will be unlinked and returned to the free
1543		 * list by vput().
1544		 */
1545		brelse(bp);
1546		vput(vp);
1547		*vpp = NULL;
1548		return (error);
1549	}
	/* Allocate the in-core dinode matching the filesystem format. */
1550	if (ip->i_ump->um_fstype == UFS1)
1551		ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK);
1552	else
1553		ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK);
1554	ffs_load_inode(bp, ip, fs, ino);
	/* Without soft updates the effective link count is the on-disk one. */
1555	if (DOINGSOFTDEP(vp))
1556		softdep_load_inodeblock(ip);
1557	else
1558		ip->i_effnlink = ip->i_nlink;
1559	bqrelse(bp);
1560
1561	/*
1562	 * Initialize the vnode from the inode, check for aliases.
1563	 * Note that the underlying vnode may have changed.
1564	 */
1565	if (ip->i_ump->um_fstype == UFS1)
1566		error = ufs_vinit(mp, &ffs_fifoops1, &vp);
1567	else
1568		error = ufs_vinit(mp, &ffs_fifoops2, &vp);
1569	if (error) {
1570		vput(vp);
1571		*vpp = NULL;
1572		return (error);
1573	}
1574
1575	/*
1576	 * Finish inode initialization.
1577	 */
1578	if (vp->v_type != VFIFO) {
1579		/* FFS supports shared locking for all files except fifos. */
1580		VN_LOCK_ASHARE(vp);
1581	}
1582
1583	/*
1584	 * Set up a generation number for this inode if it does not
1585	 * already have one. This should only happen on old filesystems.
1586	 */
1587	if (ip->i_gen == 0) {
1588		ip->i_gen = arc4random() / 2 + 1;
		/* Only a writable mount gets the new value stored in the dinode. */
1589		if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
1590			ip->i_flag |= IN_MODIFIED;
1591			DIP_SET(ip, i_gen, ip->i_gen);
1592		}
1593	}
1594#ifdef MAC
1595	if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) {
1596		/*
1597		 * If this vnode is already allocated, and we're running
1598		 * multi-label, attempt to perform a label association
1599		 * from the extended attributes on the inode.
1600		 */
1601		error = mac_vnode_associate_extattr(mp, vp);
1602		if (error) {
1603			/* ufs_inactive will release ip->i_devvp ref. */
1604			vput(vp);
1605			*vpp = NULL;
1606			return (error);
1607		}
1608	}
1609#endif
1610
1611	*vpp = vp;
1612	return (0);
1613}
1614
1615/*
1616 * File handle to vnode
1617 *
1618 * Have to be really careful about stale file handles:
1619 * - check that the inode number is valid
1620 * - call ffs_vget() to get the locked inode
1621 * - check for an unallocated inode (i_mode == 0)
1622 * - check that the given client host has export rights and return
1623 *   those rights via. exflagsp and credanonp
1624 */
1625static int
1626ffs_fhtovp(mp, fhp, vpp)
1627	struct mount *mp;
1628	struct fid *fhp;
1629	struct vnode **vpp;
1630{
1631	struct ufid *ufhp;
1632	struct fs *fs;
1633
1634	ufhp = (struct ufid *)fhp;
1635	fs = VFSTOUFS(mp)->um_fs;
1636	if (ufhp->ufid_ino < ROOTINO ||
1637	    ufhp->ufid_ino >= fs->fs_ncg * fs->fs_ipg)
1638		return (ESTALE);
1639	return (ufs_fhtovp(mp, ufhp, vpp));
1640}
1641
/*
 * Initialize the filesystem: set up soft updates first, then the
 * generic UFS layer.  Returns the status of ufs_init().
 */
static int
ffs_init(struct vfsconf *vfsp)
{
	int ret;

	softdep_initialize();
	ret = ufs_init(vfsp);
	return (ret);
}
1653
/*
 * Undo the work of ffs_init(): tear down the generic UFS layer, then
 * soft updates.  Returns the status of ufs_uninit().
 */
static int
ffs_uninit(struct vfsconf *vfsp)
{
	const int ret = ufs_uninit(vfsp);

	softdep_uninitialize();
	return (ret);
}
1667
1668/*
1669 * Write a superblock and associated information back to disk.
1670 */
1671int
1672ffs_sbupdate(mp, waitfor, suspended)
1673	struct ufsmount *mp;
1674	int waitfor;
1675	int suspended;
1676{
1677	struct fs *fs = mp->um_fs;
1678	struct buf *sbbp;
1679	struct buf *bp;
1680	int blks;
1681	void *space;
1682	int i, size, error, allerror = 0;
1683
1684	if (fs->fs_ronly == 1 &&
1685	    (mp->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) !=
1686	    (MNT_RDONLY | MNT_UPDATE))
1687		panic("ffs_sbupdate: write read-only filesystem");
1688	/*
1689	 * We use the superblock's buf to serialize calls to ffs_sbupdate().
1690	 */
1691	sbbp = getblk(mp->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize,
1692	    0, 0, 0);
1693	/*
1694	 * First write back the summary information.
1695	 */
1696	blks = howmany(fs->fs_cssize, fs->fs_fsize);
1697	space = fs->fs_csp;
1698	for (i = 0; i < blks; i += fs->fs_frag) {
1699		size = fs->fs_bsize;
1700		if (i + fs->fs_frag > blks)
1701			size = (blks - i) * fs->fs_fsize;
1702		bp = getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i),
1703		    size, 0, 0, 0);
1704		bcopy(space, bp->b_data, (u_int)size);
1705		space = (char *)space + size;
1706		if (suspended)
1707			bp->b_flags |= B_VALIDSUSPWRT;
1708		if (waitfor != MNT_WAIT)
1709			bawrite(bp);
1710		else if ((error = bwrite(bp)) != 0)
1711			allerror = error;
1712	}
1713	/*
1714	 * Now write back the superblock itself. If any errors occurred
1715	 * up to this point, then fail so that the superblock avoids
1716	 * being written out as clean.
1717	 */
1718	if (allerror) {
1719		brelse(sbbp);
1720		return (allerror);
1721	}
1722	bp = sbbp;
1723	if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 &&
1724	    (fs->fs_flags & FS_FLAGS_UPDATED) == 0) {
1725		printf("%s: correcting fs_sblockloc from %jd to %d\n",
1726		    fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1);
1727		fs->fs_sblockloc = SBLOCK_UFS1;
1728	}
1729	if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 &&
1730	    (fs->fs_flags & FS_FLAGS_UPDATED) == 0) {
1731		printf("%s: correcting fs_sblockloc from %jd to %d\n",
1732		    fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2);
1733		fs->fs_sblockloc = SBLOCK_UFS2;
1734	}
1735	fs->fs_fmod = 0;
1736	fs->fs_time = time_second;
1737	if (fs->fs_flags & FS_DOSOFTDEP)
1738		softdep_setup_sbupdate(mp, (struct fs *)bp->b_data, bp);
1739	bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
1740	ffs_oldfscompat_write((struct fs *)bp->b_data, mp);
1741	if (suspended)
1742		bp->b_flags |= B_VALIDSUSPWRT;
1743	if (waitfor != MNT_WAIT)
1744		bawrite(bp);
1745	else if ((error = bwrite(bp)) != 0)
1746		allerror = error;
1747	return (allerror);
1748}
1749
/*
 * Extended attribute control: forwarded to the UFS implementation when
 * the kernel is built with UFS_EXTATTR, otherwise to the VFS default.
 * The callee's return value is passed back unchanged.
 */
static int
ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp,
	int attrnamespace, const char *attrname)
{
	int error;

#ifdef UFS_EXTATTR
	error = ufs_extattrctl(mp, cmd, filename_vp, attrnamespace,
	    attrname);
#else
	error = vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace,
	    attrname);
#endif
	return (error);
}
1763
1764static void
1765ffs_ifree(struct ufsmount *ump, struct inode *ip)
1766{
1767
1768	if (ump->um_fstype == UFS1 && ip->i_din1 != NULL)
1769		uma_zfree(uma_ufs1, ip->i_din1);
1770	else if (ip->i_din2 != NULL)
1771		uma_zfree(uma_ufs2, ip->i_din2);
1772	uma_zfree(uma_inode, ip);
1773}
1774
/*
 * Switch consulted by ffs_bufwrite(): when non-zero (the default),
 * eligible asynchronous writes are issued from a copy of the buffer.
 * Registered read-write under the _debug sysctl node.
 */
1775static int dobkgrdwrite = 1;
1776SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0,
1777    "Do background writes (honoring the BV_BKGRDWRITE flag)?");
1778
1779/*
1780 * Complete a background write started from bwrite.
 *
 * Installed by ffs_bufwrite() as the b_iodone handler of the copy it
 * writes.  bp is that copy; the original buffer is looked up in the
 * same bufobj by logical block number.  Panics if the original cannot
 * be found.
1781 */
1782static void
1783ffs_backgroundwritedone(struct buf *bp)
1784{
1785	struct bufobj *bufobj;
1786	struct buf *origbp;
1787
1788	/*
1789	 * Find the original buffer that we are writing.
1790	 */
1791	bufobj = bp->b_bufobj;
1792	BO_LOCK(bufobj);
1793	if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL)
1794		panic("backgroundwritedone: lost buffer");
1795	/* Grab an extra reference to be dropped by the bufdone() below. */
1796	bufobj_wrefl(bufobj);
1797	BO_UNLOCK(bufobj);
1798	/*
1799	 * Process dependencies then return any unfinished ones.
1800	 */
1801	if (!LIST_EMPTY(&bp->b_dep))
1802		buf_complete(bp);
1803#ifdef SOFTUPDATES
	/* Whatever is still attached to the copy goes back to the original. */
1804	if (!LIST_EMPTY(&bp->b_dep))
1805		softdep_move_dependencies(bp, origbp);
1806#endif
1807	/*
1808	 * This buffer is marked B_NOCACHE so when it is released
1809	 * by biodone it will be tossed.
1810	 */
1811	bp->b_flags |= B_NOCACHE;
1812	bp->b_flags &= ~B_CACHE;
1813	bufdone(bp);
	/* bp must not be touched past this point; only origbp is used. */
1814	BO_LOCK(bufobj);
1815	/*
1816	 * Clear the BV_BKGRDINPROG flag in the original buffer
1817	 * and awaken it if it is waiting for the write to complete.
1818	 * If BV_BKGRDINPROG is not set in the original buffer it must
1819	 * have been released and re-instantiated - which is not legal.
1820	 */
1821	KASSERT((origbp->b_vflags & BV_BKGRDINPROG),
1822	    ("backgroundwritedone: lost buffer2"));
1823	origbp->b_vflags &= ~BV_BKGRDINPROG;
	/* Matches the msleep() on &bp->b_xflags in ffs_bufwrite(). */
1824	if (origbp->b_vflags & BV_BKGRDWAIT) {
1825		origbp->b_vflags &= ~BV_BKGRDWAIT;
1826		wakeup(&origbp->b_xflags);
1827	}
1828	BO_UNLOCK(bufobj);
1829}
1830
1831
1832/*
1833 * Write, release buffer on completion.  (Done by iodone
1834 * if async).  Do not bother writing anything if the buffer
1835 * is invalid.
1836 *
1837 * Note that we set B_CACHE here, indicating that buffer is
1838 * fully valid and thus cacheable.  This is true even of NFS
1839 * now so we set it generally.  This could be set either here
1840 * or in biodone() since the I/O is synchronous.  We put it
1841 * here.
1842 */
1843static int
1844ffs_bufwrite(struct buf *bp)
1845{
1846	int oldflags, s;
1847	struct buf *newbp;
1848
1849	CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1850	if (bp->b_flags & B_INVAL) {
1851		brelse(bp);
1852		return (0);
1853	}
1854
1855	oldflags = bp->b_flags;
1856
1857	if (!BUF_ISLOCKED(bp))
1858		panic("bufwrite: buffer is not busy???");
1859	s = splbio();
1860	/*
1861	 * If a background write is already in progress, delay
1862	 * writing this block if it is asynchronous. Otherwise
1863	 * wait for the background write to complete.
1864	 */
1865	BO_LOCK(bp->b_bufobj);
1866	if (bp->b_vflags & BV_BKGRDINPROG) {
1867		if (bp->b_flags & B_ASYNC) {
1868			BO_UNLOCK(bp->b_bufobj);
1869			splx(s);
1870			bdwrite(bp);
1871			return (0);
1872		}
1873		bp->b_vflags |= BV_BKGRDWAIT;
1874		msleep(&bp->b_xflags, BO_MTX(bp->b_bufobj), PRIBIO, "bwrbg", 0);
1875		if (bp->b_vflags & BV_BKGRDINPROG)
1876			panic("bufwrite: still writing");
1877	}
1878	BO_UNLOCK(bp->b_bufobj);
1879
1880	/*
1881	 * If this buffer is marked for background writing and we
1882	 * do not have to wait for it, make a copy and write the
1883	 * copy so as to leave this buffer ready for further use.
1884	 *
1885	 * This optimization eats a lot of memory.  If we have a page
1886	 * or buffer shortfall we can't do it.
1887	 */
1888	if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) &&
1889	    (bp->b_flags & B_ASYNC) &&
1890	    !vm_page_count_severe() &&
1891	    !buf_dirty_count_severe()) {
1892		KASSERT(bp->b_iodone == NULL,
1893		    ("bufwrite: needs chained iodone (%p)", bp->b_iodone));
1894
1895		/* get a new block */
1896		newbp = geteblk(bp->b_bufsize, GB_NOWAIT_BD);
1897		if (newbp == NULL)
1898			goto normal_write;
1899
1900		/*
1901		 * set it to be identical to the old block.  We have to
1902		 * set b_lblkno and BKGRDMARKER before calling bgetvp()
1903		 * to avoid confusing the splay tree and gbincore().
1904		 */
1905		memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
1906		newbp->b_lblkno = bp->b_lblkno;
1907		newbp->b_xflags |= BX_BKGRDMARKER;
1908		BO_LOCK(bp->b_bufobj);
1909		bp->b_vflags |= BV_BKGRDINPROG;
1910		bgetvp(bp->b_vp, newbp);
1911		BO_UNLOCK(bp->b_bufobj);
1912		newbp->b_bufobj = &bp->b_vp->v_bufobj;
1913		newbp->b_blkno = bp->b_blkno;
1914		newbp->b_offset = bp->b_offset;
1915		newbp->b_iodone = ffs_backgroundwritedone;
1916		newbp->b_flags |= B_ASYNC;
1917		newbp->b_flags &= ~B_INVAL;
1918
1919#ifdef SOFTUPDATES
1920		/*
1921		 * Move over the dependencies.  If there are rollbacks,
1922		 * leave the parent buffer dirtied as it will need to
1923		 * be written again.
1924		 */
1925		if (LIST_EMPTY(&bp->b_dep) ||
1926		    softdep_move_dependencies(bp, newbp) == 0)
1927			bundirty(bp);
1928#else
1929		bundirty(bp);
1930#endif
1931
1932		/*
1933		 * Initiate write on the copy, release the original to
1934		 * the B_LOCKED queue so that it cannot go away until
1935		 * the background write completes. If not locked it could go
1936		 * away and then be reconstituted while it was being written.
1937		 * If the reconstituted buffer were written, we could end up
1938		 * with two background copies being written at the same time.
1939		 */
1940		bqrelse(bp);
1941		bp = newbp;
1942	} else
1943		/* Mark the buffer clean */
1944		bundirty(bp);
1945
1946
1947	/* Let the normal bufwrite do the rest for us */
1948normal_write:
1949	return (bufwrite(bp));
1950}
1951
1952
/*
 * Strategy routine that performs FFS-specific processing on write
 * requests and then hands the buffer to g_vfs_strategy().
 *
 * For writes it: panics on a write lacking B_VALIDSUSPWRT to a mount
 * marked MNTK_SUSPENDED; gives snapshots a chance to copy the old
 * contents via ffs_copyonwrite(); and, with SOFTUPDATES, calls
 * buf_start() for buffers that carry dependencies.  A copy-on-write
 * failure other than EOPNOTSUPP completes the buffer with an error
 * instead of issuing the I/O.
 */
1953static void
1954ffs_geom_strategy(struct bufobj *bo, struct buf *bp)
1955{
1956	struct vnode *vp;
1957	int error;
1958	struct buf *tbp;
1959	int nocopy;
1960
1961	vp = bo->__bo_vnode;
1962	if (bp->b_iocmd == BIO_WRITE) {
1963		if ((bp->b_flags & B_VALIDSUSPWRT) == 0 &&
1964		    bp->b_vp != NULL && bp->b_vp->v_mount != NULL &&
1965		    (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0)
1966			panic("ffs_geom_strategy: bad I/O");
		/* Remember B_NOCOPY, then strip both one-shot flags. */
1967		nocopy = bp->b_flags & B_NOCOPY;
1968		bp->b_flags &= ~(B_VALIDSUSPWRT | B_NOCOPY);
1969		if ((vp->v_vflag & VV_COPYONWRITE) && nocopy == 0 &&
1970		    vp->v_rdev->si_snapdata != NULL) {
1971			if ((bp->b_flags & B_CLUSTER) != 0) {
				/*
				 * Cluster write: each component buffer is
				 * offered to ffs_copyonwrite().  The running
				 * buffer space accounting is released first
				 * and re-established after the loop.
				 * NOTE(review): presumably because
				 * ffs_copyonwrite() may block -- confirm.
				 */
1972				runningbufwakeup(bp);
1973				TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head,
1974					      b_cluster.cluster_entry) {
1975					error = ffs_copyonwrite(vp, tbp);
1976					if (error != 0 &&
1977					    error != EOPNOTSUPP) {
1978						bp->b_error = error;
1979						bp->b_ioflags |= BIO_ERROR;
1980						bufdone(bp);
1981						return;
1982					}
1983				}
1984				bp->b_runningbufspace = bp->b_bufsize;
1985				atomic_add_long(&runningbufspace,
1986					       bp->b_runningbufspace);
1987			} else {
1988				error = ffs_copyonwrite(vp, bp);
1989				if (error != 0 && error != EOPNOTSUPP) {
1990					bp->b_error = error;
1991					bp->b_ioflags |= BIO_ERROR;
1992					bufdone(bp);
1993					return;
1994				}
1995			}
1996		}
1997#ifdef SOFTUPDATES
		/* Call buf_start() on every buffer that carries dependencies. */
1998		if ((bp->b_flags & B_CLUSTER) != 0) {
1999			TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head,
2000				      b_cluster.cluster_entry) {
2001				if (!LIST_EMPTY(&tbp->b_dep))
2002					buf_start(tbp);
2003			}
2004		} else {
2005			if (!LIST_EMPTY(&bp->b_dep))
2006				buf_start(bp);
2007		}
2008
2009#endif
2010	}
2011	g_vfs_strategy(bo, bp);
2012}
2013
2014#ifdef	DDB
2015
2016static void
2017db_print_ffs(struct ufsmount *ump)
2018{
2019	db_printf("mp %p %s devvp %p fs %p su_wl %d su_wl_in %d su_deps %d "
2020		  "su_req %d\n",
2021	    ump->um_mountp, ump->um_mountp->mnt_stat.f_mntonname,
2022	    ump->um_devvp, ump->um_fs, ump->softdep_on_worklist,
2023	    ump->softdep_on_worklist_inprogress, ump->softdep_deps,
2024	    ump->softdep_req);
2025}
2026
2027DB_SHOW_COMMAND(ffs, db_show_ffs)
2028{
2029	struct mount *mp;
2030	struct ufsmount *ump;
2031
2032	if (have_addr) {
2033		ump = VFSTOUFS((struct mount *)addr);
2034		db_print_ffs(ump);
2035		return;
2036	}
2037
2038	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2039		if (!strcmp(mp->mnt_stat.f_fstypename, ufs_vfsconf.vfc_name))
2040			db_print_ffs(VFSTOUFS(mp));
2041	}
2042}
2043
2044#endif	/* DDB */
2045