ffs_snapshot.c revision 107915
/*
 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
 *
 * Further information about snapshots can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
 * $FreeBSD: head/sys/ufs/ffs/ffs_snapshot.c 107915 2002-12-15 19:25:59Z mckusick $
 */
3662976Smckusick
3762976Smckusick#include <sys/param.h>
3898542Smckusick#include <sys/stdint.h>
39105191Smckusick#include <sys/kernel.h>
4062976Smckusick#include <sys/systm.h>
4173942Smckusick#include <sys/conf.h>
4262976Smckusick#include <sys/bio.h>
4362976Smckusick#include <sys/buf.h>
4462976Smckusick#include <sys/proc.h>
4562976Smckusick#include <sys/namei.h>
4662976Smckusick#include <sys/stat.h>
4762976Smckusick#include <sys/malloc.h>
4862976Smckusick#include <sys/mount.h>
4962976Smckusick#include <sys/resource.h>
5062976Smckusick#include <sys/resourcevar.h>
5162976Smckusick#include <sys/vnode.h>
5262976Smckusick
5362976Smckusick#include <ufs/ufs/extattr.h>
5462976Smckusick#include <ufs/ufs/quota.h>
5562976Smckusick#include <ufs/ufs/ufsmount.h>
5662976Smckusick#include <ufs/ufs/inode.h>
5762976Smckusick#include <ufs/ufs/ufs_extern.h>
5862976Smckusick
5962976Smckusick#include <ufs/ffs/fs.h>
6062976Smckusick#include <ufs/ffs/ffs_extern.h>
6162976Smckusick
6291420Sjhb#define KERNCRED thread0.td_ucred
6365998Sdes#define DEBUG 1
6462976Smckusick
6592728Salfredstatic int cgaccount(int, struct vnode *, struct buf *, int);
6698542Smckusickstatic int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
6798542Smckusick    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
6898542Smckusick    ufs_lbn_t, int), int);
6998542Smckusickstatic int indiracct_ufs1(struct vnode *, struct vnode *, int,
7098542Smckusick    ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
7198542Smckusick    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
7298542Smckusick    ufs_lbn_t, int), int);
7398542Smckusickstatic int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
7498542Smckusick    struct fs *, ufs_lbn_t, int);
7598542Smckusickstatic int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
7698542Smckusick    struct fs *, ufs_lbn_t, int);
7798542Smckusickstatic int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
7898542Smckusick    struct fs *, ufs_lbn_t, int);
7998542Smckusickstatic int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
8098542Smckusick    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
8198542Smckusick    ufs_lbn_t, int), int);
8298542Smckusickstatic int indiracct_ufs2(struct vnode *, struct vnode *, int,
8398542Smckusick    ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
8498542Smckusick    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
8598542Smckusick    ufs_lbn_t, int), int);
8698542Smckusickstatic int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
8798542Smckusick    struct fs *, ufs_lbn_t, int);
8898542Smckusickstatic int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
8998542Smckusick    struct fs *, ufs_lbn_t, int);
9098542Smckusickstatic int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
9198542Smckusick    struct fs *, ufs_lbn_t, int);
9292728Salfredstatic int ffs_copyonwrite(struct vnode *, struct buf *);
9398542Smckusickstatic int readblock(struct buf *, ufs2_daddr_t);
9462976Smckusick
/*
 * Keeping snapshots consistent across a crash requires every copied
 * block to be written synchronously before its original may be
 * modified. That carries a rather severe speed penalty, so this flag
 * allows crash persistence to be switched off (zero disables it).
 */
int dopersistence = 0;

#ifdef DEBUG
#include <sys/sysctl.h>

/* Non-zero: ffs_snapshot() prints each busy unlinked vnode it expunges. */
int snapdebug = 0;
/* Non-zero: ffs_snapshot() reports how long the filesystem was suspended. */
int collectsnapstats = 0;

SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW,
    &dopersistence, 0, "");
SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW,
    &snapdebug, 0, "");
SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW,
    &collectsnapstats, 0, "");
#endif /* DEBUG */
11362976Smckusick
11462976Smckusick/*
11562976Smckusick * Create a snapshot file and initialize it for the filesystem.
11662976Smckusick */
11762976Smckusickint
11862976Smckusickffs_snapshot(mp, snapfile)
11962976Smckusick	struct mount *mp;
12062976Smckusick	char *snapfile;
12162976Smckusick{
12298542Smckusick	ufs2_daddr_t numblks, blkno;
12398542Smckusick	int error, cg, snaploc;
12490098Smckusick	int i, size, len, loc;
12576269Smckusick	int flag = mp->mnt_flag;
12687827Smckusick	struct timespec starttime = {0, 0}, endtime;
12787827Smckusick	char saved_nice = 0;
128107848Smckusick	long redo = 0, snaplistsize;
12976269Smckusick	int32_t *lp;
13071073Siedowse	void *space;
131107848Smckusick	daddr_t *snapblklist;
13276269Smckusick	struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs;
13373942Smckusick	struct snaphead *snaphead;
13483366Sjulian	struct thread *td = curthread;
13573942Smckusick	struct inode *ip, *xp;
13676269Smckusick	struct buf *bp, *nbp, *ibp, *sbp = NULL;
13762976Smckusick	struct nameidata nd;
13862976Smckusick	struct mount *wrtmp;
13962976Smckusick	struct vattr vat;
140107414Smckusick	struct vnode *vp, *xvp, *nvp, *devvp;
141104698Smckusick	struct uio auio;
142104698Smckusick	struct iovec aiov;
14362976Smckusick
14462976Smckusick	/*
14562976Smckusick	 * Need to serialize access to snapshot code per filesystem.
14662976Smckusick	 */
14762976Smckusick	/*
14862976Smckusick	 * Assign a snapshot slot in the superblock.
14962976Smckusick	 */
15062976Smckusick	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
15162976Smckusick		if (fs->fs_snapinum[snaploc] == 0)
15262976Smckusick			break;
15362976Smckusick	if (snaploc == FSMAXSNAP)
15462976Smckusick		return (ENOSPC);
15562976Smckusick	/*
15662976Smckusick	 * Create the snapshot file.
15762976Smckusick	 */
15862976Smckusickrestart:
15983366Sjulian	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, td);
16062976Smckusick	if ((error = namei(&nd)) != 0)
16162976Smckusick		return (error);
16262976Smckusick	if (nd.ni_vp != NULL) {
16362976Smckusick		vput(nd.ni_vp);
16462976Smckusick		error = EEXIST;
16562976Smckusick	}
16662976Smckusick	if (nd.ni_dvp->v_mount != mp)
16762976Smckusick		error = EXDEV;
16862976Smckusick	if (error) {
16962976Smckusick		NDFREE(&nd, NDF_ONLY_PNBUF);
17062976Smckusick		if (nd.ni_dvp == nd.ni_vp)
17162976Smckusick			vrele(nd.ni_dvp);
17262976Smckusick		else
17362976Smckusick			vput(nd.ni_dvp);
17462976Smckusick		return (error);
17562976Smckusick	}
17662976Smckusick	VATTR_NULL(&vat);
17762976Smckusick	vat.va_type = VREG;
17862976Smckusick	vat.va_mode = S_IRUSR;
17962976Smckusick	vat.va_vaflags |= VA_EXCLUSIVE;
18062976Smckusick	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
18162976Smckusick		wrtmp = NULL;
18262976Smckusick	if (wrtmp != mp)
18362976Smckusick		panic("ffs_snapshot: mount mismatch");
18462985Smckusick	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
18562976Smckusick		NDFREE(&nd, NDF_ONLY_PNBUF);
18662976Smckusick		vput(nd.ni_dvp);
18762985Smckusick		if ((error = vn_start_write(NULL, &wrtmp,
18862985Smckusick		    V_XSLEEP | PCATCH)) != 0)
18962976Smckusick			return (error);
19062976Smckusick		goto restart;
19162976Smckusick	}
19283366Sjulian	VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE);
19362976Smckusick	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
19462976Smckusick	vput(nd.ni_dvp);
19562976Smckusick	if (error) {
19662976Smckusick		NDFREE(&nd, NDF_ONLY_PNBUF);
19762976Smckusick		vn_finished_write(wrtmp);
19862976Smckusick		return (error);
19962976Smckusick	}
20062976Smckusick	vp = nd.ni_vp;
20162976Smckusick	ip = VTOI(vp);
202107414Smckusick	devvp = ip->i_devvp;
20362976Smckusick	/*
20462976Smckusick	 * Allocate and copy the last block contents so as to be able
20562976Smckusick	 * to set size to that of the filesystem.
20662976Smckusick	 */
20762976Smckusick	numblks = howmany(fs->fs_size, fs->fs_frag);
20876132Sphk	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
20998658Sdillon	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
21062976Smckusick	if (error)
21162976Smckusick		goto out;
21262976Smckusick	ip->i_size = lblktosize(fs, (off_t)numblks);
21398542Smckusick	DIP(ip, i_size) = ip->i_size;
21462976Smckusick	ip->i_flag |= IN_CHANGE | IN_UPDATE;
21562976Smckusick	if ((error = readblock(bp, numblks - 1)) != 0)
21662976Smckusick		goto out;
21762976Smckusick	bawrite(bp);
21862976Smckusick	/*
21962976Smckusick	 * Preallocate critical data structures so that we can copy
22062976Smckusick	 * them in without further allocation after we suspend all
22162976Smckusick	 * operations on the filesystem. We would like to just release
22262976Smckusick	 * the allocated buffers without writing them since they will
22362976Smckusick	 * be filled in below once we are ready to go, but this upsets
22462976Smckusick	 * the soft update code, so we go ahead and write the new buffers.
22562976Smckusick	 *
22675993Smckusick	 * Allocate all indirect blocks and mark all of them as not
22775993Smckusick	 * needing to be copied.
22862976Smckusick	 */
22962976Smckusick	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
23076132Sphk		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
23198658Sdillon		    fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp);
23262976Smckusick		if (error)
23362976Smckusick			goto out;
234107406Smckusick		bawrite(ibp);
23562976Smckusick	}
23662976Smckusick	/*
23762976Smckusick	 * Allocate copies for the superblock and its summary information.
23862976Smckusick	 */
239107294Smckusick	error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
240107294Smckusick	    0, &nbp);
24176269Smckusick	if (error)
24262976Smckusick		goto out;
24362976Smckusick	bawrite(nbp);
24462976Smckusick	blkno = fragstoblks(fs, fs->fs_csaddr);
24562976Smckusick	len = howmany(fs->fs_cssize, fs->fs_bsize);
24662976Smckusick	for (loc = 0; loc < len; loc++) {
24776132Sphk		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
24862976Smckusick		    fs->fs_bsize, KERNCRED, 0, &nbp);
24962976Smckusick		if (error)
25062976Smckusick			goto out;
25162976Smckusick		bawrite(nbp);
25262976Smckusick	}
25362976Smckusick	/*
25487827Smckusick	 * Allocate all cylinder group blocks.
25587827Smckusick	 */
25687827Smckusick	for (cg = 0; cg < fs->fs_ncg; cg++) {
25787827Smckusick		error = UFS_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift,
25887827Smckusick		    fs->fs_bsize, KERNCRED, 0, &nbp);
25987827Smckusick		if (error)
26087827Smckusick			goto out;
261107406Smckusick		bawrite(nbp);
26287827Smckusick	}
26387827Smckusick	/*
26487827Smckusick	 * Copy all the cylinder group maps. Although the
26587827Smckusick	 * filesystem is still active, we hope that only a few
26687827Smckusick	 * cylinder groups will change between now and when we
26787827Smckusick	 * suspend operations. Thus, we will be able to quickly
26887827Smckusick	 * touch up the few cylinder groups that changed during
26987827Smckusick	 * the suspension period.
27087827Smckusick	 */
27189450Smckusick	len = howmany(fs->fs_ncg, NBBY);
27288138Smckusick	MALLOC(fs->fs_active, int *, len, M_DEVBUF, M_WAITOK);
27387827Smckusick	bzero(fs->fs_active, len);
27487827Smckusick	for (cg = 0; cg < fs->fs_ncg; cg++) {
275107558Smckusick		error = UFS_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift,
276107558Smckusick		    fs->fs_bsize, KERNCRED, 0, &nbp);
277107558Smckusick		if (error)
27887827Smckusick			goto out;
27987827Smckusick		error = cgaccount(cg, vp, nbp, 1);
28087827Smckusick		bawrite(nbp);
28187827Smckusick		if (error)
28287827Smckusick			goto out;
28387827Smckusick	}
28487827Smckusick	/*
28562976Smckusick	 * Change inode to snapshot type file.
28662976Smckusick	 */
28763897Smckusick	ip->i_flags |= SF_SNAPSHOT;
28898542Smckusick	DIP(ip, i_flags) = ip->i_flags;
28962976Smckusick	ip->i_flag |= IN_CHANGE | IN_UPDATE;
29062976Smckusick	/*
29162976Smckusick	 * Ensure that the snapshot is completely on disk.
292107406Smckusick	 * Since we have marked it as a snapshot it is safe to
293107406Smckusick	 * unlock it as no process will be allowed to write to it.
29462976Smckusick	 */
29583366Sjulian	if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td)) != 0)
29662976Smckusick		goto out;
297107406Smckusick	VOP_UNLOCK(vp, 0, td);
29862976Smckusick	/*
29962976Smckusick	 * All allocations are done, so we can now snapshot the system.
30062976Smckusick	 *
30187827Smckusick	 * Recind nice scheduling while running with the filesystem suspended.
30287827Smckusick	 */
30387827Smckusick	if (td->td_ksegrp->kg_nice > 0) {
30487827Smckusick		saved_nice = td->td_ksegrp->kg_nice;
30587827Smckusick		td->td_ksegrp->kg_nice = 0;
30687827Smckusick	}
30787827Smckusick	/*
30862976Smckusick	 * Suspend operation on filesystem.
30962976Smckusick	 */
31062976Smckusick	for (;;) {
31162976Smckusick		vn_finished_write(wrtmp);
312105902Smckusick		if ((error = vfs_write_suspend(vp->v_mount)) != 0) {
313105902Smckusick			vn_start_write(NULL, &wrtmp, V_WAIT);
314107406Smckusick			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
315105902Smckusick			goto out;
316105902Smckusick		}
31762976Smckusick		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
31862976Smckusick			break;
31962985Smckusick		vn_start_write(NULL, &wrtmp, V_WAIT);
32062976Smckusick	}
321107406Smckusick	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
32290098Smckusick	if (collectsnapstats)
32390098Smckusick		nanotime(&starttime);
32462976Smckusick	/*
32587827Smckusick	 * First, copy all the cylinder group maps that have changed.
32662976Smckusick	 */
32762976Smckusick	for (cg = 0; cg < fs->fs_ncg; cg++) {
32888138Smckusick		if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
32987827Smckusick			continue;
33087827Smckusick		redo++;
331107558Smckusick		error = UFS_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift,
332107558Smckusick		    fs->fs_bsize, KERNCRED, 0, &nbp);
333107558Smckusick		if (error)
33462976Smckusick			goto out1;
33587827Smckusick		error = cgaccount(cg, vp, nbp, 2);
33689450Smckusick		bawrite(nbp);
33787827Smckusick		if (error)
33862976Smckusick			goto out1;
33962976Smckusick	}
34062976Smckusick	/*
34176269Smckusick	 * Grab a copy of the superblock and its summary information.
34276269Smckusick	 * We delay writing it until the suspension is released below.
34376269Smckusick	 */
344107294Smckusick	error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize,
34598542Smckusick	    KERNCRED, &sbp);
34690098Smckusick	if (error) {
34790098Smckusick		brelse(sbp);
34890098Smckusick		sbp = NULL;
34976269Smckusick		goto out1;
35090098Smckusick	}
351107294Smckusick	loc = blkoff(fs, fs->fs_sblockloc);
35298542Smckusick	copy_fs = (struct fs *)(sbp->b_data + loc);
35376269Smckusick	bcopy(fs, copy_fs, fs->fs_sbsize);
35476269Smckusick	if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
35576269Smckusick		copy_fs->fs_clean = 1;
35698542Smckusick	if (fs->fs_sbsize < SBLOCKSIZE)
35798542Smckusick		bzero(&sbp->b_data[loc + fs->fs_sbsize],
35898542Smckusick		    SBLOCKSIZE - fs->fs_sbsize);
35976269Smckusick	size = blkroundup(fs, fs->fs_cssize);
36076269Smckusick	if (fs->fs_contigsumsize > 0)
36176269Smckusick		size += fs->fs_ncg * sizeof(int32_t);
36276269Smckusick	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
36376269Smckusick	copy_fs->fs_csp = space;
36476269Smckusick	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
36576269Smckusick	(char *)space += fs->fs_cssize;
36676269Smckusick	loc = howmany(fs->fs_cssize, fs->fs_fsize);
36776356Smckusick	i = fs->fs_frag - loc % fs->fs_frag;
36876356Smckusick	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
36976356Smckusick	if (len > 0) {
370107414Smckusick		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
37176269Smckusick		    len, KERNCRED, &bp)) != 0) {
37290098Smckusick			brelse(bp);
37376269Smckusick			free(copy_fs->fs_csp, M_UFSMNT);
37490098Smckusick			bawrite(sbp);
37590098Smckusick			sbp = NULL;
37676269Smckusick			goto out1;
37762976Smckusick		}
37876269Smckusick		bcopy(bp->b_data, space, (u_int)len);
37976269Smckusick		(char *)space += len;
38076269Smckusick		bp->b_flags |= B_INVAL | B_NOCACHE;
38176269Smckusick		brelse(bp);
38262976Smckusick	}
38376269Smckusick	if (fs->fs_contigsumsize > 0) {
38476269Smckusick		copy_fs->fs_maxcluster = lp = space;
38576269Smckusick		for (i = 0; i < fs->fs_ncg; i++)
38676269Smckusick			*lp++ = fs->fs_contigsumsize;
38776269Smckusick	}
38862976Smckusick	/*
38990098Smckusick	 * We must check for active files that have been unlinked
39090098Smckusick	 * (e.g., with a zero link count). We have to expunge all
39190098Smckusick	 * trace of these files from the snapshot so that they are
39290098Smckusick	 * not reclaimed prematurely by fsck or unnecessarily dumped.
39390098Smckusick	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
39490098Smckusick	 * spec_strategy about writing on a suspended filesystem.
395104698Smckusick	 * Note that we skip unlinked snapshot files as they will
396104698Smckusick	 * be handled separately below.
39790098Smckusick	 */
39890098Smckusick	mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
39990098Smckusick	mtx_lock(&mntvnode_mtx);
40090098Smckusickloop:
40190098Smckusick	for (xvp = TAILQ_FIRST(&mp->mnt_nvnodelist); xvp; xvp = nvp) {
40290098Smckusick		/*
40390098Smckusick		 * Make sure this vnode wasn't reclaimed in getnewvnode().
40490098Smckusick		 * Start over if it has (it won't be on the list anymore).
40590098Smckusick		 */
40690098Smckusick		if (xvp->v_mount != mp)
40790098Smckusick			goto loop;
40890098Smckusick		nvp = TAILQ_NEXT(xvp, v_nmntvnodes);
40990098Smckusick		mtx_unlock(&mntvnode_mtx);
410103945Sjeff		mp_fixme("Unlocked GETATTR.");
411103945Sjeff		if (vrefcnt(xvp) == 0 || xvp->v_type == VNON ||
412104698Smckusick		    (VTOI(xvp)->i_flags & SF_SNAPSHOT) ||
41390098Smckusick		    (VOP_GETATTR(xvp, &vat, td->td_proc->p_ucred, td) == 0 &&
41490098Smckusick		    vat.va_nlink > 0)) {
41590098Smckusick			mtx_lock(&mntvnode_mtx);
41690098Smckusick			continue;
41790098Smckusick		}
41890098Smckusick		if (snapdebug)
41990098Smckusick			vprint("ffs_snapshot: busy vnode", xvp);
420104688Sjeff		if (vn_lock(xvp, LK_EXCLUSIVE, td) != 0)
42190098Smckusick			goto loop;
42290098Smckusick		xp = VTOI(xvp);
42390098Smckusick		/*
42490098Smckusick		 * If there is a fragment, clear it here.
42590098Smckusick		 */
42690098Smckusick		blkno = 0;
42790098Smckusick		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
42890098Smckusick		if (loc < NDADDR) {
42990098Smckusick			len = fragroundup(fs, blkoff(fs, xp->i_size));
43090098Smckusick			if (len < fs->fs_bsize) {
43198542Smckusick				ffs_blkfree(copy_fs, vp, DIP(xp, i_db[loc]),
43298542Smckusick				    len, xp->i_number);
43398542Smckusick				blkno = DIP(xp, i_db[loc]);
43498542Smckusick				DIP(xp, i_db[loc]) = 0;
43590098Smckusick			}
43690098Smckusick		}
43798542Smckusick		if (xp->i_ump->um_fstype == UFS1)
43898542Smckusick			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
43998542Smckusick			    BLK_NOCOPY);
44098542Smckusick		else
44198542Smckusick			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
44298542Smckusick			    BLK_NOCOPY);
44390098Smckusick		if (blkno)
44498542Smckusick			DIP(xp, i_db[loc]) = blkno;
44590098Smckusick		if (!error)
44690098Smckusick			error = ffs_freefile(copy_fs, vp, xp->i_number,
44790098Smckusick			    xp->i_mode);
44890098Smckusick		VOP_UNLOCK(xvp, 0, td);
44990098Smckusick		if (error) {
45090098Smckusick			free(copy_fs->fs_csp, M_UFSMNT);
45190098Smckusick			bawrite(sbp);
45290098Smckusick			sbp = NULL;
45390098Smckusick			goto out1;
45490098Smckusick		}
45590098Smckusick		mtx_lock(&mntvnode_mtx);
45690098Smckusick	}
45790098Smckusick	mtx_unlock(&mntvnode_mtx);
45890098Smckusick	/*
459105191Smckusick	 * If there already exist snapshots on this filesystem, grab a
460105191Smckusick	 * reference to their shared lock. If this is the first snapshot
461105191Smckusick	 * on this filesystem, we need to allocate a lock for the snapshots
462105191Smckusick	 * to share. In either case, acquire the snapshot lock and give
463105191Smckusick	 * up our original private lock.
464105191Smckusick	 */
465107414Smckusick	VI_LOCK(devvp);
466107414Smckusick	snaphead = &devvp->v_rdev->si_snapshots;
467105191Smckusick	if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
468105191Smckusick		VI_LOCK(vp);
469105191Smckusick		vp->v_vnlock = ITOV(xp)->v_vnlock;
470107414Smckusick		VI_UNLOCK(devvp);
471105191Smckusick	} else {
472105191Smckusick		struct lock *lkp;
473105191Smckusick
474107414Smckusick		VI_UNLOCK(devvp);
475105191Smckusick		MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT,
476105191Smckusick		    M_WAITOK);
477105191Smckusick		lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
478105191Smckusick		    LK_CANRECURSE | LK_NOPAUSE);
479105191Smckusick		VI_LOCK(vp);
480105191Smckusick		vp->v_vnlock = lkp;
481105191Smckusick	}
482105191Smckusick	vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
483107414Smckusick	transferlockers(&vp->v_lock, vp->v_vnlock);
484107414Smckusick	lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
485105191Smckusick	/*
48662976Smckusick	 * Record snapshot inode. Since this is the newest snapshot,
48762976Smckusick	 * it must be placed at the end of the list.
48862976Smckusick	 */
489107414Smckusick	VI_LOCK(devvp);
49062976Smckusick	fs->fs_snapinum[snaploc] = ip->i_number;
49173942Smckusick	if (ip->i_nextsnap.tqe_prev != 0)
49262976Smckusick		panic("ffs_snapshot: %d already on list", ip->i_number);
49373942Smckusick	TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
494107414Smckusick	devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
495107414Smckusick	devvp->v_vflag |= VV_COPYONWRITE;
496107414Smckusick	VI_UNLOCK(devvp);
497101308Sjeff	ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
498101308Sjeff	vp->v_vflag |= VV_SYSTEM;
49987827Smckusickout1:
50062976Smckusick	/*
50162976Smckusick	 * Resume operation on filesystem.
50262976Smckusick	 */
50362976Smckusick	vfs_write_resume(vp->v_mount);
50462985Smckusick	vn_start_write(NULL, &wrtmp, V_WAIT);
50587827Smckusick	if (collectsnapstats && starttime.tv_sec > 0) {
50687827Smckusick		nanotime(&endtime);
50787827Smckusick		timespecsub(&endtime, &starttime);
508106965Speter		printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
509106965Speter		    vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec,
51087827Smckusick		    endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
51187827Smckusick	}
51290098Smckusick	if (sbp == NULL)
51390098Smckusick		goto out;
51490098Smckusick	/*
51590098Smckusick	 * Copy allocation information from all the snapshots in
51690098Smckusick	 * this snapshot and then expunge them from its view.
51790098Smckusick	 */
518107414Smckusick	snaphead = &devvp->v_rdev->si_snapshots;
51990098Smckusick	TAILQ_FOREACH(xp, snaphead, i_nextsnap) {
52090098Smckusick		if (xp == ip)
52190098Smckusick			break;
52298542Smckusick		if (xp->i_ump->um_fstype == UFS1)
52398542Smckusick			error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
52498542Smckusick			    BLK_SNAP);
52598542Smckusick		else
52698542Smckusick			error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
52798542Smckusick			    BLK_SNAP);
52898542Smckusick		if (error) {
52990098Smckusick			fs->fs_snapinum[snaploc] = 0;
53090098Smckusick			goto done;
53187827Smckusick		}
53290098Smckusick	}
53390098Smckusick	/*
534104698Smckusick	 * Allocate the space for the list of preallocated snapshot blocks.
535104698Smckusick	 */
536107848Smckusick	snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
537107848Smckusick	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
538107848Smckusick	MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t),
539104698Smckusick	    M_UFSMNT, M_WAITOK);
540107915Smckusick	ip->i_snapblklist = &snapblklist[1];
541104698Smckusick	/*
54290098Smckusick	 * Expunge the blocks used by the snapshots from the set of
543104698Smckusick	 * blocks marked as used in the snapshot bitmaps. Also, collect
544107915Smckusick	 * the list of allocated blocks in i_snapblklist.
54590098Smckusick	 */
54698542Smckusick	if (ip->i_ump->um_fstype == UFS1)
54798542Smckusick		error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP);
54898542Smckusick	else
54998542Smckusick		error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP);
55098542Smckusick	if (error) {
55190098Smckusick		fs->fs_snapinum[snaploc] = 0;
552107848Smckusick		FREE(snapblklist, M_UFSMNT);
55390098Smckusick		goto done;
55490098Smckusick	}
555107915Smckusick	snaplistsize = ip->i_snapblklist - snapblklist;
556107848Smckusick	snapblklist[0] = snaplistsize;
557107915Smckusick	ip->i_snapblklist = 0;
55890098Smckusick	/*
559104698Smckusick	 * Write out the list of allocated blocks to the end of the snapshot.
560104698Smckusick	 */
561104698Smckusick	auio.uio_iov = &aiov;
562104698Smckusick	auio.uio_iovcnt = 1;
563107848Smckusick	aiov.iov_base = (void *)snapblklist;
564107848Smckusick	aiov.iov_len = snaplistsize * sizeof(daddr_t);
565104698Smckusick	auio.uio_resid = aiov.iov_len;;
566104698Smckusick	auio.uio_offset = ip->i_size;
567104698Smckusick	auio.uio_segflg = UIO_SYSSPACE;
568104698Smckusick	auio.uio_rw = UIO_WRITE;
569104698Smckusick	auio.uio_td = td;
570104698Smckusick	if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
571104698Smckusick		fs->fs_snapinum[snaploc] = 0;
572107848Smckusick		FREE(snapblklist, M_UFSMNT);
573104698Smckusick		goto done;
574104698Smckusick	}
575104698Smckusick	/*
57690098Smckusick	 * Write the superblock and its summary information
57790098Smckusick	 * to the snapshot.
57890098Smckusick	 */
57990098Smckusick	blkno = fragstoblks(fs, fs->fs_csaddr);
58090098Smckusick	len = howmany(fs->fs_cssize, fs->fs_bsize);
58190098Smckusick	space = copy_fs->fs_csp;
58290098Smckusick	for (loc = 0; loc < len; loc++) {
58390098Smckusick		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
58490098Smckusick		if (error) {
58590098Smckusick			brelse(nbp);
58690098Smckusick			fs->fs_snapinum[snaploc] = 0;
587107848Smckusick			FREE(snapblklist, M_UFSMNT);
58890098Smckusick			goto done;
58976269Smckusick		}
59090098Smckusick		bcopy(space, nbp->b_data, fs->fs_bsize);
59190098Smckusick		space = (char *)space + fs->fs_bsize;
59290098Smckusick		bawrite(nbp);
59376269Smckusick	}
594107848Smckusick	/*
595107848Smckusick	 * As this is the newest list, it is the most inclusive, so
596107848Smckusick	 * should replace the previous list.
597107848Smckusick	 */
598107848Smckusick	VI_LOCK(devvp);
599107848Smckusick	FREE(devvp->v_rdev->si_snapblklist, M_UFSMNT);
600107848Smckusick	devvp->v_rdev->si_snapblklist = snapblklist;
601107848Smckusick	devvp->v_rdev->si_snaplistsize = snaplistsize;
602107848Smckusick	VI_UNLOCK(devvp);
60390098Smckusickdone:
60490098Smckusick	free(copy_fs->fs_csp, M_UFSMNT);
60590098Smckusick	bawrite(sbp);
60662976Smckusickout:
607105667Smckusick	if (saved_nice > 0)
608105667Smckusick		td->td_ksegrp->kg_nice = saved_nice;
60987827Smckusick	if (fs->fs_active != 0) {
61087827Smckusick		FREE(fs->fs_active, M_DEVBUF);
61187827Smckusick		fs->fs_active = 0;
61287827Smckusick	}
61362976Smckusick	mp->mnt_flag = flag;
61476269Smckusick	if (error)
61583366Sjulian		(void) UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
61683366Sjulian	(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
61762976Smckusick	if (error)
61862976Smckusick		vput(vp);
61962976Smckusick	else
62083366Sjulian		VOP_UNLOCK(vp, 0, td);
62162976Smckusick	vn_finished_write(wrtmp);
62262976Smckusick	return (error);
62362976Smckusick}
62462976Smckusick
62562976Smckusick/*
62687827Smckusick * Copy a cylinder group map. All the unallocated blocks are marked
62787827Smckusick * BLK_NOCOPY so that the snapshot knows that it need not copy them
62892363Smckusick * if they are later written. If passno is one, then this is a first
62992363Smckusick * pass, so only setting needs to be done. If passno is 2, then this
63087827Smckusick * is a revision to a previous pass which must be undone as the
63187827Smckusick * replacement pass is done.
63287827Smckusick */
63387827Smckusickstatic int
63487827Smckusickcgaccount(cg, vp, nbp, passno)
63587827Smckusick	int cg;
63687827Smckusick	struct vnode *vp;
63787827Smckusick	struct buf *nbp;
63887827Smckusick	int passno;
63987827Smckusick{
64087827Smckusick	struct buf *bp, *ibp;
64187827Smckusick	struct inode *ip;
64287827Smckusick	struct cg *cgp;
64387827Smckusick	struct fs *fs;
64498542Smckusick	ufs2_daddr_t base, numblks;
64598542Smckusick	int error, len, loc, indiroff;
64687827Smckusick
64787827Smckusick	ip = VTOI(vp);
64887827Smckusick	fs = ip->i_fs;
64987827Smckusick	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
65087827Smckusick		(int)fs->fs_cgsize, KERNCRED, &bp);
65187827Smckusick	if (error) {
65287827Smckusick		brelse(bp);
65387827Smckusick		return (error);
65487827Smckusick	}
65587827Smckusick	cgp = (struct cg *)bp->b_data;
65687827Smckusick	if (!cg_chkmagic(cgp)) {
65787827Smckusick		brelse(bp);
65887827Smckusick		return (EIO);
65987827Smckusick	}
66088138Smckusick	atomic_set_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg));
66187827Smckusick	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
66287827Smckusick	if (fs->fs_cgsize < fs->fs_bsize)
66387827Smckusick		bzero(&nbp->b_data[fs->fs_cgsize],
66487827Smckusick		    fs->fs_bsize - fs->fs_cgsize);
66587827Smckusick	if (passno == 2)
66687827Smckusick		nbp->b_flags |= B_VALIDSUSPWRT;
66787827Smckusick	numblks = howmany(fs->fs_size, fs->fs_frag);
66887827Smckusick	len = howmany(fs->fs_fpg, fs->fs_frag);
66987827Smckusick	base = cg * fs->fs_fpg / fs->fs_frag;
67087827Smckusick	if (base + len >= numblks)
67187827Smckusick		len = numblks - base - 1;
67287827Smckusick	loc = 0;
67387827Smckusick	if (base < NDADDR) {
67487827Smckusick		for ( ; loc < NDADDR; loc++) {
67587827Smckusick			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
67698542Smckusick				DIP(ip, i_db[loc]) = BLK_NOCOPY;
67798542Smckusick			else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY)
67898542Smckusick				DIP(ip, i_db[loc]) = 0;
67998542Smckusick			else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY)
68087827Smckusick				panic("ffs_snapshot: lost direct block");
68187827Smckusick		}
68287827Smckusick	}
68387827Smckusick	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
68498658Sdillon	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
68587827Smckusick	if (error) {
68687827Smckusick		brelse(bp);
68787827Smckusick		return (error);
68887827Smckusick	}
68987827Smckusick	indiroff = (base + loc - NDADDR) % NINDIR(fs);
69087827Smckusick	for ( ; loc < len; loc++, indiroff++) {
69187827Smckusick		if (indiroff >= NINDIR(fs)) {
69287827Smckusick			if (passno == 2)
69387827Smckusick				ibp->b_flags |= B_VALIDSUSPWRT;
69487827Smckusick			bawrite(ibp);
69587827Smckusick			error = UFS_BALLOC(vp,
69687827Smckusick			    lblktosize(fs, (off_t)(base + loc)),
69798658Sdillon			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
69887827Smckusick			if (error) {
69987827Smckusick				brelse(bp);
70087827Smckusick				return (error);
70187827Smckusick			}
70287827Smckusick			indiroff = 0;
70387827Smckusick		}
70498542Smckusick		if (ip->i_ump->um_fstype == UFS1) {
70598542Smckusick			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
70698542Smckusick				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
70798542Smckusick				    BLK_NOCOPY;
70898542Smckusick			else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
70998542Smckusick			    [indiroff] == BLK_NOCOPY)
71098542Smckusick				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
71198542Smckusick			else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
71298542Smckusick			    [indiroff] == BLK_NOCOPY)
71398542Smckusick				panic("ffs_snapshot: lost indirect block");
71498542Smckusick			continue;
71598542Smckusick		}
71687827Smckusick		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
71798542Smckusick			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
71887827Smckusick		else if (passno == 2 &&
71998542Smckusick		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
72098542Smckusick			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
72187827Smckusick		else if (passno == 1 &&
72298542Smckusick		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
72387827Smckusick			panic("ffs_snapshot: lost indirect block");
72487827Smckusick	}
72587827Smckusick	bqrelse(bp);
72687827Smckusick	if (passno == 2)
72787827Smckusick		ibp->b_flags |= B_VALIDSUSPWRT;
72887827Smckusick	bdwrite(ibp);
72987827Smckusick	return (0);
73087827Smckusick}
73187827Smckusick
73287827Smckusick/*
73376269Smckusick * Before expunging a snapshot inode, note all the
73476269Smckusick * blocks that it claims with BLK_SNAP so that fsck will
73576269Smckusick * be able to account for those blocks properly and so
73676269Smckusick * that this snapshot knows that it need not copy them
73798542Smckusick * if the other snapshot holding them is freed. This code
73898542Smckusick * is reproduced once each for UFS1 and UFS2.
73976269Smckusick */
74076269Smckusickstatic int
74198542Smckusickexpunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
74290098Smckusick	struct vnode *snapvp;
74390098Smckusick	struct inode *cancelip;
74476269Smckusick	struct fs *fs;
74598542Smckusick	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
74698542Smckusick	    struct fs *, ufs_lbn_t, int);
74790098Smckusick	int expungetype;
74876269Smckusick{
74998542Smckusick	int i, error, indiroff;
75098542Smckusick	ufs_lbn_t lbn, rlbn;
75198542Smckusick	ufs2_daddr_t len, blkno, numblks, blksperindir;
75298542Smckusick	struct ufs1_dinode *dip;
75390098Smckusick	struct thread *td = curthread;
75476269Smckusick	struct buf *bp;
75576269Smckusick
75676269Smckusick	/*
75790098Smckusick	 * Prepare to expunge the inode. If its inode block has not
75890098Smckusick	 * yet been copied, then allocate and fill the copy.
75976269Smckusick	 */
76090098Smckusick	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
76190098Smckusick	blkno = 0;
76290098Smckusick	if (lbn < NDADDR) {
763107558Smckusick		blkno = VTOI(snapvp)->i_din1->di_db[lbn];
76490098Smckusick	} else {
76590098Smckusick		td->td_proc->p_flag |= P_COWINPROGRESS;
76690098Smckusick		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
76798658Sdillon		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
76890098Smckusick		td->td_proc->p_flag &= ~P_COWINPROGRESS;
76990098Smckusick		if (error)
77090098Smckusick			return (error);
77190098Smckusick		indiroff = (lbn - NDADDR) % NINDIR(fs);
77298542Smckusick		blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
77390098Smckusick		bqrelse(bp);
77490098Smckusick	}
775107558Smckusick	if (blkno != 0) {
776107558Smckusick		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
777107558Smckusick			return (error);
778107558Smckusick	} else {
779107558Smckusick		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
780107558Smckusick		    fs->fs_bsize, KERNCRED, 0, &bp);
781107558Smckusick		if (error)
782107558Smckusick			return (error);
783107558Smckusick		if ((error = readblock(bp, lbn)) != 0)
784107558Smckusick			return (error);
785107558Smckusick	}
78690098Smckusick	/*
78790098Smckusick	 * Set a snapshot inode to be a zero length file, regular files
78890098Smckusick	 * to be completely unallocated.
78990098Smckusick	 */
79098542Smckusick	dip = (struct ufs1_dinode *)bp->b_data +
79198542Smckusick	    ino_to_fsbo(fs, cancelip->i_number);
79290098Smckusick	if (expungetype == BLK_NOCOPY)
79390098Smckusick		dip->di_mode = 0;
79476269Smckusick	dip->di_size = 0;
79576269Smckusick	dip->di_blocks = 0;
79676269Smckusick	dip->di_flags &= ~SF_SNAPSHOT;
79798542Smckusick	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
79876269Smckusick	bdwrite(bp);
799107848Smckusick	/*
800107848Smckusick	 * Now go through and expunge all the blocks in the file
801107848Smckusick	 * using the function requested.
802107848Smckusick	 */
803107848Smckusick	numblks = howmany(cancelip->i_size, fs->fs_bsize);
804107848Smckusick	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
805107848Smckusick	    &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype)))
806107848Smckusick		return (error);
807107848Smckusick	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
808107848Smckusick	    &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype)))
809107848Smckusick		return (error);
810107848Smckusick	blksperindir = 1;
811107848Smckusick	lbn = -NDADDR;
812107848Smckusick	len = numblks - NDADDR;
813107848Smckusick	rlbn = NDADDR;
814107848Smckusick	for (i = 0; len > 0 && i < NIADDR; i++) {
815107848Smckusick		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
816107848Smckusick		    cancelip->i_din1->di_ib[i], lbn, rlbn, len,
817107848Smckusick		    blksperindir, fs, acctfunc, expungetype);
818107848Smckusick		if (error)
819107848Smckusick			return (error);
820107848Smckusick		blksperindir *= NINDIR(fs);
821107848Smckusick		lbn -= blksperindir + 1;
822107848Smckusick		len -= blksperindir;
823107848Smckusick		rlbn += blksperindir;
824107848Smckusick	}
82576269Smckusick	return (0);
82676269Smckusick}
82776269Smckusick
82876269Smckusick/*
82962976Smckusick * Descend an indirect block chain for vnode cancelvp accounting for all
83062976Smckusick * its indirect blocks in snapvp.
83162976Smckusick */
83262976Smckusickstatic int
83398542Smckusickindiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
83498542Smckusick	    blksperindir, fs, acctfunc, expungetype)
83562976Smckusick	struct vnode *snapvp;
83662976Smckusick	struct vnode *cancelvp;
83762976Smckusick	int level;
83898542Smckusick	ufs1_daddr_t blkno;
83998542Smckusick	ufs_lbn_t lbn;
84098542Smckusick	ufs_lbn_t rlbn;
84198542Smckusick	ufs_lbn_t remblks;
84298542Smckusick	ufs_lbn_t blksperindir;
84376269Smckusick	struct fs *fs;
84498542Smckusick	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
84598542Smckusick	    struct fs *, ufs_lbn_t, int);
84690098Smckusick	int expungetype;
84762976Smckusick{
84898542Smckusick	int error, num, i;
84998542Smckusick	ufs_lbn_t subblksperindir;
85062976Smckusick	struct indir indirs[NIADDR + 2];
85198542Smckusick	ufs1_daddr_t last, *bap;
85262976Smckusick	struct buf *bp;
85362976Smckusick
85462976Smckusick	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
85562976Smckusick		return (error);
85662976Smckusick	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
85762976Smckusick		panic("indiracct: botched params");
85862976Smckusick	/*
85962976Smckusick	 * We have to expand bread here since it will deadlock looking
86062976Smckusick	 * up the block number for any blocks that are not in the cache.
86162976Smckusick	 */
86262976Smckusick	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
86362976Smckusick	bp->b_blkno = fsbtodb(fs, blkno);
86462976Smckusick	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
86562976Smckusick	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
86662976Smckusick		brelse(bp);
86762976Smckusick		return (error);
86862976Smckusick	}
86962976Smckusick	/*
87062976Smckusick	 * Account for the block pointers in this indirect block.
87162976Smckusick	 */
87262976Smckusick	last = howmany(remblks, blksperindir);
87362976Smckusick	if (last > NINDIR(fs))
87462976Smckusick		last = NINDIR(fs);
87598542Smckusick	MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
87676269Smckusick	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
87776269Smckusick	bqrelse(bp);
878107848Smckusick	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
879107848Smckusick	    level == 0 ? rlbn : -1, expungetype);
88062976Smckusick	if (error || level == 0)
88162976Smckusick		goto out;
88262976Smckusick	/*
88362976Smckusick	 * Account for the block pointers in each of the indirect blocks
88462976Smckusick	 * in the levels below us.
88562976Smckusick	 */
88662976Smckusick	subblksperindir = blksperindir / NINDIR(fs);
88762976Smckusick	for (lbn++, level--, i = 0; i < last; i++) {
88898542Smckusick		error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
88990098Smckusick		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
89062976Smckusick		if (error)
89162976Smckusick			goto out;
89262976Smckusick		rlbn += blksperindir;
89362976Smckusick		lbn -= blksperindir;
89462976Smckusick		remblks -= blksperindir;
89562976Smckusick	}
89662976Smckusickout:
89776269Smckusick	FREE(bap, M_DEVBUF);
89862976Smckusick	return (error);
89962976Smckusick}
90062976Smckusick
90162976Smckusick/*
90290098Smckusick * Do both snap accounting and map accounting.
90390098Smckusick */
90490098Smckusickstatic int
90598542Smckusickfullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
90690098Smckusick	struct vnode *vp;
90798542Smckusick	ufs1_daddr_t *oldblkp, *lastblkp;
90890098Smckusick	struct fs *fs;
90998542Smckusick	ufs_lbn_t lblkno;
91098542Smckusick	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
91198542Smckusick{
91298542Smckusick	int error;
91398542Smckusick
91498542Smckusick	if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
91598542Smckusick		return (error);
91698542Smckusick	return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
91798542Smckusick}
91898542Smckusick
91998542Smckusick/*
92098542Smckusick * Identify a set of blocks allocated in a snapshot inode.
92198542Smckusick */
92298542Smckusickstatic int
92398542Smckusicksnapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
92498542Smckusick	struct vnode *vp;
92598542Smckusick	ufs1_daddr_t *oldblkp, *lastblkp;
92698542Smckusick	struct fs *fs;
92798542Smckusick	ufs_lbn_t lblkno;
92890098Smckusick	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
92990098Smckusick{
93098542Smckusick	struct inode *ip = VTOI(vp);
93198542Smckusick	ufs1_daddr_t blkno, *blkp;
93298542Smckusick	ufs_lbn_t lbn;
93398542Smckusick	struct buf *ibp;
93490098Smckusick	int error;
93590098Smckusick
93698542Smckusick	for ( ; oldblkp < lastblkp; oldblkp++) {
93798542Smckusick		blkno = *oldblkp;
93898542Smckusick		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
93998542Smckusick			continue;
94098542Smckusick		lbn = fragstoblks(fs, blkno);
94198542Smckusick		if (lbn < NDADDR) {
94298542Smckusick			blkp = &ip->i_din1->di_db[lbn];
94398542Smckusick			ip->i_flag |= IN_CHANGE | IN_UPDATE;
94498542Smckusick		} else {
94598542Smckusick			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
94698658Sdillon			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
94798542Smckusick			if (error)
94898542Smckusick				return (error);
94998542Smckusick			blkp = &((ufs1_daddr_t *)(ibp->b_data))
95098542Smckusick			    [(lbn - NDADDR) % NINDIR(fs)];
95198542Smckusick		}
95298542Smckusick		/*
95398542Smckusick		 * If we are expunging a snapshot vnode and we
95498542Smckusick		 * find a block marked BLK_NOCOPY, then it is
95598542Smckusick		 * one that has been allocated to this snapshot after
95698542Smckusick		 * we took our current snapshot and can be ignored.
95798542Smckusick		 */
95898542Smckusick		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
95998542Smckusick			if (lbn >= NDADDR)
96098542Smckusick				brelse(ibp);
96198542Smckusick		} else {
96298542Smckusick			if (*blkp != 0)
96398542Smckusick				panic("snapacct: bad block");
96498542Smckusick			*blkp = expungetype;
96598542Smckusick			if (lbn >= NDADDR)
96698542Smckusick				bdwrite(ibp);
96798542Smckusick		}
96898542Smckusick	}
96998542Smckusick	return (0);
97098542Smckusick}
97198542Smckusick
97298542Smckusick/*
97398542Smckusick * Account for a set of blocks allocated in a snapshot inode.
97498542Smckusick */
97598542Smckusickstatic int
97698542Smckusickmapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
97798542Smckusick	struct vnode *vp;
97898542Smckusick	ufs1_daddr_t *oldblkp, *lastblkp;
97998542Smckusick	struct fs *fs;
98098542Smckusick	ufs_lbn_t lblkno;
98198542Smckusick	int expungetype;
98298542Smckusick{
98398542Smckusick	ufs1_daddr_t blkno;
984104698Smckusick	struct inode *ip;
98598542Smckusick	ino_t inum;
98698542Smckusick
987107848Smckusick	/*
988107848Smckusick	 * We only care about the leaf block numbers, not the
989107848Smckusick	 * meta-block numbers.
990107848Smckusick	 */
991107848Smckusick	if (lblkno == -1)
992107848Smckusick		return (0);
993104698Smckusick	ip = VTOI(vp);
994104698Smckusick	inum = ip->i_number;
99598542Smckusick	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
99698542Smckusick		blkno = *oldblkp;
99798542Smckusick		if (blkno == 0 || blkno == BLK_NOCOPY)
99898542Smckusick			continue;
999104698Smckusick		if (expungetype == BLK_SNAP && blkno != BLK_SNAP)
1000107915Smckusick			*ip->i_snapblklist++ = lblkno;
100198542Smckusick		if (blkno == BLK_SNAP)
100298542Smckusick			blkno = blkstofrags(fs, lblkno);
100398542Smckusick		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
100498542Smckusick	}
100598542Smckusick	return (0);
100698542Smckusick}
100798542Smckusick
100898542Smckusick/*
100998542Smckusick * Before expunging a snapshot inode, note all the
101098542Smckusick * blocks that it claims with BLK_SNAP so that fsck will
101198542Smckusick * be able to account for those blocks properly and so
101298542Smckusick * that this snapshot knows that it need not copy them
101398542Smckusick * if the other snapshot holding them is freed. This code
101498542Smckusick * is reproduced once each for UFS1 and UFS2.
101598542Smckusick */
101698542Smckusickstatic int
101798542Smckusickexpunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
101898542Smckusick	struct vnode *snapvp;
101998542Smckusick	struct inode *cancelip;
102098542Smckusick	struct fs *fs;
102198542Smckusick	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
102298542Smckusick	    struct fs *, ufs_lbn_t, int);
102398542Smckusick	int expungetype;
102498542Smckusick{
102598542Smckusick	int i, error, indiroff;
102698542Smckusick	ufs_lbn_t lbn, rlbn;
102798542Smckusick	ufs2_daddr_t len, blkno, numblks, blksperindir;
102898542Smckusick	struct ufs2_dinode *dip;
102998542Smckusick	struct thread *td = curthread;
103098542Smckusick	struct buf *bp;
103198542Smckusick
103298542Smckusick	/*
103398542Smckusick	 * Prepare to expunge the inode. If its inode block has not
103498542Smckusick	 * yet been copied, then allocate and fill the copy.
103598542Smckusick	 */
103698542Smckusick	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
103798542Smckusick	blkno = 0;
103898542Smckusick	if (lbn < NDADDR) {
1039107558Smckusick		blkno = VTOI(snapvp)->i_din2->di_db[lbn];
104098542Smckusick	} else {
104198542Smckusick		td->td_proc->p_flag |= P_COWINPROGRESS;
104298542Smckusick		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
104398658Sdillon		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
104498542Smckusick		td->td_proc->p_flag &= ~P_COWINPROGRESS;
104598542Smckusick		if (error)
104698542Smckusick			return (error);
104798542Smckusick		indiroff = (lbn - NDADDR) % NINDIR(fs);
104898542Smckusick		blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
104998542Smckusick		bqrelse(bp);
105098542Smckusick	}
1051107558Smckusick	if (blkno != 0) {
1052107558Smckusick		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
1053107558Smckusick			return (error);
1054107558Smckusick	} else {
1055107558Smckusick		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
1056107558Smckusick		    fs->fs_bsize, KERNCRED, 0, &bp);
1057107558Smckusick		if (error)
1058107558Smckusick			return (error);
1059107558Smckusick		if ((error = readblock(bp, lbn)) != 0)
1060107558Smckusick			return (error);
1061107558Smckusick	}
106298542Smckusick	/*
106398542Smckusick	 * Set a snapshot inode to be a zero length file, regular files
106498542Smckusick	 * to be completely unallocated.
106598542Smckusick	 */
106698542Smckusick	dip = (struct ufs2_dinode *)bp->b_data +
106798542Smckusick	    ino_to_fsbo(fs, cancelip->i_number);
106898542Smckusick	if (expungetype == BLK_NOCOPY)
106998542Smckusick		dip->di_mode = 0;
107098542Smckusick	dip->di_size = 0;
107198542Smckusick	dip->di_blocks = 0;
107298542Smckusick	dip->di_flags &= ~SF_SNAPSHOT;
107398542Smckusick	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
107498542Smckusick	bdwrite(bp);
1075107848Smckusick	/*
1076107848Smckusick	 * Now go through and expunge all the blocks in the file
1077107848Smckusick	 * using the function requested.
1078107848Smckusick	 */
1079107848Smckusick	numblks = howmany(cancelip->i_size, fs->fs_bsize);
1080107848Smckusick	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
1081107848Smckusick	    &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype)))
1082107848Smckusick		return (error);
1083107848Smckusick	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0],
1084107848Smckusick	    &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype)))
1085107848Smckusick		return (error);
1086107848Smckusick	blksperindir = 1;
1087107848Smckusick	lbn = -NDADDR;
1088107848Smckusick	len = numblks - NDADDR;
1089107848Smckusick	rlbn = NDADDR;
1090107848Smckusick	for (i = 0; len > 0 && i < NIADDR; i++) {
1091107848Smckusick		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
1092107848Smckusick		    cancelip->i_din2->di_ib[i], lbn, rlbn, len,
1093107848Smckusick		    blksperindir, fs, acctfunc, expungetype);
1094107848Smckusick		if (error)
1095107848Smckusick			return (error);
1096107848Smckusick		blksperindir *= NINDIR(fs);
1097107848Smckusick		lbn -= blksperindir + 1;
1098107848Smckusick		len -= blksperindir;
1099107848Smckusick		rlbn += blksperindir;
1100107848Smckusick	}
110198542Smckusick	return (0);
110290098Smckusick}
110390098Smckusick
110490098Smckusick/*
110598542Smckusick * Descend an indirect block chain for vnode cancelvp accounting for all
110698542Smckusick * its indirect blocks in snapvp.
110798542Smckusick */
110898542Smckusickstatic int
110998542Smckusickindiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
111098542Smckusick	    blksperindir, fs, acctfunc, expungetype)
111198542Smckusick	struct vnode *snapvp;
111298542Smckusick	struct vnode *cancelvp;
111398542Smckusick	int level;
111498542Smckusick	ufs2_daddr_t blkno;
111598542Smckusick	ufs_lbn_t lbn;
111698542Smckusick	ufs_lbn_t rlbn;
111798542Smckusick	ufs_lbn_t remblks;
111898542Smckusick	ufs_lbn_t blksperindir;
111998542Smckusick	struct fs *fs;
112098542Smckusick	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
112198542Smckusick	    struct fs *, ufs_lbn_t, int);
112298542Smckusick	int expungetype;
112398542Smckusick{
112498542Smckusick	int error, num, i;
112598542Smckusick	ufs_lbn_t subblksperindir;
112698542Smckusick	struct indir indirs[NIADDR + 2];
112798542Smckusick	ufs2_daddr_t last, *bap;
112898542Smckusick	struct buf *bp;
112998542Smckusick
113098542Smckusick	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
113198542Smckusick		return (error);
113298542Smckusick	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
113398542Smckusick		panic("indiracct: botched params");
113498542Smckusick	/*
113598542Smckusick	 * We have to expand bread here since it will deadlock looking
113698542Smckusick	 * up the block number for any blocks that are not in the cache.
113798542Smckusick	 */
113898542Smckusick	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
113998542Smckusick	bp->b_blkno = fsbtodb(fs, blkno);
114098542Smckusick	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
114198542Smckusick	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
114298542Smckusick		brelse(bp);
114398542Smckusick		return (error);
114498542Smckusick	}
114598542Smckusick	/*
114698542Smckusick	 * Account for the block pointers in this indirect block.
114798542Smckusick	 */
114898542Smckusick	last = howmany(remblks, blksperindir);
114998542Smckusick	if (last > NINDIR(fs))
115098542Smckusick		last = NINDIR(fs);
115198542Smckusick	MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
115298542Smckusick	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
115398542Smckusick	bqrelse(bp);
1154107848Smckusick	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
1155107848Smckusick	    level == 0 ? rlbn : -1, expungetype);
115698542Smckusick	if (error || level == 0)
115798542Smckusick		goto out;
115898542Smckusick	/*
115998542Smckusick	 * Account for the block pointers in each of the indirect blocks
116098542Smckusick	 * in the levels below us.
116198542Smckusick	 */
116298542Smckusick	subblksperindir = blksperindir / NINDIR(fs);
116398542Smckusick	for (lbn++, level--, i = 0; i < last; i++) {
116498542Smckusick		error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
116598542Smckusick		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
116698542Smckusick		if (error)
116798542Smckusick			goto out;
116898542Smckusick		rlbn += blksperindir;
116998542Smckusick		lbn -= blksperindir;
117098542Smckusick		remblks -= blksperindir;
117198542Smckusick	}
117298542Smckusickout:
117398542Smckusick	FREE(bap, M_DEVBUF);
117498542Smckusick	return (error);
117598542Smckusick}
117698542Smckusick
117798542Smckusick/*
117898542Smckusick * Do both snap accounting and map accounting.
117998542Smckusick */
118098542Smckusickstatic int
118198542Smckusickfullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
118298542Smckusick	struct vnode *vp;
118398542Smckusick	ufs2_daddr_t *oldblkp, *lastblkp;
118498542Smckusick	struct fs *fs;
118598542Smckusick	ufs_lbn_t lblkno;
118698542Smckusick	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
118798542Smckusick{
118898542Smckusick	int error;
118998542Smckusick
119098542Smckusick	if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
119198542Smckusick		return (error);
119298542Smckusick	return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
119398542Smckusick}
119498542Smckusick
119598542Smckusick/*
119687827Smckusick * Identify a set of blocks allocated in a snapshot inode.
119762976Smckusick */
119862976Smckusickstatic int
119998542Smckusicksnapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
120062976Smckusick	struct vnode *vp;
120198542Smckusick	ufs2_daddr_t *oldblkp, *lastblkp;
120276269Smckusick	struct fs *fs;
120398542Smckusick	ufs_lbn_t lblkno;
120490098Smckusick	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
120562976Smckusick{
120662976Smckusick	struct inode *ip = VTOI(vp);
120798542Smckusick	ufs2_daddr_t blkno, *blkp;
120898542Smckusick	ufs_lbn_t lbn;
120962976Smckusick	struct buf *ibp;
121062976Smckusick	int error;
121162976Smckusick
121262976Smckusick	for ( ; oldblkp < lastblkp; oldblkp++) {
121362976Smckusick		blkno = *oldblkp;
121462976Smckusick		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
121562976Smckusick			continue;
121662976Smckusick		lbn = fragstoblks(fs, blkno);
121762976Smckusick		if (lbn < NDADDR) {
121898542Smckusick			blkp = &ip->i_din2->di_db[lbn];
121962976Smckusick			ip->i_flag |= IN_CHANGE | IN_UPDATE;
122062976Smckusick		} else {
122176132Sphk			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
122298658Sdillon			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
122362976Smckusick			if (error)
122462976Smckusick				return (error);
122598542Smckusick			blkp = &((ufs2_daddr_t *)(ibp->b_data))
122662976Smckusick			    [(lbn - NDADDR) % NINDIR(fs)];
122762976Smckusick		}
122887827Smckusick		/*
122990098Smckusick		 * If we are expunging a snapshot vnode and we
123090098Smckusick		 * find a block marked BLK_NOCOPY, then it is
123187827Smckusick		 * one that has been allocated to this snapshot after
123287827Smckusick		 * we took our current snapshot and can be ignored.
123387827Smckusick		 */
123490098Smckusick		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
123587827Smckusick			if (lbn >= NDADDR)
123687827Smckusick				brelse(ibp);
123787827Smckusick		} else {
123887827Smckusick			if (*blkp != 0)
123987827Smckusick				panic("snapacct: bad block");
124090098Smckusick			*blkp = expungetype;
124187827Smckusick			if (lbn >= NDADDR)
124287827Smckusick				bdwrite(ibp);
124363788Smckusick		}
124462976Smckusick	}
124562976Smckusick	return (0);
124662976Smckusick}
124762976Smckusick
124862976Smckusick/*
124976269Smckusick * Account for a set of blocks allocated in a snapshot inode.
125076269Smckusick */
125176269Smckusickstatic int
125298542Smckusickmapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
125376269Smckusick	struct vnode *vp;
125498542Smckusick	ufs2_daddr_t *oldblkp, *lastblkp;
125576269Smckusick	struct fs *fs;
125698542Smckusick	ufs_lbn_t lblkno;
125790098Smckusick	int expungetype;
125876269Smckusick{
125998542Smckusick	ufs2_daddr_t blkno;
1260104698Smckusick	struct inode *ip;
126190098Smckusick	ino_t inum;
126276269Smckusick
1263107848Smckusick	/*
1264107848Smckusick	 * We only care about the leaf block numbers, not the
1265107848Smckusick	 * meta-block numbers.
1266107848Smckusick	 */
1267107848Smckusick	if (lblkno == -1)
1268107848Smckusick		return (0);
1269104698Smckusick	ip = VTOI(vp);
1270104698Smckusick	inum = ip->i_number;
127176269Smckusick	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
127276269Smckusick		blkno = *oldblkp;
127376269Smckusick		if (blkno == 0 || blkno == BLK_NOCOPY)
127476269Smckusick			continue;
1275104698Smckusick		if (expungetype == BLK_SNAP && blkno != BLK_SNAP)
1276107915Smckusick			*ip->i_snapblklist++ = lblkno;
127776269Smckusick		if (blkno == BLK_SNAP)
127876269Smckusick			blkno = blkstofrags(fs, lblkno);
127990098Smckusick		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
128076269Smckusick	}
128176269Smckusick	return (0);
128276269Smckusick}
128376269Smckusick
128476269Smckusick/*
128570183Smckusick * Decrement extra reference on snapshot when last name is removed.
128670183Smckusick * It will not be freed until the last open reference goes away.
128770183Smckusick */
128870183Smckusickvoid
128970183Smckusickffs_snapgone(ip)
129070183Smckusick	struct inode *ip;
129170183Smckusick{
129270183Smckusick	struct inode *xp;
129374547Smckusick	struct fs *fs;
129474547Smckusick	int snaploc;
129570183Smckusick
129670183Smckusick	/*
129770183Smckusick	 * Find snapshot in incore list.
129870183Smckusick	 */
129973942Smckusick	TAILQ_FOREACH(xp, &ip->i_devvp->v_rdev->si_snapshots, i_nextsnap)
130073942Smckusick		if (xp == ip)
130170183Smckusick			break;
1302107848Smckusick	if (xp != NULL)
1303107848Smckusick		vrele(ITOV(ip));
1304107848Smckusick	else if (snapdebug)
130570183Smckusick		printf("ffs_snapgone: lost snapshot vnode %d\n",
130670183Smckusick		    ip->i_number);
130774547Smckusick	/*
130874547Smckusick	 * Delete snapshot inode from superblock. Keep list dense.
130974547Smckusick	 */
131074547Smckusick	fs = ip->i_fs;
131174547Smckusick	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
131274547Smckusick		if (fs->fs_snapinum[snaploc] == ip->i_number)
131374547Smckusick			break;
131474547Smckusick	if (snaploc < FSMAXSNAP) {
131574547Smckusick		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
131674547Smckusick			if (fs->fs_snapinum[snaploc] == 0)
131774547Smckusick				break;
131874547Smckusick			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
131974547Smckusick		}
132074547Smckusick		fs->fs_snapinum[snaploc - 1] = 0;
132174547Smckusick	}
132270183Smckusick}
132370183Smckusick
132470183Smckusick/*
132562976Smckusick * Prepare a snapshot file for being removed.
132662976Smckusick */
132762976Smckusickvoid
132862976Smckusickffs_snapremove(vp)
132962976Smckusick	struct vnode *vp;
133062976Smckusick{
133173942Smckusick	struct inode *ip;
133262976Smckusick	struct vnode *devvp;
1333105191Smckusick	struct lock *lkp;
133462976Smckusick	struct buf *ibp;
133562976Smckusick	struct fs *fs;
1336105191Smckusick	struct thread *td = curthread;
1337107848Smckusick	ufs2_daddr_t numblks, blkno, dblk, *snapblklist;
133898542Smckusick	int error, loc, last;
133962976Smckusick
134062976Smckusick	ip = VTOI(vp);
134162976Smckusick	fs = ip->i_fs;
1342107414Smckusick	devvp = ip->i_devvp;
134362976Smckusick	/*
134475943Smckusick	 * If active, delete from incore list (this snapshot may
134575943Smckusick	 * already have been in the process of being deleted, so
134675943Smckusick	 * would not have been active).
134775943Smckusick	 *
134862976Smckusick	 * Clear copy-on-write flag if last snapshot.
134962976Smckusick	 */
135075943Smckusick	if (ip->i_nextsnap.tqe_prev != 0) {
1351107414Smckusick		VI_LOCK(devvp);
1352107414Smckusick		lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE,
1353107414Smckusick		    VI_MTX(devvp), td);
1354107414Smckusick		VI_LOCK(devvp);
1355107414Smckusick		TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap);
1356107414Smckusick		ip->i_nextsnap.tqe_prev = 0;
1357105191Smckusick		lkp = vp->v_vnlock;
1358105191Smckusick		vp->v_vnlock = &vp->v_lock;
1359107414Smckusick		lockmgr(lkp, LK_RELEASE, NULL, td);
1360107414Smckusick		if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) != 0) {
1361107414Smckusick			VI_UNLOCK(devvp);
1362107414Smckusick		} else {
1363107848Smckusick			snapblklist = devvp->v_rdev->si_snapblklist;
1364107848Smckusick			devvp->v_rdev->si_snapblklist = 0;
1365107848Smckusick			devvp->v_rdev->si_snaplistsize = 0;
1366107414Smckusick			devvp->v_rdev->si_copyonwrite = 0;
1367107414Smckusick			devvp->v_vflag &= ~VV_COPYONWRITE;
1368107414Smckusick			lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td);
1369107414Smckusick			lockmgr(lkp, LK_RELEASE, NULL, td);
1370105191Smckusick			lockdestroy(lkp);
1371105191Smckusick			FREE(lkp, M_UFSMNT);
1372107848Smckusick			FREE(snapblklist, M_UFSMNT);
137373942Smckusick		}
137473942Smckusick	}
137562976Smckusick	/*
137662976Smckusick	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
137762976Smckusick	 * snapshots that want them (see ffs_snapblkfree below).
137862976Smckusick	 */
137962976Smckusick	for (blkno = 1; blkno < NDADDR; blkno++) {
138098542Smckusick		dblk = DIP(ip, i_db[blkno]);
138176356Smckusick		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
138298542Smckusick			DIP(ip, i_db[blkno]) = 0;
138376356Smckusick		else if ((dblk == blkstofrags(fs, blkno) &&
138490098Smckusick		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
138590098Smckusick		     ip->i_number))) {
138698542Smckusick			DIP(ip, i_blocks) -= btodb(fs->fs_bsize);
138798542Smckusick			DIP(ip, i_db[blkno]) = 0;
138876356Smckusick		}
138962976Smckusick	}
139076356Smckusick	numblks = howmany(ip->i_size, fs->fs_bsize);
139176356Smckusick	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
139276132Sphk		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
139398658Sdillon		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
139462976Smckusick		if (error)
139562976Smckusick			continue;
139698542Smckusick		if (fs->fs_size - blkno > NINDIR(fs))
139762976Smckusick			last = NINDIR(fs);
139898542Smckusick		else
139998542Smckusick			last = fs->fs_size - blkno;
140062976Smckusick		for (loc = 0; loc < last; loc++) {
140198542Smckusick			if (ip->i_ump->um_fstype == UFS1) {
140298542Smckusick				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
140398542Smckusick				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
140498542Smckusick					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
140598542Smckusick				else if ((dblk == blkstofrags(fs, blkno) &&
140698542Smckusick				     ffs_snapblkfree(fs, ip->i_devvp, dblk,
140798542Smckusick				     fs->fs_bsize, ip->i_number))) {
140898542Smckusick					ip->i_din1->di_blocks -=
140998542Smckusick					    btodb(fs->fs_bsize);
141098542Smckusick					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
141198542Smckusick				}
141298542Smckusick				continue;
141398542Smckusick			}
141498542Smckusick			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
141576356Smckusick			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
141698542Smckusick				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
141776356Smckusick			else if ((dblk == blkstofrags(fs, blkno) &&
141890098Smckusick			     ffs_snapblkfree(fs, ip->i_devvp, dblk,
141990098Smckusick			     fs->fs_bsize, ip->i_number))) {
142098542Smckusick				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
142198542Smckusick				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
142276356Smckusick			}
142362976Smckusick		}
142462976Smckusick		bawrite(ibp);
142562976Smckusick	}
142662976Smckusick	/*
142762976Smckusick	 * Clear snapshot flag and drop reference.
142862976Smckusick	 */
142963897Smckusick	ip->i_flags &= ~SF_SNAPSHOT;
143098542Smckusick	DIP(ip, i_flags) = ip->i_flags;
143162976Smckusick	ip->i_flag |= IN_CHANGE | IN_UPDATE;
143262976Smckusick}
143362976Smckusick
143462976Smckusick/*
143562976Smckusick * Notification that a block is being freed. Return zero if the free
143662976Smckusick * should be allowed to proceed. Return non-zero if the snapshot file
143762976Smckusick * wants to claim the block. The block will be claimed if it is an
143862976Smckusick * uncopied part of one of the snapshots. It will be freed if it is
143962976Smckusick * either a BLK_NOCOPY or has already been copied in all of the snapshots.
144062976Smckusick * If a fragment is being freed, then all snapshots that care about
144162976Smckusick * it must make a copy since a snapshot file can only claim full sized
144262976Smckusick * blocks. Note that if more than one snapshot file maps the block,
144362976Smckusick * we can pick one at random to claim it. Since none of the snapshots
144462976Smckusick * can change, we are assured that they will all see the same unmodified
144562976Smckusick * image. When deleting a snapshot file (see ffs_snapremove above), we
144662976Smckusick * must push any of these claimed blocks to one of the other snapshots
144762976Smckusick * that maps it. These claimed blocks are easily identified as they will
144862976Smckusick * have a block number equal to their logical block number within the
144962976Smckusick * snapshot. A copied block can never have this property because they
145062976Smckusick * must always have been allocated from a BLK_NOCOPY location.
 *
 * Arguments:
 *	fs	superblock used for the block/fragment arithmetic
 *	devvp	device vnode; its v_rdev->si_snapshots list is walked
 *	bno	block number being freed; converted to the logical block
 *		"lbn" with fragstoblks()
 *	size	number of bytes being freed; only size == fs->fs_bsize is
 *		claimed outright, any other size is copied
 *	inum	inode number giving up the block (used only by the DEBUG
 *		printfs)
 *
 * Returns 1 as soon as one snapshot has claimed a full sized block.
 * Otherwise returns "error": zero, or the failure reported by
 * UFS_BALLOC() or readblock().
145162976Smckusick */
145262976Smckusickint
145390098Smckusickffs_snapblkfree(fs, devvp, bno, size, inum)
145490098Smckusick	struct fs *fs;
145590098Smckusick	struct vnode *devvp;
145698542Smckusick	ufs2_daddr_t bno;
145762976Smckusick	long size;
145890098Smckusick	ino_t inum;
145962976Smckusick{
146062976Smckusick	struct buf *ibp, *cbp, *savedcbp = 0;
146183366Sjulian	struct thread *td = curthread;
146262976Smckusick	struct inode *ip;
1463107414Smckusick	struct vnode *vp = NULL;
146498542Smckusick	ufs_lbn_t lbn;
146598542Smckusick	ufs2_daddr_t blkno;
1466107414Smckusick	int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0;
146773942Smckusick	struct snaphead *snaphead;
146862976Smckusick
146962976Smckusick	lbn = fragstoblks(fs, bno);
	/*
	 * The snapshot list is walked with the devvp interlock held until
	 * the snapshot lock has been obtained (snapshot_locked != 0). The
	 * interlock is handed to lockmgr() through LK_INTERLOCK. Every
	 * "goto retry" below is taken with snapshot_locked still zero and
	 * restarts the walk from the head of the list.
	 */
1470107414Smckusickretry:
1471107414Smckusick	VI_LOCK(devvp);
147290098Smckusick	snaphead = &devvp->v_rdev->si_snapshots;
147373942Smckusick	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
147462976Smckusick		vp = ITOV(ip);
147562976Smckusick		/*
147662976Smckusick		 * Lookup block being written.
147762976Smckusick		 */
147862976Smckusick		if (lbn < NDADDR) {
147998542Smckusick			blkno = DIP(ip, i_db[lbn]);
148062976Smckusick		} else {
			/*
			 * The entry lives in an indirect block, which is
			 * fetched with UFS_BALLOC(BA_METAONLY) under the
			 * snapshot lock. P_COWINPROGRESS is set around the
			 * call; ffs_copyonwrite() panics if it is entered
			 * with that flag set.
			 */
1481107414Smckusick			if (snapshot_locked == 0 &&
1482107848Smckusick			    lockmgr(vp->v_vnlock,
1483107414Smckusick			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
1484107414Smckusick			      VI_MTX(devvp), td) != 0)
1485107414Smckusick				goto retry;
1486107848Smckusick			snapshot_locked = 1;
148783366Sjulian			td->td_proc->p_flag |= P_COWINPROGRESS;
148876132Sphk			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
148998658Sdillon			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
149083366Sjulian			td->td_proc->p_flag &= ~P_COWINPROGRESS;
149162976Smckusick			if (error)
149262976Smckusick				break;
149362976Smckusick			indiroff = (lbn - NDADDR) % NINDIR(fs);
149498542Smckusick			if (ip->i_ump->um_fstype == UFS1)
149598542Smckusick				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
149698542Smckusick			else
149798542Smckusick				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
149862976Smckusick		}
149962976Smckusick		/*
150062976Smckusick		 * Check to see if block needs to be copied.
150162976Smckusick		 */
150298542Smckusick		if (blkno == 0) {
150398542Smckusick			/*
150498542Smckusick			 * A block that we map is being freed. If it has not
150598542Smckusick			 * been claimed yet, we will claim or copy it (below).
150698542Smckusick			 */
150798542Smckusick			claimedblk = 1;
150898542Smckusick		} else if (blkno == BLK_SNAP) {
150998542Smckusick			/*
151098542Smckusick			 * No previous snapshot claimed the block,
1511107414Smckusick			 * so it will be freed and become a BLK_NOCOPY
151298542Smckusick			 * (don't care) for us.
151398542Smckusick			 */
151462976Smckusick			if (claimedblk)
151562976Smckusick				panic("snapblkfree: inconsistent block type");
1516107414Smckusick			if (snapshot_locked == 0 &&
1517107414Smckusick			    lockmgr(vp->v_vnlock,
1518107414Smckusick			      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
1519107414Smckusick			      VI_MTX(devvp), td) != 0) {
				/*
				 * The non-blocking attempt failed: give back
				 * the indirect block buffer, call vn_lock()
				 * with LK_SLEEPFAIL and rescan the list.
				 *
				 * NOTE(review): snapshot_locked is still 0
				 * when we jump to retry, so this presumably
				 * relies on LK_SLEEPFAIL never returning with
				 * the lock held here -- confirm against
				 * lockmgr(9).
				 */
1520107414Smckusick				if (lbn >= NDADDR)
1521107414Smckusick					bqrelse(ibp);
1522107414Smckusick				vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
1523107414Smckusick				goto retry;
1524107414Smckusick			}
1525107414Smckusick			snapshot_locked = 1;
152662976Smckusick			if (lbn < NDADDR) {
152798542Smckusick				DIP(ip, i_db[lbn]) = BLK_NOCOPY;
152862976Smckusick				ip->i_flag |= IN_CHANGE | IN_UPDATE;
152998542Smckusick			} else if (ip->i_ump->um_fstype == UFS1) {
153098542Smckusick				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
153198542Smckusick				    BLK_NOCOPY;
153298542Smckusick				bdwrite(ibp);
153362976Smckusick			} else {
153498542Smckusick				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
153562976Smckusick				    BLK_NOCOPY;
153662976Smckusick				bdwrite(ibp);
153762976Smckusick			}
153862976Smckusick			continue;
153998542Smckusick		} else /* BLK_NOCOPY or default */ {
154098542Smckusick			/*
154198542Smckusick			 * If the snapshot has already copied the block
154298542Smckusick			 * (default), or does not care about the block,
154398542Smckusick			 * it is not needed.
154498542Smckusick			 */
154598542Smckusick			if (lbn >= NDADDR)
154698542Smckusick				bqrelse(ibp);
154798542Smckusick			continue;
154862976Smckusick		}
154962976Smckusick		/*
155062976Smckusick		 * If this is a full size block, we will just grab it
155162976Smckusick		 * and assign it to the snapshot inode. Otherwise we
155262976Smckusick		 * will proceed to copy it. See explanation for this
155362976Smckusick		 * routine as to why only a single snapshot needs to
155462976Smckusick		 * claim this block.
155562976Smckusick		 */
1556107414Smckusick		if (snapshot_locked == 0 &&
1557107414Smckusick		    lockmgr(vp->v_vnlock,
1558107414Smckusick		      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
1559107414Smckusick		      VI_MTX(devvp), td) != 0) {
			/*
			 * Same release/vn_lock()/rescan sequence as in the
			 * BLK_SNAP case above.
			 */
1560107414Smckusick			if (lbn >= NDADDR)
1561107414Smckusick				bqrelse(ibp);
1562107414Smckusick			vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
1563107414Smckusick			goto retry;
1564107414Smckusick		}
1565107414Smckusick		snapshot_locked = 1;
156662976Smckusick		if (size == fs->fs_bsize) {
156762976Smckusick#ifdef DEBUG
156862976Smckusick			if (snapdebug)
156998687Smux				printf("%s %d lbn %jd from inum %d\n",
157098542Smckusick				    "Grabonremove: snapino", ip->i_number,
157198542Smckusick				    (intmax_t)lbn, inum);
157262976Smckusick#endif
			/*
			 * Claim the block: record bno in the snapshot's
			 * block map, charge it to the snapshot inode and
			 * return 1 after dropping the snapshot lock.
			 */
157362976Smckusick			if (lbn < NDADDR) {
157498542Smckusick				DIP(ip, i_db[lbn]) = bno;
157598542Smckusick			} else if (ip->i_ump->um_fstype == UFS1) {
157698542Smckusick				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
157798542Smckusick				bdwrite(ibp);
157862976Smckusick			} else {
157998542Smckusick				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
158062976Smckusick				bdwrite(ibp);
158162976Smckusick			}
158298542Smckusick			DIP(ip, i_blocks) += btodb(size);
158362976Smckusick			ip->i_flag |= IN_CHANGE | IN_UPDATE;
158483366Sjulian			VOP_UNLOCK(vp, 0, td);
158562976Smckusick			return (1);
158662976Smckusick		}
158762976Smckusick		if (lbn >= NDADDR)
158863788Smckusick			bqrelse(ibp);
158962976Smckusick		/*
159062976Smckusick		 * Allocate the block into which to do the copy. Note that this
159162976Smckusick		 * allocation will never require any additional allocations for
159262976Smckusick		 * the snapshot inode.
159362976Smckusick		 */
159483366Sjulian		td->td_proc->p_flag |= P_COWINPROGRESS;
159576132Sphk		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
159662976Smckusick		    fs->fs_bsize, KERNCRED, 0, &cbp);
159783366Sjulian		td->td_proc->p_flag &= ~P_COWINPROGRESS;
1598107414Smckusick		if (error)
159962976Smckusick			break;
160062976Smckusick#ifdef DEBUG
160162976Smckusick		if (snapdebug)
160298687Smux			printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n",
160398542Smckusick			    "Copyonremove: snapino ", ip->i_number,
160498542Smckusick			    (intmax_t)lbn, "for inum", inum, size,
160598542Smckusick			    (intmax_t)cbp->b_blkno);
160662976Smckusick#endif
160762976Smckusick		/*
160862976Smckusick		 * If we have already read the old block contents, then
160975943Smckusick		 * simply copy them to the new block. Note that we need
161075943Smckusick		 * to synchronously write snapshots that have not been
161175943Smckusick		 * unlinked, and hence will be visible after a crash,
161275943Smckusick		 * to ensure their integrity.
161362976Smckusick		 */
161462976Smckusick		if (savedcbp != 0) {
161562976Smckusick			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
161662976Smckusick			bawrite(cbp);
161776580Smckusick			if (dopersistence && ip->i_effnlink > 0)
161883366Sjulian				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
161962976Smckusick			continue;
162062976Smckusick		}
162162976Smckusick		/*
162262976Smckusick		 * Otherwise, read the old block contents into the buffer.
162362976Smckusick		 */
162475943Smckusick		if ((error = readblock(cbp, lbn)) != 0) {
			/*
			 * The read failed: write out a zero filled copy
			 * block and stop, returning the read error.
			 */
162575943Smckusick			bzero(cbp->b_data, fs->fs_bsize);
162675943Smckusick			bawrite(cbp);
162776580Smckusick			if (dopersistence && ip->i_effnlink > 0)
162883366Sjulian				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
162962976Smckusick			break;
163075943Smckusick		}
		/*
		 * Keep the first successfully read copy; later snapshots
		 * bcopy() from it and it is written out after the loop.
		 */
163162976Smckusick		savedcbp = cbp;
163262976Smckusick	}
163375943Smckusick	/*
163475943Smckusick	 * Note that we need to synchronously write snapshots that
163575943Smckusick	 * have not been unlinked, and hence will be visible after
163675943Smckusick	 * a crash, to ensure their integrity.
163775943Smckusick	 */
163875943Smckusick	if (savedcbp) {
163975943Smckusick		vp = savedcbp->b_vp;
164062976Smckusick		bawrite(savedcbp);
1641107414Smckusick		if (dopersistence && VTOI(vp)->i_effnlink > 0)
164283366Sjulian			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
164375943Smckusick	}
164462976Smckusick	/*
164562976Smckusick	 * If we have been unable to allocate a block in which to do
164662976Smckusick	 * the copy, then return non-zero so that the fragment will
164762976Smckusick	 * not be freed. Although space will be lost, the snapshot
164862976Smckusick	 * will stay consistent.
164962976Smckusick	 */
	/*
	 * Release whichever lock is still held: the snapshot lock if it
	 * was taken during the walk, otherwise the devvp interlock.
	 */
1650107414Smckusick	if (snapshot_locked)
1651107414Smckusick		VOP_UNLOCK(vp, 0, td);
1652107414Smckusick	else
1653107414Smckusick		VI_UNLOCK(devvp);
165462976Smckusick	return (error);
165562976Smckusick}
165662976Smckusick
165762976Smckusick/*
165862976Smckusick * Associate snapshot files when mounting.
 *
 * Each inode number recorded in fs->fs_snapinum[] is fetched with
 * VFS_VGET(). An inode that lacks SF_SNAPSHOT, or whose size matches the
 * "old format snapshot" test, is removed from the array (old format ones
 * are truncated and synced first). Every remaining snapshot has its
 * vnode lock switched to the lock shared by all snapshots, is appended to
 * the device's si_snapshots list and is flagged VV_SYSTEM.
 *
 * Afterwards the block hints list is read from the vnode left in "vp" and
 * installed on the device together with the ffs_copyonwrite hook and the
 * VV_COPYONWRITE flag. Read failures are reported with printf() and end
 * the function without installing the hook.
165962976Smckusick */
166062976Smckusickvoid
166162976Smckusickffs_snapshot_mount(mp)
166262976Smckusick	struct mount *mp;
166362976Smckusick{
166462976Smckusick	struct ufsmount *ump = VFSTOUFS(mp);
1665107414Smckusick	struct vnode *devvp = ump->um_devvp;
166662976Smckusick	struct fs *fs = ump->um_fs;
166783366Sjulian	struct thread *td = curthread;
166873942Smckusick	struct snaphead *snaphead;
166962976Smckusick	struct vnode *vp;
1670105191Smckusick	struct inode *ip, *xp;
1671104698Smckusick	struct uio auio;
1672104698Smckusick	struct iovec aiov;
1673107848Smckusick	void *snapblklist;
1674104698Smckusick	char *reason;
1675107848Smckusick	daddr_t snaplistsize;
167662976Smckusick	int error, snaploc, loc;
167762976Smckusick
1678104698Smckusick	/*
1679104698Smckusick	 * XXX The following needs to be set before UFS_TRUNCATE or
1680104698Smckusick	 * VOP_READ can be called.
1681104698Smckusick	 */
1682104698Smckusick	mp->mnt_stat.f_iosize = fs->fs_bsize;
1683104698Smckusick	/*
1684104698Smckusick	 * Process each snapshot listed in the superblock.
1685104698Smckusick	 */
1686107848Smckusick	vp = NULL;
1687107414Smckusick	snaphead = &devvp->v_rdev->si_snapshots;
168862976Smckusick	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
168962976Smckusick		if (fs->fs_snapinum[snaploc] == 0)
1690107848Smckusick			break;
		/*
		 * NOTE(review): on failure the slot is left in place and
		 * "vp" holds whatever VFS_VGET() stored in it -- confirm
		 * that this cannot leave a stale vnode for the code after
		 * the loop.
		 */
169192462Smckusick		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
169292462Smckusick		    LK_EXCLUSIVE, &vp)) != 0){
169362976Smckusick			printf("ffs_snapshot_mount: vget failed %d\n", error);
169462976Smckusick			continue;
169562976Smckusick		}
169662976Smckusick		ip = VTOI(vp);
1697104698Smckusick		if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size ==
1698104698Smckusick		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
1699104698Smckusick			if ((ip->i_flags & SF_SNAPSHOT) == 0) {
1700104698Smckusick				reason = "non-snapshot";
1701104698Smckusick			} else {
1702104698Smckusick				reason = "old format snapshot";
1703104698Smckusick				(void)UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
1704104698Smckusick				(void)VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
1705104698Smckusick			}
1706104698Smckusick			printf("ffs_snapshot_mount: %s inode %d\n",
1707104698Smckusick			    reason, fs->fs_snapinum[snaploc]);
170862976Smckusick			vput(vp);
			/*
			 * NOTE(review): "vp" is cleared here, so when the
			 * last entry examined is rejected the function
			 * returns at "No usable snapshots found" below even
			 * if earlier entries were linked onto the list --
			 * verify that this is intended.
			 */
1709107848Smckusick			vp = NULL;
			/*
			 * Close the gap: shift the following entries down
			 * one slot, zero the vacated tail slot, and step
			 * snaploc back so the shifted entry is examined
			 * on the next iteration.
			 */
171062976Smckusick			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
171162976Smckusick				if (fs->fs_snapinum[loc] == 0)
171262976Smckusick					break;
171362976Smckusick				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
171462976Smckusick			}
171562976Smckusick			fs->fs_snapinum[loc - 1] = 0;
171662976Smckusick			snaploc--;
171762976Smckusick			continue;
171862976Smckusick		}
1719104698Smckusick		/*
1720105191Smckusick		 * If there already exist snapshots on this filesystem, grab a
1721105191Smckusick		 * reference to their shared lock. If this is the first snapshot
1722105191Smckusick		 * on this filesystem, we need to allocate a lock for the
1723105191Smckusick		 * snapshots to share. In either case, acquire the snapshot
1724105191Smckusick		 * lock and give up our original private lock.
1725105191Smckusick		 */
1726107414Smckusick		VI_LOCK(devvp);
1727105191Smckusick		if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
1728105191Smckusick			VI_LOCK(vp);
1729105191Smckusick			vp->v_vnlock = ITOV(xp)->v_vnlock;
1730107414Smckusick			VI_UNLOCK(devvp);
1731105191Smckusick		} else {
1732105191Smckusick			struct lock *lkp;
1733105191Smckusick
			/*
			 * The devvp interlock is dropped before the
			 * M_WAITOK allocation.
			 */
1734107414Smckusick			VI_UNLOCK(devvp);
1735105191Smckusick			MALLOC(lkp, struct lock *, sizeof(struct lock),
1736105191Smckusick			    M_UFSMNT, M_WAITOK);
1737105191Smckusick			lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
1738105191Smckusick			    LK_CANRECURSE | LK_NOPAUSE);
1739105191Smckusick			VI_LOCK(vp);
1740105191Smckusick			vp->v_vnlock = lkp;
1741105191Smckusick		}
		/*
		 * Both branches arrive here holding the vnode interlock,
		 * which vn_lock() is given via LK_INTERLOCK. Waiters on the
		 * private v_lock are then moved to the new lock before the
		 * private lock is released.
		 */
1742105191Smckusick		vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
1743107414Smckusick		transferlockers(&vp->v_lock, vp->v_vnlock);
1744107414Smckusick		lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
1745105191Smckusick		/*
1746104698Smckusick		 * Link it onto the active snapshot list.
1747104698Smckusick		 */
1748107414Smckusick		VI_LOCK(devvp);
174973942Smckusick		if (ip->i_nextsnap.tqe_prev != 0)
175062976Smckusick			panic("ffs_snapshot_mount: %d already on list",
175162976Smckusick			    ip->i_number);
175273942Smckusick		else
175373942Smckusick			TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
1754101308Sjeff		vp->v_vflag |= VV_SYSTEM;
1755107414Smckusick		VI_UNLOCK(devvp);
175683366Sjulian		VOP_UNLOCK(vp, 0, td);
175762976Smckusick	}
1758107848Smckusick	/*
1759107848Smckusick	 * No usable snapshots found.
1760107848Smckusick	 */
1761107848Smckusick	if (vp == NULL)
1762107848Smckusick		return;
1763107848Smckusick	/*
1764107848Smckusick	 * Allocate the space for the block hints list. We always want to
1765107848Smckusick	 * use the list from the newest snapshot.
1766107848Smckusick	 */
	/*
	 * First read: fetch the single daddr_t "snaplistsize" stored at
	 * byte offset lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)).
	 */
1767107848Smckusick	auio.uio_iov = &aiov;
1768107848Smckusick	auio.uio_iovcnt = 1;
1769107848Smckusick	aiov.iov_base = (void *)&snaplistsize;
1770107848Smckusick	aiov.iov_len = sizeof(snaplistsize);
1771107848Smckusick	auio.uio_resid = aiov.iov_len;
1772107848Smckusick	auio.uio_offset =
1773107848Smckusick	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
1774107848Smckusick	auio.uio_segflg = UIO_SYSSPACE;
1775107848Smckusick	auio.uio_rw = UIO_READ;
1776107848Smckusick	auio.uio_td = td;
1777107848Smckusick	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1778107848Smckusick	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
1779107848Smckusick		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
1780107848Smckusick		VOP_UNLOCK(vp, 0, td);
1781107848Smckusick		return;
1782107848Smckusick	}
	/*
	 * NOTE(review): snaplistsize comes straight from the read above
	 * and is used as an element count for the allocation and for the
	 * second read without any range check -- consider validating it.
	 */
1783107848Smckusick	MALLOC(snapblklist, void *, snaplistsize * sizeof(daddr_t),
1784107848Smckusick	    M_UFSMNT, M_WAITOK);
1785107848Smckusick	auio.uio_iovcnt = 1;
1786107848Smckusick	aiov.iov_base = snapblklist;
1787107848Smckusick	aiov.iov_len = snaplistsize * sizeof (daddr_t);
1788107848Smckusick	auio.uio_resid = aiov.iov_len;
	/*
	 * Second read: presumably VOP_READ advanced uio_offset past the
	 * size word, so stepping back by sizeof(snaplistsize) makes the
	 * list read start at the size word itself (TODO confirm).
	 */
1789107848Smckusick	auio.uio_offset -= sizeof(snaplistsize);
1790107848Smckusick	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
1791107848Smckusick		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
1792107848Smckusick		VOP_UNLOCK(vp, 0, td);
1793107848Smckusick		FREE(snapblklist, M_UFSMNT);
1794107848Smckusick		return;
1795107848Smckusick	}
1796107848Smckusick	VOP_UNLOCK(vp, 0, td);
	/*
	 * Publish the list and enable the copy-on-write hook while holding
	 * the devvp interlock.
	 */
1797107848Smckusick	VI_LOCK(devvp);
1798107848Smckusick	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
1799107848Smckusick	devvp->v_rdev->si_snaplistsize = snaplistsize;
1800107848Smckusick	devvp->v_rdev->si_snapblklist = (daddr_t *)snapblklist;
1801107848Smckusick	devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
1802107848Smckusick	devvp->v_vflag |= VV_COPYONWRITE;
1803107848Smckusick	VI_UNLOCK(devvp);
180462976Smckusick}
180562976Smckusick
180662976Smckusick/*
180762976Smckusick * Disassociate snapshot files when unmounting.
 *
 * Empties the device's si_snapshots list, pointing each snapshot vnode
 * back at its private v_lock, then frees the block hints list and the
 * last lock taken from a snapshot vnode, and finally clears the
 * copy-on-write hook and the VV_COPYONWRITE flag on the device vnode.
180862976Smckusick */
180962976Smckusickvoid
181062976Smckusickffs_snapshot_unmount(mp)
181162976Smckusick	struct mount *mp;
181262976Smckusick{
1813107414Smckusick	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
1814107414Smckusick	struct snaphead *snaphead = &devvp->v_rdev->si_snapshots;
1815105191Smckusick	struct lock *lkp = NULL;
181662976Smckusick	struct inode *xp;
1817105191Smckusick	struct vnode *vp;
181862976Smckusick
1819107414Smckusick	VI_LOCK(devvp);
182073942Smckusick	while ((xp = TAILQ_FIRST(snaphead)) != 0) {
1821105191Smckusick		vp = ITOV(xp);
		/*
		 * Remember the lock this snapshot was using and restore
		 * the vnode's own lock.
		 */
1822105191Smckusick		lkp = vp->v_vnlock;
1823105191Smckusick		vp->v_vnlock = &vp->v_lock;
182473942Smckusick		TAILQ_REMOVE(snaphead, xp, i_nextsnap);
182573942Smckusick		xp->i_nextsnap.tqe_prev = 0;
		/*
		 * Snapshots that are still linked get their reference
		 * dropped; the devvp interlock is released around vrele().
		 */
1826107414Smckusick		if (xp->i_effnlink > 0) {
1827107414Smckusick			VI_UNLOCK(devvp);
1828105191Smckusick			vrele(vp);
1829107414Smckusick			VI_LOCK(devvp);
1830107414Smckusick		}
183162976Smckusick	}
1832107848Smckusick	if (devvp->v_rdev->si_snapblklist != NULL) {
1833107848Smckusick		FREE(devvp->v_rdev->si_snapblklist, M_UFSMNT);
1834107848Smckusick		devvp->v_rdev->si_snapblklist = NULL;
1835107848Smckusick		devvp->v_rdev->si_snaplistsize = 0;
1836107848Smckusick	}
	/*
	 * Only the lock seen on the final loop iteration is destroyed;
	 * this presumably relies on every listed snapshot sharing one lock,
	 * as ffs_snapshot_mount() arranges.
	 */
1837105191Smckusick	if (lkp != NULL) {
1838105191Smckusick		lockdestroy(lkp);
1839105191Smckusick		FREE(lkp, M_UFSMNT);
1840105191Smckusick	}
1841107414Smckusick	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
1842107414Smckusick	devvp->v_rdev->si_copyonwrite = 0;
1843107414Smckusick	devvp->v_vflag &= ~VV_COPYONWRITE;
1844107414Smckusick	VI_UNLOCK(devvp);
184562976Smckusick}
184662976Smckusick
184762976Smckusick/*
184862976Smckusick * Check for need to copy block that is about to be written,
184962976Smckusick * copying the block if necessary.
 *
 * Arguments:
 *	devvp	device vnode whose v_rdev carries the snapshot list and
 *		the block hints list
 *	bp	buffer about to be written; bp->b_blkno selects the
 *		logical block "lbn" that is checked
 *
 * Returns 0 immediately when lbn is found in the block hints list.
 * Otherwise each snapshot whose map entry for lbn is zero gets a copy of
 * the old block contents, and the function returns "error": zero, or the
 * failure reported by UFS_BALLOC() or readblock(). Panics if entered
 * while P_COWINPROGRESS is set on the current process.
185062976Smckusick */
185173942Smckusickstatic int
185273942Smckusickffs_copyonwrite(devvp, bp)
185373942Smckusick	struct vnode *devvp;
185473942Smckusick	struct buf *bp;
185562976Smckusick{
1856105191Smckusick	struct snaphead *snaphead;
185773942Smckusick	struct buf *ibp, *cbp, *savedcbp = 0;
185883366Sjulian	struct thread *td = curthread;
185973942Smckusick	struct fs *fs;
186062976Smckusick	struct inode *ip;
1861105670Smckusick	struct vnode *vp = 0;
1862107848Smckusick	ufs2_daddr_t lbn, blkno, *snapblklist;
1863105670Smckusick	int lower, upper, mid, indiroff, snapshot_locked = 0, error = 0;
186462976Smckusick
186583366Sjulian	if (td->td_proc->p_flag & P_COWINPROGRESS)
186662976Smckusick		panic("ffs_copyonwrite: recursive call");
1867107848Smckusick	/*
1868107848Smckusick	 * First check to see if it is in the preallocated list.
1869107848Smckusick	 * By doing this check we avoid several potential deadlocks.
1870107848Smckusick	 */
1871107414Smckusick	VI_LOCK(devvp);
1872105191Smckusick	snaphead = &devvp->v_rdev->si_snapshots;
	/*
	 * NOTE(review): the first list entry is dereferenced without a
	 * NULL check, so this assumes the snapshot list is never empty
	 * when the hook is called -- confirm against the code that
	 * clears si_copyonwrite.
	 */
1873105191Smckusick	ip = TAILQ_FIRST(snaphead);
1874105191Smckusick	fs = ip->i_fs;
1875105191Smckusick	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
1876107848Smckusick	snapblklist = devvp->v_rdev->si_snapblklist;
	/*
	 * Binary search over entries 1 .. si_snaplistsize - 1. Entry 0 is
	 * skipped; presumably it holds the list length (see how the list
	 * is read in ffs_snapshot_mount()). The loop exits with
	 * lower <= upper only via the "break" on a match.
	 */
1877107848Smckusick	upper = devvp->v_rdev->si_snaplistsize - 1;
1878107848Smckusick	lower = 1;
1879107848Smckusick	while (lower <= upper) {
1880107848Smckusick		mid = (lower + upper) / 2;
1881107848Smckusick		if (snapblklist[mid] == lbn)
1882107848Smckusick			break;
1883107848Smckusick		if (snapblklist[mid] < lbn)
1884107848Smckusick			lower = mid + 1;
1885107848Smckusick		else
1886107848Smckusick			upper = mid - 1;
1887107848Smckusick	}
1888107848Smckusick	if (lower <= upper) {
1889107848Smckusick		VI_UNLOCK(devvp);
1890107848Smckusick		return (0);
1891107848Smckusick	}
1892107848Smckusick	/*
1893107848Smckusick	 * Not in the precomputed list, so check the snapshots.
1894107848Smckusick	 */
	/*
	 * The retry label is reached with the devvp interlock held, so
	 * each failed lockmgr() attempt below retakes it before jumping
	 * back.
	 */
1895107414Smckusickretry:
1896105191Smckusick	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
189762976Smckusick		vp = ITOV(ip);
189862976Smckusick		/*
189962976Smckusick		 * We ensure that everything of our own that needs to be
190062976Smckusick		 * copied will be done at the time that ffs_snapshot is
190162976Smckusick		 * called. Thus we can skip the check here which can
190276132Sphk		 * deadlock in doing the lookup in UFS_BALLOC.
190362976Smckusick		 */
190462976Smckusick		if (bp->b_vp == vp)
190562976Smckusick			continue;
190662976Smckusick		/*
1907105670Smckusick		 * Check to see if block needs to be copied. We do not have
1908105670Smckusick		 * to hold the snapshot lock while doing this lookup as it
1909105670Smckusick		 * will never require any additional allocations for the
1910105670Smckusick		 * snapshot inode.
191162976Smckusick		 */
191262976Smckusick		if (lbn < NDADDR) {
191398542Smckusick			blkno = DIP(ip, i_db[lbn]);
191462976Smckusick		} else {
1915107414Smckusick			if (snapshot_locked == 0 &&
1916107848Smckusick			    lockmgr(vp->v_vnlock,
1917107414Smckusick			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
1918107414Smckusick			      VI_MTX(devvp), td) != 0) {
1919107414Smckusick				VI_LOCK(devvp);
1920107414Smckusick				goto retry;
1921107414Smckusick			}
1922107848Smckusick			snapshot_locked = 1;
192383366Sjulian			td->td_proc->p_flag |= P_COWINPROGRESS;
192476132Sphk			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
1925105191Smckusick			   fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
192683366Sjulian			td->td_proc->p_flag &= ~P_COWINPROGRESS;
1927105191Smckusick			if (error)
1928105191Smckusick				break;
192962976Smckusick			indiroff = (lbn - NDADDR) % NINDIR(fs);
193098542Smckusick			if (ip->i_ump->um_fstype == UFS1)
193198542Smckusick				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
193298542Smckusick			else
193398542Smckusick				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
193463788Smckusick			bqrelse(ibp);
193562976Smckusick		}
193662976Smckusick#ifdef DIAGNOSTIC
193762976Smckusick		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
193862976Smckusick			panic("ffs_copyonwrite: bad copy block");
193962976Smckusick#endif
		/*
		 * Any non-zero map entry means this snapshot needs no copy
		 * of the block.
		 */
1940105191Smckusick		if (blkno != 0)
194162976Smckusick			continue;
194262976Smckusick		/*
1943105670Smckusick		 * Allocate the block into which to do the copy. Since
1944105670Smckusick		 * multiple processes may all try to copy the same block,
1945105670Smckusick		 * we have to recheck our need to do a copy if we sleep
1946105670Smckusick		 * waiting for the lock.
1947105670Smckusick		 *
1948105670Smckusick		 * Because all snapshots on a filesystem share a single
1949105670Smckusick		 * lock, we ensure that we will never be in competition
1950105670Smckusick		 * with another process to allocate a block.
195162976Smckusick		 */
1952105670Smckusick		if (snapshot_locked == 0 &&
1953107414Smckusick		    lockmgr(vp->v_vnlock,
1954107414Smckusick		      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
1955107414Smckusick		      VI_MTX(devvp), td) != 0) {
1956107414Smckusick			VI_LOCK(devvp);
1957105670Smckusick			goto retry;
1958107414Smckusick		}
1959105670Smckusick		snapshot_locked = 1;
196083366Sjulian		td->td_proc->p_flag |= P_COWINPROGRESS;
196176132Sphk		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
1962105191Smckusick		    fs->fs_bsize, KERNCRED, 0, &cbp);
196383366Sjulian		td->td_proc->p_flag &= ~P_COWINPROGRESS;
1964105191Smckusick		if (error)
1965105191Smckusick			break;
196662976Smckusick#ifdef DEBUG
196762976Smckusick		if (snapdebug) {
196898687Smux			printf("Copyonwrite: snapino %d lbn %jd for ",
196998542Smckusick			    ip->i_number, (intmax_t)lbn);
197073942Smckusick			if (bp->b_vp == devvp)
197162976Smckusick				printf("fs metadata");
197262976Smckusick			else
197362976Smckusick				printf("inum %d", VTOI(bp->b_vp)->i_number);
197498687Smux			printf(" lblkno %jd to blkno %jd\n",
197598542Smckusick			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
197662976Smckusick		}
197762976Smckusick#endif
197862976Smckusick		/*
197962976Smckusick		 * If we have already read the old block contents, then
198075943Smckusick		 * simply copy them to the new block. Note that we need
198175943Smckusick		 * to synchronously write snapshots that have not been
198275943Smckusick		 * unlinked, and hence will be visible after a crash,
198375943Smckusick		 * to ensure their integrity.
198462976Smckusick		 */
198562976Smckusick		if (savedcbp != 0) {
198662976Smckusick			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
198762976Smckusick			bawrite(cbp);
198876580Smckusick			if (dopersistence && ip->i_effnlink > 0)
198983366Sjulian				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
199062976Smckusick			continue;
199162976Smckusick		}
199262976Smckusick		/*
199362976Smckusick		 * Otherwise, read the old block contents into the buffer.
199462976Smckusick		 */
199575943Smckusick		if ((error = readblock(cbp, lbn)) != 0) {
			/*
			 * The read failed: write out a zero filled copy
			 * block and stop, returning the read error.
			 */
199675943Smckusick			bzero(cbp->b_data, fs->fs_bsize);
199775943Smckusick			bawrite(cbp);
199876580Smckusick			if (dopersistence && ip->i_effnlink > 0)
199983366Sjulian				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
200062976Smckusick			break;
200175943Smckusick		}
		/*
		 * Keep the first successfully read copy; later snapshots
		 * bcopy() from it and it is written out after the loop.
		 */
200262976Smckusick		savedcbp = cbp;
200362976Smckusick	}
200475943Smckusick	/*
200575943Smckusick	 * Note that we need to synchronously write snapshots that
200675943Smckusick	 * have not been unlinked, and hence will be visible after
200775943Smckusick	 * a crash, to ensure their integrity.
200875943Smckusick	 */
200975943Smckusick	if (savedcbp) {
201075943Smckusick		vp = savedcbp->b_vp;
201162976Smckusick		bawrite(savedcbp);
2012105191Smckusick		if (dopersistence && VTOI(vp)->i_effnlink > 0)
201383366Sjulian			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
201475943Smckusick	}
	/*
	 * Release whichever lock is still held: the snapshot lock if it
	 * was taken during the walk, otherwise the devvp interlock.
	 */
2015105670Smckusick	if (snapshot_locked)
2016105670Smckusick		VOP_UNLOCK(vp, 0, td);
2017107414Smckusick	else
2018107414Smckusick		VI_UNLOCK(devvp);
201962976Smckusick	return (error);
202062976Smckusick}
202162976Smckusick
202262976Smckusick/*
202362976Smckusick * Read the specified block into the given buffer.
202462976Smckusick * Much of this boiler-plate comes from bwrite().
 *
 * Arguments:
 *	bp	destination buffer; bp->b_bcount bytes are read into
 *		bp->b_data, and bp->b_vp supplies the inode whose i_fs and
 *		i_devvp are used
 *	lbn	logical block number, converted to a byte offset on the
 *		device with blkstofrags(), fsbtodb() and dbtob()
 *
 * Returns the value returned by physio().
202562976Smckusick */
202662976Smckusickstatic int
202762976Smckusickreadblock(bp, lbn)
202862976Smckusick	struct buf *bp;
202998542Smckusick	ufs2_daddr_t lbn;
203062976Smckusick{
203162976Smckusick	struct uio auio;
203262976Smckusick	struct iovec aiov;
203383366Sjulian	struct thread *td = curthread;
203462976Smckusick	struct inode *ip = VTOI(bp->b_vp);
203562976Smckusick
	/* Describe a single kernel-space read covering the whole buffer. */
203662976Smckusick	aiov.iov_base = bp->b_data;
203762976Smckusick	aiov.iov_len = bp->b_bcount;
203862976Smckusick	auio.uio_iov = &aiov;
203962976Smckusick	auio.uio_iovcnt = 1;
204062976Smckusick	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
204162976Smckusick	auio.uio_resid = bp->b_bcount;
204262976Smckusick	auio.uio_rw = UIO_READ;
204362976Smckusick	auio.uio_segflg = UIO_SYSSPACE;
204483366Sjulian	auio.uio_td = td;
	/* The read goes to the underlying device, not through the vnode. */
204562976Smckusick	return (physio(ip->i_devvp->v_rdev, &auio, 0));
204662976Smckusick}
2047