ffs_snapshot.c revision 63788
/*
 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
 *
 * Further information about snapshots can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
 * $FreeBSD: head/sys/ufs/ffs/ffs_snapshot.c 63788 2000-07-24 05:28:33Z mckusick $
 */
3662976Smckusick
3762976Smckusick#include <sys/param.h>
3862976Smckusick#include <sys/systm.h>
3962976Smckusick#include <sys/bio.h>
4062976Smckusick#include <sys/buf.h>
4162976Smckusick#include <sys/proc.h>
4262976Smckusick#include <sys/namei.h>
4362976Smckusick#include <sys/stat.h>
4462976Smckusick#include <sys/malloc.h>
4562976Smckusick#include <sys/mount.h>
4662976Smckusick#include <sys/resource.h>
4762976Smckusick#include <sys/resourcevar.h>
4862976Smckusick#include <sys/vnode.h>
4962976Smckusick
5062976Smckusick#include <ufs/ufs/extattr.h>
5162976Smckusick#include <ufs/ufs/quota.h>
5262976Smckusick#include <ufs/ufs/ufsmount.h>
5362976Smckusick#include <ufs/ufs/inode.h>
5462976Smckusick#include <ufs/ufs/ufs_extern.h>
5562976Smckusick
5662976Smckusick#include <ufs/ffs/fs.h>
5762976Smckusick#include <ufs/ffs/ffs_extern.h>
5862976Smckusick
5962976Smckusick#define KERNCRED proc0.p_ucred
6062976Smckusick#define CURPROC curproc
6162976Smckusick#define DEBUG
6262976Smckusick
6362976Smckusickstatic int indiracct __P((struct vnode *, struct vnode *, int, ufs_daddr_t,
6462976Smckusick	int, int, int, int));
6562976Smckusickstatic int snapacct __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *));
6662976Smckusickstatic int readblock __P((struct buf *, daddr_t));
6762976Smckusick
6862976Smckusick#ifdef DEBUG
6962976Smckusick#include <sys/sysctl.h>
7062976Smckusickint snapdebug = 0;
7162976SmckusickSYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
7262976Smckusick#endif /* DEBUG */
7362976Smckusick
7462976Smckusick/*
7562976Smckusick * Create a snapshot file and initialize it for the filesystem.
7662976Smckusick */
7762976Smckusickint
7862976Smckusickffs_snapshot(mp, snapfile)
7962976Smckusick	struct mount *mp;
8062976Smckusick	char *snapfile;
8162976Smckusick{
8262976Smckusick	ufs_daddr_t rlbn;
8362976Smckusick	ufs_daddr_t lbn, blkno, copyblkno, inoblks[FSMAXSNAP];
8462976Smckusick	int error, cg, snaploc, indiroff, numblks;
8562976Smckusick	int i, size, base, len, loc, inoblkcnt;
8662976Smckusick	int blksperindir, flag = mp->mnt_flag;
8762976Smckusick	struct fs *fs = VFSTOUFS(mp)->um_fs;
8862976Smckusick	struct proc *p = CURPROC;
8962976Smckusick	struct inode *devip, *ip, *xp;
9062976Smckusick	struct buf *bp, *nbp, *ibp;
9162976Smckusick	struct vnode *vp, *devvp;
9262976Smckusick	struct nameidata nd;
9362976Smckusick	struct mount *wrtmp;
9462976Smckusick	struct dinode *dip;
9562976Smckusick	struct vattr vat;
9662976Smckusick	struct cg *cgp;
9762976Smckusick
9862976Smckusick	/*
9962976Smckusick	 * Need to serialize access to snapshot code per filesystem.
10062976Smckusick	 */
10162976Smckusick	/*
10262976Smckusick	 * Assign a snapshot slot in the superblock.
10362976Smckusick	 */
10462976Smckusick	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
10562976Smckusick		if (fs->fs_snapinum[snaploc] == 0)
10662976Smckusick			break;
10762976Smckusick	if (snaploc == FSMAXSNAP)
10862976Smckusick		return (ENOSPC);
10962976Smckusick	/*
11062976Smckusick	 * Create the snapshot file.
11162976Smckusick	 */
11262976Smckusickrestart:
11362976Smckusick	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, p);
11462976Smckusick	if ((error = namei(&nd)) != 0)
11562976Smckusick		return (error);
11662976Smckusick	if (nd.ni_vp != NULL) {
11762976Smckusick		vput(nd.ni_vp);
11862976Smckusick		error = EEXIST;
11962976Smckusick	}
12062976Smckusick	if (nd.ni_dvp->v_mount != mp)
12162976Smckusick		error = EXDEV;
12262976Smckusick	if (error) {
12362976Smckusick		NDFREE(&nd, NDF_ONLY_PNBUF);
12462976Smckusick		if (nd.ni_dvp == nd.ni_vp)
12562976Smckusick			vrele(nd.ni_dvp);
12662976Smckusick		else
12762976Smckusick			vput(nd.ni_dvp);
12862976Smckusick		return (error);
12962976Smckusick	}
13062976Smckusick	VATTR_NULL(&vat);
13162976Smckusick	vat.va_type = VREG;
13262976Smckusick	vat.va_mode = S_IRUSR;
13362976Smckusick	vat.va_vaflags |= VA_EXCLUSIVE;
13462976Smckusick	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
13562976Smckusick		wrtmp = NULL;
13662976Smckusick	if (wrtmp != mp)
13762976Smckusick		panic("ffs_snapshot: mount mismatch");
13862985Smckusick	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
13962976Smckusick		NDFREE(&nd, NDF_ONLY_PNBUF);
14062976Smckusick		vput(nd.ni_dvp);
14162985Smckusick		if ((error = vn_start_write(NULL, &wrtmp,
14262985Smckusick		    V_XSLEEP | PCATCH)) != 0)
14362976Smckusick			return (error);
14462976Smckusick		goto restart;
14562976Smckusick	}
14662976Smckusick	VOP_LEASE(nd.ni_dvp, p, KERNCRED, LEASE_WRITE);
14762976Smckusick	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
14862976Smckusick	vput(nd.ni_dvp);
14962976Smckusick	if (error) {
15062976Smckusick		NDFREE(&nd, NDF_ONLY_PNBUF);
15162976Smckusick		vn_finished_write(wrtmp);
15262976Smckusick		return (error);
15362976Smckusick	}
15462976Smckusick	vp = nd.ni_vp;
15562976Smckusick	ip = VTOI(vp);
15662976Smckusick	devvp = ip->i_devvp;
15762976Smckusick	devip = VTOI(devvp);
15862976Smckusick	/*
15962976Smckusick	 * Allocate and copy the last block contents so as to be able
16062976Smckusick	 * to set size to that of the filesystem.
16162976Smckusick	 */
16262976Smckusick	numblks = howmany(fs->fs_size, fs->fs_frag);
16362976Smckusick	error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
16462976Smckusick	    fs->fs_bsize, KERNCRED, B_CLRBUF, &bp);
16562976Smckusick	if (error)
16662976Smckusick		goto out;
16762976Smckusick	ip->i_size = lblktosize(fs, (off_t)numblks);
16862976Smckusick	ip->i_flag |= IN_CHANGE | IN_UPDATE;
16962976Smckusick	if ((error = readblock(bp, numblks - 1)) != 0)
17062976Smckusick		goto out;
17162976Smckusick	bawrite(bp);
17262976Smckusick	/*
17362976Smckusick	 * Preallocate critical data structures so that we can copy
17462976Smckusick	 * them in without further allocation after we suspend all
17562976Smckusick	 * operations on the filesystem. We would like to just release
17662976Smckusick	 * the allocated buffers without writing them since they will
17762976Smckusick	 * be filled in below once we are ready to go, but this upsets
17862976Smckusick	 * the soft update code, so we go ahead and write the new buffers.
17962976Smckusick	 *
18062976Smckusick	 * Allocate all indirect blocks. Also allocate shadow copies
18162976Smckusick	 * for each of the indirect blocks.
18262976Smckusick	 */
18362976Smckusick	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
18462976Smckusick		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
18562976Smckusick		    fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp);
18662976Smckusick		if (error)
18762976Smckusick			goto out;
18862976Smckusick		copyblkno = fragstoblks(fs, dbtofsb(fs, ibp->b_blkno));
18962976Smckusick		bdwrite(ibp);
19062976Smckusick		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)copyblkno),
19162976Smckusick		    fs->fs_bsize, p->p_ucred, 0, &nbp);
19262976Smckusick		if (error)
19362976Smckusick			goto out;
19462976Smckusick		bawrite(nbp);
19562976Smckusick	}
19662976Smckusick	/*
19762976Smckusick	 * Allocate shadow blocks to copy all of the other snapshot inodes
19862976Smckusick	 * so that we will be able to expunge them from this snapshot.
19962976Smckusick	 */
20062976Smckusick	for (loc = 0, inoblkcnt = 0; loc < snaploc; loc++) {
20162976Smckusick		blkno = fragstoblks(fs, ino_to_fsba(fs, fs->fs_snapinum[loc]));
20262976Smckusick		for (i = 0; i < inoblkcnt; i++)
20362976Smckusick			if (inoblks[i] == blkno)
20462976Smckusick				break;
20562976Smckusick		if (i == inoblkcnt) {
20662976Smckusick			inoblks[inoblkcnt++] = blkno;
20762976Smckusick			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
20862976Smckusick			    fs->fs_bsize, KERNCRED, 0, &nbp);
20962976Smckusick			if (error)
21062976Smckusick				goto out;
21162976Smckusick			bawrite(nbp);
21262976Smckusick		}
21362976Smckusick	}
21462976Smckusick	/*
21562976Smckusick	 * Allocate all cylinder group blocks.
21662976Smckusick	 */
21762976Smckusick	for (cg = 0; cg < fs->fs_ncg; cg++) {
21862976Smckusick		error = VOP_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift,
21962976Smckusick		    fs->fs_bsize, KERNCRED, 0, &nbp);
22062976Smckusick		if (error)
22162976Smckusick			goto out;
22262976Smckusick		bawrite(nbp);
22362976Smckusick	}
22462976Smckusick	/*
22562976Smckusick	 * Allocate copies for the superblock and its summary information.
22662976Smckusick	 */
22762976Smckusick	error = VOP_BALLOC(vp, (off_t)(SBOFF), fs->fs_bsize, KERNCRED,
22862976Smckusick	    0, &nbp);
22962976Smckusick	if (error)
23062976Smckusick		goto out;
23162976Smckusick	bawrite(nbp);
23262976Smckusick	blkno = fragstoblks(fs, fs->fs_csaddr);
23362976Smckusick	len = howmany(fs->fs_cssize, fs->fs_bsize);
23462976Smckusick	for (loc = 0; loc < len; loc++) {
23562976Smckusick		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
23662976Smckusick		    fs->fs_bsize, KERNCRED, 0, &nbp);
23762976Smckusick		if (error)
23862976Smckusick			goto out;
23962976Smckusick		bawrite(nbp);
24062976Smckusick	}
24162976Smckusick	/*
24262976Smckusick	 * Change inode to snapshot type file.
24362976Smckusick	 */
24462976Smckusick	ip->i_flags |= SF_IMMUTABLE | SF_SNAPSHOT;
24562976Smckusick	ip->i_flag |= IN_CHANGE | IN_UPDATE;
24662976Smckusick	/*
24762976Smckusick	 * Ensure that the snapshot is completely on disk.
24862976Smckusick	 */
24962976Smckusick	if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p)) != 0)
25062976Smckusick		goto out;
25162976Smckusick	/*
25262976Smckusick	 * All allocations are done, so we can now snapshot the system.
25362976Smckusick	 *
25462976Smckusick	 * Suspend operation on filesystem.
25562976Smckusick	 */
25662976Smckusick	for (;;) {
25762976Smckusick		vn_finished_write(wrtmp);
25862976Smckusick		vfs_write_suspend(vp->v_mount);
25962976Smckusick		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
26062976Smckusick			break;
26162985Smckusick		vn_start_write(NULL, &wrtmp, V_WAIT);
26262976Smckusick	}
26362976Smckusick	/*
26462976Smckusick	 * First, copy all the cylinder group maps. All the unallocated
26562976Smckusick	 * blocks are marked BLK_NOCOPY so that the snapshot knows that
26662976Smckusick	 * it need not copy them if they are later written.
26762976Smckusick	 */
26862976Smckusick	len = howmany(fs->fs_fpg, fs->fs_frag);
26962976Smckusick	for (cg = 0; cg < fs->fs_ncg; cg++) {
27062976Smckusick		error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
27162976Smckusick			(int)fs->fs_cgsize, KERNCRED, &bp);
27262976Smckusick		if (error) {
27362976Smckusick			brelse(bp);
27462976Smckusick			goto out1;
27562976Smckusick		}
27662976Smckusick		cgp = (struct cg *)bp->b_data;
27762976Smckusick		if (!cg_chkmagic(cgp)) {
27862976Smckusick			brelse(bp);
27962976Smckusick			error = EIO;
28062976Smckusick			goto out1;
28162976Smckusick		}
28262976Smckusick		error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize,
28362976Smckusick			KERNCRED, &nbp);
28462976Smckusick		if (error) {
28562976Smckusick			brelse(bp);
28662976Smckusick			brelse(nbp);
28762976Smckusick			goto out1;
28862976Smckusick		}
28962976Smckusick		bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
29062976Smckusick		if (fs->fs_cgsize < fs->fs_bsize)
29162976Smckusick			bzero(&nbp->b_data[fs->fs_cgsize],
29262976Smckusick			    fs->fs_bsize - fs->fs_cgsize);
29363788Smckusick		nbp->b_flags |= B_VALIDSUSPWRT;
29462976Smckusick		bawrite(nbp);
29562976Smckusick		base = cg * fs->fs_fpg / fs->fs_frag;
29662976Smckusick		if (base + len > numblks)
29762976Smckusick			len = numblks - base;
29862976Smckusick		loc = 0;
29962976Smckusick		if (base < NDADDR) {
30062976Smckusick			for ( ; loc < NDADDR; loc++) {
30162976Smckusick				if (!ffs_isblock(fs, cg_blksfree(cgp), loc))
30262976Smckusick					continue;
30362976Smckusick				ip->i_db[loc] = BLK_NOCOPY;
30462976Smckusick			}
30562976Smckusick		}
30662976Smckusick		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
30762976Smckusick		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
30862976Smckusick		if (error) {
30962976Smckusick			brelse(bp);
31062976Smckusick			goto out1;
31162976Smckusick		}
31262976Smckusick		indiroff = (base + loc - NDADDR) % NINDIR(fs);
31362976Smckusick		for ( ; loc < len; loc++, indiroff++) {
31462976Smckusick			if (indiroff >= NINDIR(fs)) {
31563788Smckusick				ibp->b_flags |= B_VALIDSUSPWRT;
31662976Smckusick				bawrite(ibp);
31762976Smckusick				error = VOP_BALLOC(vp,
31862976Smckusick				    lblktosize(fs, (off_t)(base + loc)),
31962976Smckusick				    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
32062976Smckusick				if (error) {
32162976Smckusick					brelse(bp);
32262976Smckusick					goto out1;
32362976Smckusick				}
32462976Smckusick				indiroff = 0;
32562976Smckusick			}
32662976Smckusick			if (!ffs_isblock(fs, cg_blksfree(cgp), loc))
32762976Smckusick				continue;
32862976Smckusick			((ufs_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
32962976Smckusick		}
33063788Smckusick		bqrelse(bp);
33163788Smckusick		ibp->b_flags |= B_VALIDSUSPWRT;
33262976Smckusick		bdwrite(ibp);
33362976Smckusick	}
33462976Smckusick	/*
33562976Smckusick	 * Snapshot the superblock and its summary information.
33662976Smckusick	 */
33762976Smckusick	error = VOP_BALLOC(vp, (off_t)(SBOFF), fs->fs_bsize, KERNCRED,
33862976Smckusick	    0, &nbp);
33962976Smckusick	if (error)
34062976Smckusick		goto out1;
34162976Smckusick	bcopy(fs, nbp->b_data, fs->fs_sbsize);
34262976Smckusick	((struct fs *)(nbp->b_data))->fs_clean = 1;
34362976Smckusick	if (fs->fs_sbsize < fs->fs_bsize)
34462976Smckusick		bzero(&nbp->b_data[fs->fs_sbsize],
34562976Smckusick		    fs->fs_bsize - fs->fs_sbsize);
34663788Smckusick	nbp->b_flags |= B_VALIDSUSPWRT;
34762976Smckusick	bawrite(nbp);
34862976Smckusick	blkno = fragstoblks(fs, fs->fs_csaddr);
34962976Smckusick	len = howmany(fs->fs_cssize, fs->fs_bsize) - 1;
35062976Smckusick	size = fs->fs_bsize;
35162976Smckusick	for (loc = 0; loc <= len; loc++) {
35262976Smckusick		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
35362976Smckusick		    fs->fs_bsize, KERNCRED, 0, &nbp);
35462976Smckusick		if (error)
35562976Smckusick			goto out1;
35662976Smckusick		if (loc == len) {
35762976Smckusick			readblock(nbp, blkno + loc);
35862976Smckusick			size = fs->fs_cssize % fs->fs_bsize;
35962976Smckusick		}
36062976Smckusick		bcopy(fs->fs_csp[loc], nbp->b_data, size);
36163788Smckusick		nbp->b_flags |= B_VALIDSUSPWRT;
36262976Smckusick		bawrite(nbp);
36362976Smckusick	}
36462976Smckusick	/*
36562976Smckusick	 * Copy the shadow blocks for the snapshot inodes so that
36662976Smckusick	 * the copies can can be expunged.
36762976Smckusick	 */
36862976Smckusick	for (loc = 0; loc < inoblkcnt; loc++) {
36962976Smckusick		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)inoblks[loc]),
37062976Smckusick		    fs->fs_bsize, KERNCRED, 0, &nbp);
37162976Smckusick		if (error)
37262976Smckusick			goto out1;
37362976Smckusick		readblock(nbp, inoblks[loc]);
37463788Smckusick		nbp->b_flags |= B_VALIDSUSPWRT;
37562976Smckusick		bdwrite(nbp);
37662976Smckusick	}
37762976Smckusick	/*
37862976Smckusick	 * Copy allocation information from other snapshots and then
37962976Smckusick	 * expunge them from the view of the current snapshot.
38062976Smckusick	 */
38162976Smckusick	for (xp = devip->i_copyonwrite; xp; xp = xp->i_copyonwrite) {
38262976Smckusick		/*
38362976Smckusick		 * Before expunging a snapshot inode, note all the
38462976Smckusick		 * blocks that it claims with BLK_SNAP so that fsck will
38562976Smckusick		 * be able to account for those blocks properly and so
38662976Smckusick		 * that this snapshot knows that it need not copy them
38762976Smckusick		 * if the other snapshot holding them is freed.
38862976Smckusick		 */
38962976Smckusick		if ((error = snapacct(vp, &xp->i_db[0], &xp->i_ib[NIADDR])) !=0)
39062976Smckusick			goto out1;
39162976Smckusick		blksperindir = 1;
39262976Smckusick		lbn = -NDADDR;
39362976Smckusick		len = numblks - NDADDR;
39462976Smckusick		rlbn = NDADDR;
39562976Smckusick		for (i = 0; len > 0 && i < NIADDR; i++) {
39662976Smckusick			error = indiracct(vp, ITOV(xp), i, xp->i_ib[i], lbn,
39762976Smckusick			    rlbn, len, blksperindir);
39862976Smckusick			if (error)
39962976Smckusick				goto out1;
40062976Smckusick			blksperindir *= NINDIR(fs);
40162976Smckusick			lbn -= blksperindir + 1;
40262976Smckusick			len -= blksperindir;
40362976Smckusick			rlbn += blksperindir;
40462976Smckusick		}
40562976Smckusick		/*
40662976Smckusick		 * Set copied snapshot inode to be a zero length file.
40762976Smckusick		 */
40862976Smckusick		blkno = fragstoblks(fs, ino_to_fsba(fs, xp->i_number));
40962976Smckusick		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
41062976Smckusick		    fs->fs_bsize, KERNCRED, 0, &nbp);
41162976Smckusick		if (error)
41262976Smckusick			goto out1;
41362976Smckusick		dip = (struct dinode *)nbp->b_data +
41462976Smckusick		    ino_to_fsbo(fs, xp->i_number);
41562976Smckusick		dip->di_size = 0;
41662976Smckusick		dip->di_blocks = 0;
41762976Smckusick		dip->di_flags &= ~(SF_IMMUTABLE | SF_SNAPSHOT);
41862976Smckusick		bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs_daddr_t));
41963788Smckusick		nbp->b_flags |= B_VALIDSUSPWRT;
42062976Smckusick		bdwrite(nbp);
42162976Smckusick	}
42262976Smckusick	/*
42362976Smckusick	 * Copy all indirect blocks to their shadows (allocated above)
42462976Smckusick	 * to avoid deadlock in ffs_copyonwrite.
42562976Smckusick	 */
42662976Smckusick	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
42762976Smckusick		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
42862976Smckusick		    fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp);
42962976Smckusick		if (error)
43062976Smckusick			goto out1;
43162976Smckusick		copyblkno = fragstoblks(fs, dbtofsb(fs, ibp->b_blkno));
43263788Smckusick		bqrelse(ibp);
43362976Smckusick		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)copyblkno),
43462976Smckusick		    fs->fs_bsize, p->p_ucred, 0, &nbp);
43562976Smckusick		if (error)
43662976Smckusick			goto out1;
43762976Smckusick		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
43862976Smckusick		    fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp);
43962976Smckusick		if (error) {
44062976Smckusick			brelse(nbp);
44162976Smckusick			goto out1;
44262976Smckusick		}
44362976Smckusick		bcopy(ibp->b_data, nbp->b_data, fs->fs_bsize);
44463788Smckusick		bqrelse(ibp);
44563788Smckusick		nbp->b_flags |= B_VALIDSUSPWRT;
44662976Smckusick		bawrite(nbp);
44762976Smckusick	}
44862976Smckusick	/*
44962976Smckusick	 * Record snapshot inode. Since this is the newest snapshot,
45062976Smckusick	 * it must be placed at the end of the list.
45162976Smckusick	 */
45262976Smckusick	fs->fs_snapinum[snaploc] = ip->i_number;
45362976Smckusick	if (ip->i_copyonwrite != 0)
45462976Smckusick		panic("ffs_snapshot: %d already on list", ip->i_number);
45562976Smckusick	if (devip->i_copyonwrite == 0) {
45662976Smckusick		devvp->v_flag |= VCOPYONWRITE;
45762976Smckusick		devip->i_copyonwrite = ip;
45862976Smckusick	} else {
45962976Smckusick		for (xp = devip->i_copyonwrite; xp->i_copyonwrite != 0; )
46062976Smckusick			xp = xp->i_copyonwrite;
46162976Smckusick		xp->i_copyonwrite = ip;
46262976Smckusick	}
46362976Smckusick	vp->v_flag |= VSYSTEM;
46462976Smckusick	/*
46562976Smckusick	 * Resume operation on filesystem.
46662976Smckusick	 */
46762976Smckusickout1:
46862976Smckusick	vfs_write_resume(vp->v_mount);
46962985Smckusick	vn_start_write(NULL, &wrtmp, V_WAIT);
47062976Smckusickout:
47162976Smckusick	mp->mnt_flag = flag;
47262976Smckusick	(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
47362976Smckusick	if (error)
47462976Smckusick		vput(vp);
47562976Smckusick	else
47662976Smckusick		VOP_UNLOCK(vp, 0, p);
47762976Smckusick	vn_finished_write(wrtmp);
47862976Smckusick	return (error);
47962976Smckusick}
48062976Smckusick
48162976Smckusick/*
48262976Smckusick * Descend an indirect block chain for vnode cancelvp accounting for all
48362976Smckusick * its indirect blocks in snapvp.
48462976Smckusick */
48562976Smckusickstatic int
48662976Smckusickindiracct(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir)
48762976Smckusick	struct vnode *snapvp;
48862976Smckusick	struct vnode *cancelvp;
48962976Smckusick	int level;
49062976Smckusick	ufs_daddr_t blkno;
49162976Smckusick	int lbn;
49262976Smckusick	int rlbn;
49362976Smckusick	int remblks;
49462976Smckusick	int blksperindir;
49562976Smckusick{
49662976Smckusick	int subblksperindir, error, last, num, i;
49762976Smckusick	struct indir indirs[NIADDR + 2];
49862976Smckusick	ufs_daddr_t *bap;
49962976Smckusick	struct buf *bp;
50062976Smckusick	struct fs *fs;
50162976Smckusick
50262976Smckusick	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
50362976Smckusick		return (error);
50462976Smckusick	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
50562976Smckusick		panic("indiracct: botched params");
50662976Smckusick	/*
50762976Smckusick	 * We have to expand bread here since it will deadlock looking
50862976Smckusick	 * up the block number for any blocks that are not in the cache.
50962976Smckusick	 */
51062976Smckusick	fs = VTOI(cancelvp)->i_fs;
51162976Smckusick	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
51262976Smckusick	bp->b_blkno = fsbtodb(fs, blkno);
51362976Smckusick	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
51462976Smckusick	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
51562976Smckusick		brelse(bp);
51662976Smckusick		return (error);
51762976Smckusick	}
51862976Smckusick	/*
51962976Smckusick	 * Account for the block pointers in this indirect block.
52062976Smckusick	 */
52162976Smckusick	last = howmany(remblks, blksperindir);
52262976Smckusick	if (last > NINDIR(fs))
52362976Smckusick		last = NINDIR(fs);
52462976Smckusick	if (snapvp != cancelvp) {
52562976Smckusick		bap = (ufs_daddr_t *)bp->b_data;
52662976Smckusick	} else {
52762976Smckusick		MALLOC(bap, ufs_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
52862976Smckusick		bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
52963788Smckusick		bqrelse(bp);
53062976Smckusick	}
53162976Smckusick	error = snapacct(snapvp, &bap[0], &bap[last]);
53262976Smckusick	if (error || level == 0)
53362976Smckusick		goto out;
53462976Smckusick	/*
53562976Smckusick	 * Account for the block pointers in each of the indirect blocks
53662976Smckusick	 * in the levels below us.
53762976Smckusick	 */
53862976Smckusick	subblksperindir = blksperindir / NINDIR(fs);
53962976Smckusick	for (lbn++, level--, i = 0; i < last; i++) {
54062976Smckusick		error = indiracct(snapvp, cancelvp, level, bap[i], lbn,
54162976Smckusick		    rlbn, remblks, subblksperindir);
54262976Smckusick		if (error)
54362976Smckusick			goto out;
54462976Smckusick		rlbn += blksperindir;
54562976Smckusick		lbn -= blksperindir;
54662976Smckusick		remblks -= blksperindir;
54762976Smckusick	}
54862976Smckusickout:
54962976Smckusick	if (snapvp != cancelvp)
55063788Smckusick		bqrelse(bp);
55162976Smckusick	else
55262976Smckusick		FREE(bap, M_DEVBUF);
55362976Smckusick	return (error);
55462976Smckusick}
55562976Smckusick
55662976Smckusick/*
55762976Smckusick * Account for a set of blocks allocated in a snapshot inode.
55862976Smckusick */
55962976Smckusickstatic int
56062976Smckusicksnapacct(vp, oldblkp, lastblkp)
56162976Smckusick	struct vnode *vp;
56262976Smckusick	ufs_daddr_t *oldblkp, *lastblkp;
56362976Smckusick{
56462976Smckusick	struct inode *ip = VTOI(vp);
56562976Smckusick	struct fs *fs = ip->i_fs;
56662976Smckusick	ufs_daddr_t lbn, blkno, *blkp;
56762976Smckusick	struct buf *ibp;
56862976Smckusick	int error;
56962976Smckusick
57062976Smckusick	for ( ; oldblkp < lastblkp; oldblkp++) {
57162976Smckusick		blkno = *oldblkp;
57262976Smckusick		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
57362976Smckusick			continue;
57462976Smckusick		lbn = fragstoblks(fs, blkno);
57562976Smckusick		if (lbn < NDADDR) {
57662976Smckusick			blkp = &ip->i_db[lbn];
57762976Smckusick			ip->i_flag |= IN_CHANGE | IN_UPDATE;
57862976Smckusick		} else {
57962976Smckusick			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
58062976Smckusick			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
58162976Smckusick			if (error)
58262976Smckusick				return (error);
58362976Smckusick			blkp = &((ufs_daddr_t *)(ibp->b_data))
58462976Smckusick			    [(lbn - NDADDR) % NINDIR(fs)];
58562976Smckusick		}
58662976Smckusick		if (*blkp != 0)
58762976Smckusick			panic("snapacct: bad block");
58862976Smckusick		*blkp = BLK_SNAP;
58963788Smckusick		if (lbn >= NDADDR) {
59063788Smckusick			ibp->b_flags |= B_VALIDSUSPWRT;
59162976Smckusick			bdwrite(ibp);
59263788Smckusick		}
59362976Smckusick	}
59462976Smckusick	return (0);
59562976Smckusick}
59662976Smckusick
59762976Smckusick/*
59862976Smckusick * Prepare a snapshot file for being removed.
59962976Smckusick */
60062976Smckusickvoid
60162976Smckusickffs_snapremove(vp)
60262976Smckusick	struct vnode *vp;
60362976Smckusick{
60462976Smckusick	struct inode *ip, *xp;
60562976Smckusick	struct vnode *devvp;
60662976Smckusick	struct buf *ibp;
60762976Smckusick	struct fs *fs;
60862976Smckusick	ufs_daddr_t blkno, dblk;
60962976Smckusick	int error, snaploc, loc, last;
61062976Smckusick
61162976Smckusick	ip = VTOI(vp);
61262976Smckusick	fs = ip->i_fs;
61362976Smckusick	/*
61462976Smckusick	 * Delete snapshot inode from superblock. Keep list dense.
61562976Smckusick	 */
61662976Smckusick	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
61762976Smckusick		if (fs->fs_snapinum[snaploc] == ip->i_number)
61862976Smckusick			break;
61962976Smckusick	if (snaploc < FSMAXSNAP) {
62062976Smckusick		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
62162976Smckusick			if (fs->fs_snapinum[snaploc] == 0)
62262976Smckusick				break;
62362976Smckusick			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
62462976Smckusick		}
62562976Smckusick		fs->fs_snapinum[snaploc - 1] = 0;
62662976Smckusick	}
62762976Smckusick	/*
62862976Smckusick	 * Delete from incore list.
62962976Smckusick	 * Clear copy-on-write flag if last snapshot.
63062976Smckusick	 */
63162976Smckusick	devvp = ip->i_devvp;
63262976Smckusick	for (xp = VTOI(devvp); xp; xp = xp->i_copyonwrite) {
63362976Smckusick		if (xp->i_copyonwrite != ip)
63462976Smckusick			continue;
63562976Smckusick		xp->i_copyonwrite = ip->i_copyonwrite;
63662976Smckusick		ip->i_copyonwrite = 0;
63762976Smckusick		break;
63862976Smckusick	}
63962976Smckusick	if (xp == 0) {
64062976Smckusick		printf("ffs_snapremove: lost snapshot vnode %d\n",
64162976Smckusick		    ip->i_number);
64262976Smckusick		vref(vp);
64362976Smckusick	}
64462976Smckusick	if (VTOI(devvp)->i_copyonwrite == 0)
64562976Smckusick		devvp->v_flag &= ~VCOPYONWRITE;
64662976Smckusick	/*
64762976Smckusick	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
64862976Smckusick	 * snapshots that want them (see ffs_snapblkfree below).
64962976Smckusick	 */
65062976Smckusick	for (blkno = 1; blkno < NDADDR; blkno++) {
65162976Smckusick		dblk = ip->i_db[blkno];
65262976Smckusick		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP ||
65362976Smckusick		    (dblk == blkstofrags(fs, blkno) &&
65462976Smckusick		     ffs_snapblkfree(ip, dblk, fs->fs_bsize)))
65562976Smckusick			ip->i_db[blkno] = 0;
65662976Smckusick	}
65762976Smckusick	for (blkno = NDADDR; blkno < fs->fs_size; blkno += NINDIR(fs)) {
65862976Smckusick		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
65962976Smckusick		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
66062976Smckusick		if (error)
66162976Smckusick			continue;
66262976Smckusick		if ((last = fs->fs_size - blkno) > NINDIR(fs))
66362976Smckusick			last = NINDIR(fs);
66462976Smckusick		for (loc = 0; loc < last; loc++) {
66562976Smckusick			dblk = ((ufs_daddr_t *)(ibp->b_data))[loc];
66662976Smckusick			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP ||
66762976Smckusick			    (dblk == blkstofrags(fs, blkno) &&
66862976Smckusick			     ffs_snapblkfree(ip, dblk, fs->fs_bsize)))
66962976Smckusick				((ufs_daddr_t *)(ibp->b_data))[loc] = 0;
67062976Smckusick		}
67162976Smckusick		bawrite(ibp);
67262976Smckusick	}
67362976Smckusick	/*
67462976Smckusick	 * Clear snapshot flag and drop reference.
67562976Smckusick	 */
67662976Smckusick	ip->i_flags &= ~(SF_IMMUTABLE | SF_SNAPSHOT);
67762976Smckusick	ip->i_flag |= IN_CHANGE | IN_UPDATE;
67862976Smckusick	vrele(vp);
67962976Smckusick}
68062976Smckusick
/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
69962976Smckusickint
70062976Smckusickffs_snapblkfree(freeip, bno, size)
70162976Smckusick	struct inode *freeip;
70262976Smckusick	ufs_daddr_t bno;
70362976Smckusick	long size;
70462976Smckusick{
70562976Smckusick	struct buf *ibp, *cbp, *savedcbp = 0;
70662976Smckusick	struct fs *fs = freeip->i_fs;
70762976Smckusick	struct proc *p = CURPROC;
70862976Smckusick	struct inode *ip;
70962976Smckusick	struct vnode *vp;
71062976Smckusick	ufs_daddr_t lbn, blkno;
71162976Smckusick	int indiroff = 0, error = 0, claimedblk = 0;
71262976Smckusick
71362976Smckusick	lbn = fragstoblks(fs, bno);
71462976Smckusick	for (ip = VTOI(freeip->i_devvp)->i_copyonwrite; ip;
71562976Smckusick	     ip = ip->i_copyonwrite) {
71662976Smckusick		vp = ITOV(ip);
71762976Smckusick		/*
71862976Smckusick		 * Lookup block being written.
71962976Smckusick		 */
72062976Smckusick		if (lbn < NDADDR) {
72162976Smckusick			blkno = ip->i_db[lbn];
72262976Smckusick		} else {
72362976Smckusick			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
72462976Smckusick			p->p_flag |= P_COWINPROGRESS;
72562976Smckusick			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
72662976Smckusick			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
72762976Smckusick			p->p_flag &= ~P_COWINPROGRESS;
72862976Smckusick			VOP_UNLOCK(vp, 0, p);
72962976Smckusick			if (error)
73062976Smckusick				break;
73162976Smckusick			indiroff = (lbn - NDADDR) % NINDIR(fs);
73262976Smckusick			blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
73362976Smckusick		}
73462976Smckusick		/*
73562976Smckusick		 * Check to see if block needs to be copied.
73662976Smckusick		 */
73762976Smckusick		switch (blkno) {
73862976Smckusick		/*
73962976Smckusick		 * If the snapshot has already copied the block (default),
74062976Smckusick		 * or does not care about the block, it is not needed.
74162976Smckusick		 */
74262976Smckusick		default:
74362976Smckusick		case BLK_NOCOPY:
74462976Smckusick			if (lbn >= NDADDR)
74563788Smckusick				bqrelse(ibp);
74662976Smckusick			continue;
74762976Smckusick		/*
74862976Smckusick		 * No previous snapshot claimed the block, so it will be
74962976Smckusick		 * freed and become a BLK_NOCOPY (don't care) for us.
75062976Smckusick		 */
75162976Smckusick		case BLK_SNAP:
75262976Smckusick			if (claimedblk)
75362976Smckusick				panic("snapblkfree: inconsistent block type");
75462976Smckusick			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
75562976Smckusick			if (lbn < NDADDR) {
75662976Smckusick				ip->i_db[lbn] = BLK_NOCOPY;
75762976Smckusick				ip->i_flag |= IN_CHANGE | IN_UPDATE;
75862976Smckusick			} else {
75962976Smckusick				((ufs_daddr_t *)(ibp->b_data))[indiroff] =
76062976Smckusick				    BLK_NOCOPY;
76162976Smckusick				bdwrite(ibp);
76262976Smckusick			}
76362976Smckusick			VOP_UNLOCK(vp, 0, p);
76462976Smckusick			continue;
76562976Smckusick		/*
76662976Smckusick		 * A block that we map is being freed. If it has not been
76762976Smckusick		 * claimed yet, we will claim or copy it (below).
76862976Smckusick		 */
76962976Smckusick		case 0:
77062976Smckusick			claimedblk = 1;
77162976Smckusick			break;
77262976Smckusick		}
77362976Smckusick		/*
77462976Smckusick		 * If this is a full size block, we will just grab it
77562976Smckusick		 * and assign it to the snapshot inode. Otherwise we
77662976Smckusick		 * will proceed to copy it. See explanation for this
77762976Smckusick		 * routine as to why only a single snapshot needs to
77862976Smckusick		 * claim this block.
77962976Smckusick		 */
78062976Smckusick		if (size == fs->fs_bsize) {
78162976Smckusick#ifdef DEBUG
78262976Smckusick			if (snapdebug)
78362976Smckusick				printf("%s %d lbn %d from inum %d\n",
78462976Smckusick				    "Grabonremove: snapino", ip->i_number, lbn,
78562976Smckusick				    freeip->i_number);
78662976Smckusick#endif
78762976Smckusick			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
78862976Smckusick			if (lbn < NDADDR) {
78962976Smckusick				ip->i_db[lbn] = bno;
79062976Smckusick			} else {
79162976Smckusick				((ufs_daddr_t *)(ibp->b_data))[indiroff] = bno;
79262976Smckusick				bdwrite(ibp);
79362976Smckusick			}
79462976Smckusick			ip->i_blocks += btodb(size);
79562976Smckusick			ip->i_flag |= IN_CHANGE | IN_UPDATE;
79662976Smckusick			VOP_UNLOCK(vp, 0, p);
79762976Smckusick			return (1);
79862976Smckusick		}
79962976Smckusick		if (lbn >= NDADDR)
80063788Smckusick			bqrelse(ibp);
80162976Smckusick		/*
80262976Smckusick		 * Allocate the block into which to do the copy. Note that this
80362976Smckusick		 * allocation will never require any additional allocations for
80462976Smckusick		 * the snapshot inode.
80562976Smckusick		 */
80662976Smckusick		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
80762976Smckusick		p->p_flag |= P_COWINPROGRESS;
80862976Smckusick		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
80962976Smckusick		    fs->fs_bsize, KERNCRED, 0, &cbp);
81062976Smckusick		p->p_flag &= ~P_COWINPROGRESS;
81162976Smckusick		VOP_UNLOCK(vp, 0, p);
81262976Smckusick		if (error)
81362976Smckusick			break;
81462976Smckusick#ifdef DEBUG
81562976Smckusick		if (snapdebug)
81662976Smckusick			printf("%s%d lbn %d for inum %d size %ld to blkno %d\n",
81762976Smckusick			    "Copyonremove: snapino ", ip->i_number, lbn,
81862976Smckusick			    freeip->i_number, size, cbp->b_blkno);
81962976Smckusick#endif
82062976Smckusick		/*
82162976Smckusick		 * If we have already read the old block contents, then
82262976Smckusick		 * simply copy them to the new block.
82362976Smckusick		 */
82462976Smckusick		if (savedcbp != 0) {
82562976Smckusick			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
82662976Smckusick			bawrite(cbp);
82762976Smckusick			continue;
82862976Smckusick		}
82962976Smckusick		/*
83062976Smckusick		 * Otherwise, read the old block contents into the buffer.
83162976Smckusick		 */
83262976Smckusick		if ((error = readblock(cbp, lbn)) != 0)
83362976Smckusick			break;
83462976Smckusick		savedcbp = cbp;
83562976Smckusick	}
83662976Smckusick	if (savedcbp)
83762976Smckusick		bawrite(savedcbp);
83862976Smckusick	/*
83962976Smckusick	 * If we have been unable to allocate a block in which to do
84062976Smckusick	 * the copy, then return non-zero so that the fragment will
84162976Smckusick	 * not be freed. Although space will be lost, the snapshot
84262976Smckusick	 * will stay consistent.
84362976Smckusick	 */
84462976Smckusick	return (error);
84562976Smckusick}
84662976Smckusick
84762976Smckusick/*
84862976Smckusick * Associate snapshot files when mounting.
84962976Smckusick */
85062976Smckusickvoid
85162976Smckusickffs_snapshot_mount(mp)
85262976Smckusick	struct mount *mp;
85362976Smckusick{
85462976Smckusick	struct ufsmount *ump = VFSTOUFS(mp);
85562976Smckusick	struct fs *fs = ump->um_fs;
85662976Smckusick	struct proc *p = CURPROC;
85762976Smckusick	struct inode *ip, **listtailp;
85862976Smckusick	struct vnode *vp;
85962976Smckusick	int error, snaploc, loc;
86062976Smckusick
86162976Smckusick	listtailp = &VTOI(ump->um_devvp)->i_copyonwrite;
86262976Smckusick	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
86362976Smckusick		if (fs->fs_snapinum[snaploc] == 0)
86462976Smckusick			return;
86562976Smckusick		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], &vp)) != 0){
86662976Smckusick			printf("ffs_snapshot_mount: vget failed %d\n", error);
86762976Smckusick			continue;
86862976Smckusick		}
86962976Smckusick		ip = VTOI(vp);
87062976Smckusick		if ((ip->i_flags & SF_SNAPSHOT) == 0) {
87162976Smckusick			printf("ffs_snapshot_mount: non-snapshot inode %d\n",
87262976Smckusick			    fs->fs_snapinum[snaploc]);
87362976Smckusick			vput(vp);
87462976Smckusick			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
87562976Smckusick				if (fs->fs_snapinum[loc] == 0)
87662976Smckusick					break;
87762976Smckusick				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
87862976Smckusick			}
87962976Smckusick			fs->fs_snapinum[loc - 1] = 0;
88062976Smckusick			snaploc--;
88162976Smckusick			continue;
88262976Smckusick		}
88362976Smckusick		if (ip->i_copyonwrite != 0)
88462976Smckusick			panic("ffs_snapshot_mount: %d already on list",
88562976Smckusick			    ip->i_number);
88662976Smckusick		*listtailp = ip;
88762976Smckusick		listtailp = &ip->i_copyonwrite;
88862976Smckusick		vp->v_flag |= VSYSTEM;
88962976Smckusick		VOP_UNLOCK(vp, 0, p);
89062976Smckusick		ump->um_devvp->v_flag |= VCOPYONWRITE;
89162976Smckusick	}
89262976Smckusick}
89362976Smckusick
89462976Smckusick/*
89562976Smckusick * Disassociate snapshot files when unmounting.
89662976Smckusick */
89762976Smckusickvoid
89862976Smckusickffs_snapshot_unmount(mp)
89962976Smckusick	struct mount *mp;
90062976Smckusick{
90162976Smckusick	struct ufsmount *ump = VFSTOUFS(mp);
90262976Smckusick	struct inode *devip = VTOI(ump->um_devvp);
90362976Smckusick	struct inode *xp;
90462976Smckusick
90562976Smckusick	while ((xp = devip->i_copyonwrite) != 0) {
90662976Smckusick		devip->i_copyonwrite = xp->i_copyonwrite;
90762976Smckusick		xp->i_copyonwrite = 0;
90862976Smckusick		vrele(ITOV(xp));
90962976Smckusick	}
91062976Smckusick	ump->um_devvp->v_flag &= ~VCOPYONWRITE;
91162976Smckusick}
91262976Smckusick
91362976Smckusick/*
91462976Smckusick * Check for need to copy block that is about to be written,
91562976Smckusick * copying the block if necessary.
91662976Smckusick */
91762976Smckusickint
91862976Smckusickffs_copyonwrite(ap)
91962976Smckusick	struct vop_copyonwrite_args /* {
92062976Smckusick		struct vnode *a_vp;
92162976Smckusick		struct buf *a_bp;
92262976Smckusick	} */ *ap;
92362976Smckusick{
92462976Smckusick	struct buf *ibp, *cbp, *savedcbp = 0, *bp = ap->a_bp;
92562976Smckusick	struct fs *fs = VTOI(bp->b_vp)->i_fs;
92662976Smckusick	struct proc *p = CURPROC;
92762976Smckusick	struct inode *ip;
92862976Smckusick	struct vnode *vp;
92962976Smckusick	ufs_daddr_t lbn, blkno;
93062976Smckusick	int indiroff, error = 0;
93162976Smckusick
93262976Smckusick	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
93362976Smckusick	if (p->p_flag & P_COWINPROGRESS)
93462976Smckusick		panic("ffs_copyonwrite: recursive call");
93562976Smckusick	for (ip = VTOI(ap->a_vp)->i_copyonwrite; ip; ip = ip->i_copyonwrite) {
93662976Smckusick		vp = ITOV(ip);
93762976Smckusick		/*
93862976Smckusick		 * We ensure that everything of our own that needs to be
93962976Smckusick		 * copied will be done at the time that ffs_snapshot is
94062976Smckusick		 * called. Thus we can skip the check here which can
94162976Smckusick		 * deadlock in doing the lookup in VOP_BALLOC.
94262976Smckusick		 */
94362976Smckusick		if (bp->b_vp == vp)
94462976Smckusick			continue;
94562976Smckusick		/*
94663788Smckusick		 * Check to see if block needs to be copied. We have to
94763788Smckusick		 * be able to do the VOP_BALLOC without blocking, otherwise
94863788Smckusick		 * we may get in a deadlock with another process also
94963788Smckusick		 * trying to allocate. If we find outselves unable to
95063788Smckusick		 * get the buffer lock, we unlock the snapshot vnode,
95163788Smckusick		 * sleep briefly, and try again.
95262976Smckusick		 */
95363788Smckusickretry:
95463788Smckusick		vn_lock(vp, LK_SHARED | LK_RETRY, p);
95562976Smckusick		if (lbn < NDADDR) {
95662976Smckusick			blkno = ip->i_db[lbn];
95762976Smckusick		} else {
95862976Smckusick			p->p_flag |= P_COWINPROGRESS;
95962976Smckusick			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
96063788Smckusick			   fs->fs_bsize, KERNCRED, B_METAONLY | B_NOWAIT, &ibp);
96162976Smckusick			p->p_flag &= ~P_COWINPROGRESS;
96263788Smckusick			if (error) {
96363788Smckusick				VOP_UNLOCK(vp, 0, p);
96463788Smckusick				if (error != EWOULDBLOCK)
96563788Smckusick					break;
96663788Smckusick				tsleep(vp, p->p_usrpri, "nap", 1);
96763788Smckusick				goto retry;
96863788Smckusick			}
96962976Smckusick			indiroff = (lbn - NDADDR) % NINDIR(fs);
97062976Smckusick			blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
97163788Smckusick			bqrelse(ibp);
97262976Smckusick		}
97362976Smckusick#ifdef DIAGNOSTIC
97462976Smckusick		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
97562976Smckusick			panic("ffs_copyonwrite: bad copy block");
97662976Smckusick#endif
97763788Smckusick		if (blkno != 0) {
97863788Smckusick			VOP_UNLOCK(vp, 0, p);
97962976Smckusick			continue;
98063788Smckusick		}
98162976Smckusick		/*
98262976Smckusick		 * Allocate the block into which to do the copy. Note that this
98362976Smckusick		 * allocation will never require any additional allocations for
98462976Smckusick		 * the snapshot inode.
98562976Smckusick		 */
98662976Smckusick		p->p_flag |= P_COWINPROGRESS;
98762976Smckusick		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
98863788Smckusick		    fs->fs_bsize, KERNCRED, B_NOWAIT, &cbp);
98962976Smckusick		p->p_flag &= ~P_COWINPROGRESS;
99062976Smckusick		VOP_UNLOCK(vp, 0, p);
99163788Smckusick		if (error) {
99263788Smckusick			if (error != EWOULDBLOCK)
99363788Smckusick				break;
99463788Smckusick			tsleep(vp, p->p_usrpri, "nap", 1);
99563788Smckusick			goto retry;
99663788Smckusick		}
99762976Smckusick#ifdef DEBUG
99862976Smckusick		if (snapdebug) {
99962976Smckusick			printf("Copyonwrite: snapino %d lbn %d for ",
100062976Smckusick			    ip->i_number, lbn);
100162976Smckusick			if (bp->b_vp == ap->a_vp)
100262976Smckusick				printf("fs metadata");
100362976Smckusick			else
100462976Smckusick				printf("inum %d", VTOI(bp->b_vp)->i_number);
100562976Smckusick			printf(" lblkno %d to blkno %d\n", bp->b_lblkno,
100662976Smckusick			    cbp->b_blkno);
100762976Smckusick		}
100862976Smckusick#endif
100962976Smckusick		/*
101062976Smckusick		 * If we have already read the old block contents, then
101162976Smckusick		 * simply copy them to the new block.
101262976Smckusick		 */
101362976Smckusick		if (savedcbp != 0) {
101462976Smckusick			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
101562976Smckusick			bawrite(cbp);
101662976Smckusick			continue;
101762976Smckusick		}
101862976Smckusick		/*
101962976Smckusick		 * Otherwise, read the old block contents into the buffer.
102062976Smckusick		 */
102162976Smckusick		if ((error = readblock(cbp, lbn)) != 0)
102262976Smckusick			break;
102362976Smckusick		savedcbp = cbp;
102462976Smckusick	}
102562976Smckusick	if (savedcbp)
102662976Smckusick		bawrite(savedcbp);
102762976Smckusick	return (error);
102862976Smckusick}
102962976Smckusick
103062976Smckusick/*
103162976Smckusick * Read the specified block into the given buffer.
103262976Smckusick * Much of this boiler-plate comes from bwrite().
103362976Smckusick */
103462976Smckusickstatic int
103562976Smckusickreadblock(bp, lbn)
103662976Smckusick	struct buf *bp;
103762976Smckusick	daddr_t lbn;
103862976Smckusick{
103962976Smckusick	struct uio auio;
104062976Smckusick	struct iovec aiov;
104162976Smckusick	struct proc *p = CURPROC;
104262976Smckusick	struct inode *ip = VTOI(bp->b_vp);
104362976Smckusick
104462976Smckusick	aiov.iov_base = bp->b_data;
104562976Smckusick	aiov.iov_len = bp->b_bcount;
104662976Smckusick	auio.uio_iov = &aiov;
104762976Smckusick	auio.uio_iovcnt = 1;
104862976Smckusick	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
104962976Smckusick	auio.uio_resid = bp->b_bcount;
105062976Smckusick	auio.uio_rw = UIO_READ;
105162976Smckusick	auio.uio_segflg = UIO_SYSSPACE;
105262976Smckusick	auio.uio_procp = p;
105362976Smckusick	return (physio(ip->i_devvp->v_rdev, &auio, 0));
105462976Smckusick}
1055