/*
 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
 *
 * Further information about snapshots can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
 * $FreeBSD: head/sys/ufs/ffs/ffs_snapshot.c 113376 2003-04-12 01:05:19Z jeff $
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/sched.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#define KERNCRED thread0.td_ucred
#define DEBUG 1

static int cgaccount(int, struct vnode *, struct buf *, int);
static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int indiracct_ufs1(struct vnode *, struct vnode *, int,
    ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int indiracct_ufs2(struct vnode *, struct vnode *, int,
    ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int ffs_copyonwrite(struct vnode *, struct buf *);
static int readblock(struct buf *, ufs2_daddr_t);

/*
 * To ensure the consistency of snapshots across crashes, we must
 * synchronously write out copied blocks before allowing the
 * originals to be modified. Because of the rather severe speed
 * penalty that this imposes, the following flag allows this
 * crash persistence to be disabled.
 */
int dopersistence = 0;

#ifdef DEBUG
#include <sys/sysctl.h>
SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
int snapdebug = 0;
SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
int collectsnapstats = 0;
SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
	0, "");
#endif /* DEBUG */
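/*
 * These knobs are exported through the _debug sysctl tree above, so
 * (assuming the stock sysctl(8) utility) crash persistence would
 * typically be enabled from userland with
 *
 *	sysctl debug.dopersistence=1
 *
 * and per-block tracing with "sysctl debug.snapdebug=1".
 */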

/*
 * Create a snapshot file and initialize it for the filesystem.
 */
int
ffs_snapshot(mp, snapfile)
	struct mount *mp;
	char *snapfile;
{
	ufs2_daddr_t numblks, blkno, *blkp, *snapblklist;
	int error, cg, snaploc;
	int i, size, len, loc;
	int flag = mp->mnt_flag;
	struct timespec starttime = {0, 0}, endtime;
	char saved_nice = 0;
	long redo = 0, snaplistsize = 0;
	int32_t *lp;
	void *space;
	struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs;
	struct snaphead *snaphead;
	struct thread *td = curthread;
	struct inode *ip, *xp;
	struct buf *bp, *nbp, *ibp, *sbp = NULL;
	struct nameidata nd;
	struct mount *wrtmp;
	struct vattr vat;
	struct vnode *vp, *xvp, *nvp, *devvp;
	struct uio auio;
	struct iovec aiov;

	/*
	 * Need to serialize access to snapshot code per filesystem.
	 */
	/*
	 * Assign a snapshot slot in the superblock.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	/*
	 * Create the snapshot file.
	 */
restart:
	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, td);
	if ((error = namei(&nd)) != 0)
		return (error);
	if (nd.ni_vp != NULL) {
		vput(nd.ni_vp);
		error = EEXIST;
	}
	if (nd.ni_dvp->v_mount != mp)
		error = EXDEV;
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_dvp == nd.ni_vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		return (error);
	}
	VATTR_NULL(&vat);
	vat.va_type = VREG;
	vat.va_mode = S_IRUSR;
	vat.va_vaflags |= VA_EXCLUSIVE;
	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
		wrtmp = NULL;
	if (wrtmp != mp)
		panic("ffs_snapshot: mount mismatch");
	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &wrtmp,
		    V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE);
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
	vput(nd.ni_dvp);
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vn_finished_write(wrtmp);
		return (error);
	}
	vp = nd.ni_vp;
	ip = VTOI(vp);
	devvp = ip->i_devvp;
	/*
	 * Allocate and copy the last block contents so as to be able
	 * to set size to that of the filesystem.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
	if (error)
		goto out;
	ip->i_size = lblktosize(fs, (off_t)numblks);
	DIP(ip, i_size) = ip->i_size;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	if ((error = readblock(bp, numblks - 1)) != 0)
		goto out;
	bawrite(bp);
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks and mark all of them as not
	 * needing to be copied.
	 */
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp);
		if (error)
			goto out;
		bawrite(ibp);
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
	    0, &nbp);
	if (error)
		goto out;
	bawrite(nbp);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Allocate all cylinder group blocks.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Copy all the cylinder group maps. Although the
	 * filesystem is still active, we hope that only a few
	 * cylinder groups will change between now and when we
	 * suspend operations. Thus, we will be able to quickly
	 * touch up the few cylinder groups that changed during
	 * the suspension period.
	 */
	len = howmany(fs->fs_ncg, NBBY);
	MALLOC(fs->fs_active, int *, len, M_DEVBUF, M_WAITOK);
	bzero(fs->fs_active, len);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		error = cgaccount(cg, vp, nbp, 1);
		bawrite(nbp);
		if (error)
			goto out;
	}
	/*
	 * Change inode to snapshot type file.
	 */
	ip->i_flags |= SF_SNAPSHOT;
	DIP(ip, i_flags) = ip->i_flags;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Ensure that the snapshot is completely on disk.
	 * Since we have marked it as a snapshot it is safe to
	 * unlock it as no process will be allowed to write to it.
	 */
	if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td)) != 0)
		goto out;
	VOP_UNLOCK(vp, 0, td);
	/*
	 * All allocations are done, so we can now snapshot the system.
	 *
	 * Rescind nice scheduling while running with the filesystem suspended.
	 */
	if (td->td_ksegrp->kg_nice > 0) {
		saved_nice = td->td_ksegrp->kg_nice;
		sched_nice(td->td_ksegrp, 0);
	}
	/*
	 * Suspend operation on filesystem.
	 */
	for (;;) {
		vn_finished_write(wrtmp);
		if ((error = vfs_write_suspend(vp->v_mount)) != 0) {
			vn_start_write(NULL, &wrtmp, V_WAIT);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			goto out;
		}
		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
			break;
		vn_start_write(NULL, &wrtmp, V_WAIT);
	}
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if (collectsnapstats)
		nanotime(&starttime);
	/*
	 * First, copy all the cylinder group maps that have changed.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
			continue;
		redo++;
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out1;
		error = cgaccount(cg, vp, nbp, 2);
		bawrite(nbp);
		if (error)
			goto out1;
	}
	/*
	 * Grab a copy of the superblock and its summary information.
	 * We delay writing it until the suspension is released below.
	 */
	error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize,
	    KERNCRED, &sbp);
	if (error) {
		brelse(sbp);
		sbp = NULL;
		goto out1;
	}
	loc = blkoff(fs, fs->fs_sblockloc);
	copy_fs = (struct fs *)(sbp->b_data + loc);
	bcopy(fs, copy_fs, fs->fs_sbsize);
	if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
		copy_fs->fs_clean = 1;
	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
	if (fs->fs_sbsize < size)
		bzero(&sbp->b_data[loc + fs->fs_sbsize], size - fs->fs_sbsize);
	size = blkroundup(fs, fs->fs_cssize);
	if (fs->fs_contigsumsize > 0)
		size += fs->fs_ncg * sizeof(int32_t);
	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
	copy_fs->fs_csp = space;
	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
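	/*
	 * The cast-as-lvalue below is an old gcc extension; it just
	 * advances the void pointer fs_cssize bytes into the
	 * allocation.
	 */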
	(char *)space += fs->fs_cssize;
	loc = howmany(fs->fs_cssize, fs->fs_fsize);
	i = fs->fs_frag - loc % fs->fs_frag;
	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
	if (len > 0) {
		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
		    len, KERNCRED, &bp)) != 0) {
			brelse(bp);
			free(copy_fs->fs_csp, M_UFSMNT);
			bawrite(sbp);
			sbp = NULL;
			goto out1;
		}
		bcopy(bp->b_data, space, (u_int)len);
		(char *)space += len;
		bp->b_flags |= B_INVAL | B_NOCACHE;
		brelse(bp);
	}
	if (fs->fs_contigsumsize > 0) {
		copy_fs->fs_maxcluster = lp = space;
		for (i = 0; i < fs->fs_ncg; i++)
			*lp++ = fs->fs_contigsumsize;
	}
	/*
	 * We must check for active files that have been unlinked
	 * (e.g., with a zero link count). We have to expunge all
	 * trace of these files from the snapshot so that they are
	 * not reclaimed prematurely by fsck or unnecessarily dumped.
	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
	 * spec_strategy about writing on a suspended filesystem.
	 * Note that we skip unlinked snapshot files as they will
	 * be handled separately below.
	 *
	 * We also calculate the needed size for the snapshot list.
	 */
	snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
	mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
	mtx_lock(&mntvnode_mtx);
loop:
	for (xvp = TAILQ_FIRST(&mp->mnt_nvnodelist); xvp; xvp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (xvp->v_mount != mp)
			goto loop;
		nvp = TAILQ_NEXT(xvp, v_nmntvnodes);
		mtx_unlock(&mntvnode_mtx);
		mp_fixme("Unlocked GETATTR.");
		if (vrefcnt(xvp) == 0 || xvp->v_type == VNON ||
		    (VTOI(xvp)->i_flags & SF_SNAPSHOT) ||
		    (VOP_GETATTR(xvp, &vat, td->td_ucred, td) == 0 &&
		    vat.va_nlink > 0)) {
			mtx_lock(&mntvnode_mtx);
			continue;
		}
		if (snapdebug)
			vprint("ffs_snapshot: busy vnode", xvp);
		if (vn_lock(xvp, LK_EXCLUSIVE, td) != 0)
			goto loop;
		xp = VTOI(xvp);
		if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
			VOP_UNLOCK(xvp, 0, td);
			continue;
		}
		/*
		 * If there is a fragment, clear it here.
		 */
		blkno = 0;
		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
		if (loc < NDADDR) {
			len = fragroundup(fs, blkoff(fs, xp->i_size));
			if (len < fs->fs_bsize) {
				ffs_blkfree(copy_fs, vp, DIP(xp, i_db[loc]),
				    len, xp->i_number);
				blkno = DIP(xp, i_db[loc]);
				DIP(xp, i_db[loc]) = 0;
			}
		}
		snaplistsize += 1;
		if (xp->i_ump->um_fstype == UFS1)
			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
			    BLK_NOCOPY);
		else
			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
			    BLK_NOCOPY);
		if (blkno)
			DIP(xp, i_db[loc]) = blkno;
		if (!error)
			error = ffs_freefile(copy_fs, vp, xp->i_number,
			    xp->i_mode);
		VOP_UNLOCK(xvp, 0, td);
		if (error) {
			free(copy_fs->fs_csp, M_UFSMNT);
			bawrite(sbp);
			sbp = NULL;
			goto out1;
		}
		mtx_lock(&mntvnode_mtx);
	}
	mtx_unlock(&mntvnode_mtx);
	/*
	 * If there already exist snapshots on this filesystem, grab a
	 * reference to their shared lock. If this is the first snapshot
	 * on this filesystem, we need to allocate a lock for the snapshots
	 * to share. In either case, acquire the snapshot lock and give
	 * up our original private lock.
	 */
	VI_LOCK(devvp);
	snaphead = &devvp->v_rdev->si_snapshots;
	if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
		VI_LOCK(vp);
		vp->v_vnlock = ITOV(xp)->v_vnlock;
		VI_UNLOCK(devvp);
	} else {
		struct lock *lkp;

		VI_UNLOCK(devvp);
		MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT,
		    M_WAITOK);
		lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
		    LK_CANRECURSE | LK_NOPAUSE);
		VI_LOCK(vp);
		vp->v_vnlock = lkp;
	}
	vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
	transferlockers(&vp->v_lock, vp->v_vnlock);
	lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
	/*
	 * If this is the first snapshot on this filesystem, then we need
	 * to allocate the space for the list of preallocated snapshot blocks.
	 * This list will be refined below, but this preliminary one will
	 * keep us out of deadlock until the full one is ready.
	 */
	if (xp == NULL) {
		MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t),
		    M_UFSMNT, M_WAITOK);
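		/*
		 * Entry 0 of the list records its length (counting
		 * itself); the preallocated block numbers follow in
		 * ascending order starting at entry 1.
		 */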
		blkp = &snapblklist[1];
		*blkp++ = lblkno(fs, fs->fs_sblockloc);
		blkno = fragstoblks(fs, fs->fs_csaddr);
		for (cg = 0; cg < fs->fs_ncg; cg++) {
			if (fragstoblks(fs, cgtod(fs, cg)) > blkno)
				break;
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
		}
		len = howmany(fs->fs_cssize, fs->fs_bsize);
		for (loc = 0; loc < len; loc++)
			*blkp++ = blkno + loc;
		for (; cg < fs->fs_ncg; cg++)
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
		snapblklist[0] = blkp - snapblklist;
		VI_LOCK(devvp);
		if (devvp->v_rdev->si_snapblklist != NULL)
			panic("ffs_snapshot: non-empty list");
		devvp->v_rdev->si_snapblklist = snapblklist;
		devvp->v_rdev->si_snaplistsize = blkp - snapblklist;
		VI_UNLOCK(devvp);
	}
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	VI_LOCK(devvp);
	fs->fs_snapinum[snaploc] = ip->i_number;
	if (ip->i_nextsnap.tqe_prev != 0)
		panic("ffs_snapshot: %d already on list", ip->i_number);
	TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
	devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
	devvp->v_vflag |= VV_COPYONWRITE;
	VI_UNLOCK(devvp);
	ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
	vp->v_vflag |= VV_SYSTEM;
out1:
	/*
	 * Resume operation on filesystem.
	 */
	vfs_write_resume(vp->v_mount);
	vn_start_write(NULL, &wrtmp, V_WAIT);
	if (collectsnapstats && starttime.tv_sec > 0) {
		nanotime(&endtime);
		timespecsub(&endtime, &starttime);
		printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
		    vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec,
		    endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
	}
	if (sbp == NULL)
		goto out;
	/*
	 * Copy allocation information from all the snapshots in
	 * this snapshot and then expunge them from its view.
	 */
	snaphead = &devvp->v_rdev->si_snapshots;
	TAILQ_FOREACH(xp, snaphead, i_nextsnap) {
		if (xp == ip)
			break;
		if (xp->i_ump->um_fstype == UFS1)
			error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
			    BLK_SNAP);
		else
			error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
			    BLK_SNAP);
		if (error) {
			fs->fs_snapinum[snaploc] = 0;
			goto done;
		}
	}
	/*
	 * Allocate space for the full list of preallocated snapshot blocks.
	 */
	MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t),
	    M_UFSMNT, M_WAITOK);
	ip->i_snapblklist = &snapblklist[1];
	/*
	 * Expunge the blocks used by the snapshots from the set of
	 * blocks marked as used in the snapshot bitmaps. Also, collect
	 * the list of allocated blocks in i_snapblklist.
	 */
	if (ip->i_ump->um_fstype == UFS1)
		error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP);
	else
		error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP);
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
		FREE(snapblklist, M_UFSMNT);
		goto done;
	}
	if (snaplistsize < ip->i_snapblklist - snapblklist)
		panic("ffs_snapshot: list too small");
	snaplistsize = ip->i_snapblklist - snapblklist;
	snapblklist[0] = snaplistsize;
	ip->i_snapblklist = 0;
	/*
	 * Write out the list of allocated blocks to the end of the snapshot.
	 */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = (void *)snapblklist;
	aiov.iov_len = snaplistsize * sizeof(daddr_t);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset = ip->i_size;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		fs->fs_snapinum[snaploc] = 0;
		FREE(snapblklist, M_UFSMNT);
		goto done;
	}
	/*
	 * Write the superblock and its summary information
	 * to the snapshot.
	 */
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	space = copy_fs->fs_csp;
	for (loc = 0; loc < len; loc++) {
		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
		if (error) {
			brelse(nbp);
			fs->fs_snapinum[snaploc] = 0;
			FREE(snapblklist, M_UFSMNT);
			goto done;
		}
		bcopy(space, nbp->b_data, fs->fs_bsize);
		space = (char *)space + fs->fs_bsize;
		bawrite(nbp);
	}
	/*
	 * As this is the newest list, it is the most inclusive, so
	 * should replace the previous list.
	 */
	VI_LOCK(devvp);
	space = devvp->v_rdev->si_snapblklist;
	devvp->v_rdev->si_snapblklist = snapblklist;
	devvp->v_rdev->si_snaplistsize = snaplistsize;
	if (space != NULL)
		FREE(space, M_UFSMNT);
	VI_UNLOCK(devvp);
done:
	free(copy_fs->fs_csp, M_UFSMNT);
	bawrite(sbp);
out:
	if (saved_nice > 0)
		sched_nice(td->td_ksegrp, saved_nice);
	if (fs->fs_active != 0) {
		FREE(fs->fs_active, M_DEVBUF);
		fs->fs_active = 0;
	}
	mp->mnt_flag = flag;
	if (error)
		(void) UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
	(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	if (error)
		vput(vp);
	else
		VOP_UNLOCK(vp, 0, td);
	vn_finished_write(wrtmp);
	return (error);
}

/*
 * Copy a cylinder group map. All the unallocated blocks are marked
 * BLK_NOCOPY so that the snapshot knows that it need not copy them
 * if they are later written. If passno is one, then this is a first
 * pass, so only setting needs to be done. If passno is 2, then this
 * is a revision to a previous pass which must be undone as the
 * replacement pass is done.
 */
static int
cgaccount(cg, vp, nbp, passno)
	int cg;
	struct vnode *vp;
	struct buf *nbp;
	int passno;
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	ufs2_daddr_t base, numblks;
	int error, len, loc, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
		(int)fs->fs_cgsize, KERNCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return (EIO);
	}
	atomic_set_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg));
	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
	if (fs->fs_cgsize < fs->fs_bsize)
		bzero(&nbp->b_data[fs->fs_cgsize],
		    fs->fs_bsize - fs->fs_cgsize);
	if (passno == 2)
		nbp->b_flags |= B_VALIDSUSPWRT;
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cg * fs->fs_fpg / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	if (base < NDADDR) {
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				DIP(ip, i_db[loc]) = BLK_NOCOPY;
			else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY)
				DIP(ip, i_db[loc]) = 0;
			else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY)
				panic("ffs_snapshot: lost direct block");
		}
	}
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
	if (error) {
		brelse(bp);
		return (error);
	}
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			if (passno == 2)
				ibp->b_flags |= B_VALIDSUSPWRT;
			bawrite(ibp);
			error = UFS_BALLOC(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error) {
				brelse(bp);
				return (error);
			}
			indiroff = 0;
		}
		if (ip->i_ump->um_fstype == UFS1) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
			else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
			else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				panic("ffs_snapshot: lost indirect block");
			continue;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
		else if (passno == 2 &&
		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
		else if (passno == 1 &&
		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
			panic("ffs_snapshot: lost indirect block");
	}
	bqrelse(bp);
	if (passno == 2)
		ibp->b_flags |= B_VALIDSUSPWRT;
	bdwrite(ibp);
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct inode *cancelip;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs1_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = VTOI(snapvp)->i_din1->di_db[lbn];
	} else {
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	if (blkno != 0) {
		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
			return (error);
	} else {
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (error)
			return (error);
		if ((error = readblock(bp, lbn)) != 0)
			return (error);
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs1_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
	    &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
	    &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype)))
		return (error);
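	/*
	 * Indirect blocks live at the negative logical block numbers
	 * assigned by ufs_getlbns(); lbn walks those metadata blocks
	 * while rlbn tracks the first data block that each level of
	 * indirection maps.
	 */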
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
		    cancelip->i_din1->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
	    blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs1_daddr_t blkno;
	ufs_lbn_t lbn;
	ufs_lbn_t rlbn;
	ufs_lbn_t remblks;
	ufs_lbn_t blksperindir;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs1_daddr_t last, *bap;
	struct buf *bp;

	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs1_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_din1->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs1_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;
{
	ufs1_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
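		/*
		 * A claimed block is recorded under its own logical
		 * block number (see the ffs_snapblkfree comment below),
		 * so convert that back to a fragment address before
		 * freeing it.
		 */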
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct inode *cancelip;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs2_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = VTOI(snapvp)->i_din2->di_db[lbn];
	} else {
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	if (blkno != 0) {
		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
			return (error);
	} else {
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (error)
			return (error);
		if ((error = readblock(bp, lbn)) != 0)
			return (error);
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs2_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
	    &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0],
	    &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
		    cancelip->i_din2->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
	    blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs2_daddr_t blkno;
	ufs_lbn_t lbn;
	ufs_lbn_t rlbn;
	ufs_lbn_t remblks;
	ufs_lbn_t blksperindir;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t last, *bap;
	struct buf *bp;

	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs2_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_din2->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs2_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;
{
	ufs2_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(ip)
	struct inode *ip;
{
	struct inode *xp;
	struct fs *fs;
	int snaploc;

	/*
	 * Find snapshot in incore list.
	 */
	TAILQ_FOREACH(xp, &ip->i_devvp->v_rdev->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	if (xp != NULL)
		vrele(ITOV(ip));
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %d\n",
		    ip->i_number);
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip;
	struct vnode *devvp;
	struct lock *lkp;
	struct buf *ibp;
	struct fs *fs;
	struct thread *td = curthread;
	ufs2_daddr_t numblks, blkno, dblk, *snapblklist;
	int error, loc, last;

	ip = VTOI(vp);
	fs = ip->i_fs;
	devvp = ip->i_devvp;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		VI_LOCK(devvp);
		lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE,
		    VI_MTX(devvp), td);
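		/*
		 * lockmgr() with LK_INTERLOCK drops the interlock it
		 * is passed, so reacquire it before editing the
		 * device's snapshot list.
		 */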
		VI_LOCK(devvp);
		TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		lockmgr(lkp, LK_RELEASE, NULL, td);
		if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) != 0) {
			VI_UNLOCK(devvp);
		} else {
			snapblklist = devvp->v_rdev->si_snapblklist;
			devvp->v_rdev->si_snapblklist = 0;
			devvp->v_rdev->si_snaplistsize = 0;
			devvp->v_rdev->si_copyonwrite = 0;
			devvp->v_vflag &= ~VV_COPYONWRITE;
			lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td);
			lockmgr(lkp, LK_RELEASE, NULL, td);
			lockdestroy(lkp);
			FREE(lkp, M_UFSMNT);
			FREE(snapblklist, M_UFSMNT);
		}
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = DIP(ip, i_db[blkno]);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			DIP(ip, i_db[blkno]) = 0;
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			DIP(ip, i_blocks) -= btodb(fs->fs_bsize);
			DIP(ip, i_db[blkno]) = 0;
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			if (ip->i_ump->um_fstype == UFS1) {
				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
				else if ((dblk == blkstofrags(fs, blkno) &&
				     ffs_snapblkfree(fs, ip->i_devvp, dblk,
				     fs->fs_bsize, ip->i_number))) {
					ip->i_din1->di_blocks -=
					    btodb(fs->fs_bsize);
					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
				}
				continue;
			}
			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			else if ((dblk == blkstofrags(fs, blkno) &&
			     ffs_snapblkfree(fs, ip->i_devvp, dblk,
			     fs->fs_bsize, ip->i_number))) {
				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP(ip, i_flags) = ip->i_flags;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
1483 * image. When deleting a snapshot file (see ffs_snapremove above), we
1484 * must push any of these claimed blocks to one of the other snapshots
1485 * that maps it. These claimed blocks are easily identified as they will
1486 * have a block number equal to their logical block number within the
1487 * snapshot. A copied block can never have this property because they
1488 * must always have been allocated from a BLK_NOCOPY location.
1489 */
1490int
1491ffs_snapblkfree(fs, devvp, bno, size, inum)
1492	struct fs *fs;
1493	struct vnode *devvp;
1494	ufs2_daddr_t bno;
1495	long size;
1496	ino_t inum;
1497{
1498	struct buf *ibp, *cbp, *savedcbp = 0;
1499	struct thread *td = curthread;
1500	struct inode *ip;
1501	struct vnode *vp = NULL;
1502	ufs_lbn_t lbn;
1503	ufs2_daddr_t blkno;
1504	int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0;
1505	struct snaphead *snaphead;
1506
1507	lbn = fragstoblks(fs, bno);
1508retry:
1509	VI_LOCK(devvp);
1510	snaphead = &devvp->v_rdev->si_snapshots;
1511	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
1512		vp = ITOV(ip);
1513		/*
1514		 * Lookup block being written.
1515		 */
1516		if (lbn < NDADDR) {
1517			blkno = DIP(ip, i_db[lbn]);
1518		} else {
1519			if (snapshot_locked == 0 &&
1520			    lockmgr(vp->v_vnlock,
1521			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
1522			      VI_MTX(devvp), td) != 0)
1523				goto retry;
1524			snapshot_locked = 1;
1525			td->td_proc->p_flag |= P_COWINPROGRESS;
1526			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
1527			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
1528			td->td_proc->p_flag &= ~P_COWINPROGRESS;
1529			if (error)
1530				break;
1531			indiroff = (lbn - NDADDR) % NINDIR(fs);
1532			if (ip->i_ump->um_fstype == UFS1)
1533				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
1534			else
1535				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
1536		}
1537		/*
1538		 * Check to see if block needs to be copied.
1539		 */
1540		if (blkno == 0) {
1541			/*
1542			 * A block that we map is being freed. If it has not
1543			 * been claimed yet, we will claim or copy it (below).
1544			 */
1545			claimedblk = 1;
1546		} else if (blkno == BLK_SNAP) {
1547			/*
1548			 * No previous snapshot claimed the block,
1549			 * so it will be freed and become a BLK_NOCOPY
1550			 * (don't care) for us.
1551			 */
1552			if (claimedblk)
1553				panic("snapblkfree: inconsistent block type");
1554			if (snapshot_locked == 0 &&
1555			    lockmgr(vp->v_vnlock,
1556			      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
1557			      VI_MTX(devvp), td) != 0) {
1558				if (lbn >= NDADDR)
1559					bqrelse(ibp);
1560				vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
1561				goto retry;
1562			}
1563			snapshot_locked = 1;
1564			if (lbn < NDADDR) {
1565				DIP(ip, i_db[lbn]) = BLK_NOCOPY;
1566				ip->i_flag |= IN_CHANGE | IN_UPDATE;
1567			} else if (ip->i_ump->um_fstype == UFS1) {
1568				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
1569				    BLK_NOCOPY;
1570				bdwrite(ibp);
1571			} else {
1572				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
1573				    BLK_NOCOPY;
1574				bdwrite(ibp);
1575			}
1576			continue;
1577		} else /* BLK_NOCOPY or default */ {
1578			/*
1579			 * If the snapshot has already copied the block
1580			 * (default), or does not care about the block,
1581			 * it is not needed.
1582			 */
1583			if (lbn >= NDADDR)
1584				bqrelse(ibp);
1585			continue;
1586		}
1587		/*
1588		 * If this is a full size block, we will just grab it
1589		 * and assign it to the snapshot inode. Otherwise we
1590		 * will proceed to copy it. See explanation for this
1591		 * routine as to why only a single snapshot needs to
1592		 * claim this block.
1593		 */
1594		if (snapshot_locked == 0 &&
1595		    lockmgr(vp->v_vnlock,
1596		      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
1597		      VI_MTX(devvp), td) != 0) {
1598			if (lbn >= NDADDR)
1599				bqrelse(ibp);
1600			vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
1601			goto retry;
1602		}
1603		snapshot_locked = 1;
1604		if (size == fs->fs_bsize) {
1605#ifdef DEBUG
1606			if (snapdebug)
1607				printf("%s %d lbn %jd from inum %d\n",
1608				    "Grabonremove: snapino", ip->i_number,
1609				    (intmax_t)lbn, inum);
1610#endif
1611			if (lbn < NDADDR) {
1612				DIP(ip, i_db[lbn]) = bno;
1613			} else if (ip->i_ump->um_fstype == UFS1) {
1614				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
1615				bdwrite(ibp);
1616			} else {
1617				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
1618				bdwrite(ibp);
1619			}
1620			DIP(ip, i_blocks) += btodb(size);
1621			ip->i_flag |= IN_CHANGE | IN_UPDATE;
1622			VOP_UNLOCK(vp, 0, td);
1623			return (1);
1624		}
1625		if (lbn >= NDADDR)
1626			bqrelse(ibp);
1627		/*
1628		 * Allocate the block into which to do the copy. Note that this
1629		 * allocation will never require any additional allocations for
1630		 * the snapshot inode.
1631		 */
1632		td->td_proc->p_flag |= P_COWINPROGRESS;
1633		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
1634		    fs->fs_bsize, KERNCRED, 0, &cbp);
1635		td->td_proc->p_flag &= ~P_COWINPROGRESS;
1636		if (error)
1637			break;
1638#ifdef DEBUG
1639		if (snapdebug)
1640			printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n",
1641			    "Copyonremove: snapino ", ip->i_number,
1642			    (intmax_t)lbn, "for inum", inum, size,
1643			    (intmax_t)cbp->b_blkno);
1644#endif
1645		/*
1646		 * If we have already read the old block contents, then
1647		 * simply copy them to the new block. Note that we need
1648		 * to synchronously write snapshots that have not been
1649		 * unlinked, and hence will be visible after a crash,
1650		 * to ensure their integrity.
1651		 */
1652		if (savedcbp != 0) {
1653			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
1654			bawrite(cbp);
1655			if (dopersistence && ip->i_effnlink > 0)
1656				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
1657			continue;
1658		}
1659		/*
1660		 * Otherwise, read the old block contents into the buffer.
1661		 */
1662		if ((error = readblock(cbp, lbn)) != 0) {
1663			bzero(cbp->b_data, fs->fs_bsize);
1664			bawrite(cbp);
1665			if (dopersistence && ip->i_effnlink > 0)
1666				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
1667			break;
1668		}
1669		savedcbp = cbp;
1670	}
1671	/*
1672	 * Note that we need to synchronously write snapshots that
1673	 * have not been unlinked, and hence will be visible after
1674	 * a crash, to ensure their integrity.
1675	 */
1676	if (savedcbp) {
1677		vp = savedcbp->b_vp;
1678		bawrite(savedcbp);
1679		if (dopersistence && VTOI(vp)->i_effnlink > 0)
1680			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
1681	}
1682	/*
1683	 * If we have been unable to allocate a block in which to do
1684	 * the copy, then return non-zero so that the fragment will
1685	 * not be freed. Although space will be lost, the snapshot
1686	 * will stay consistent.
1687	 */
1688	if (snapshot_locked)
1689		VOP_UNLOCK(vp, 0, td);
1690	else
1691		VI_UNLOCK(devvp);
1692	return (error);
1693}
1694
1695/*
1696 * Associate snapshot files when mounting.
1697 */
1698void
1699ffs_snapshot_mount(mp)
1700	struct mount *mp;
1701{
1702	struct ufsmount *ump = VFSTOUFS(mp);
1703	struct vnode *devvp = ump->um_devvp;
1704	struct fs *fs = ump->um_fs;
1705	struct thread *td = curthread;
1706	struct snaphead *snaphead;
1707	struct vnode *vp;
1708	struct inode *ip, *xp;
1709	struct uio auio;
1710	struct iovec aiov;
1711	void *snapblklist;
1712	char *reason;
1713	daddr_t snaplistsize;
1714	int error, snaploc, loc;
1715
1716	/*
1717	 * XXX The following needs to be set before UFS_TRUNCATE or
1718	 * VOP_READ can be called.
1719	 */
1720	mp->mnt_stat.f_iosize = fs->fs_bsize;
1721	/*
1722	 * Process each snapshot listed in the superblock.
1723	 */
1724	vp = NULL;
1725	snaphead = &devvp->v_rdev->si_snapshots;
1726	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
1727		if (fs->fs_snapinum[snaploc] == 0)
1728			break;
1729		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
1730		    LK_EXCLUSIVE, &vp)) != 0){
1731			printf("ffs_snapshot_mount: vget failed %d\n", error);
1732			continue;
1733		}
1734		ip = VTOI(vp);
1735		if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size ==
1736		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
1737			if ((ip->i_flags & SF_SNAPSHOT) == 0) {
1738				reason = "non-snapshot";
1739			} else {
1740				reason = "old format snapshot";
1741				(void)UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
1742				(void)VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
1743			}
1744			printf("ffs_snapshot_mount: %s inode %d\n",
1745			    reason, fs->fs_snapinum[snaploc]);
1746			vput(vp);
1747			vp = NULL;
1748			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
1749				if (fs->fs_snapinum[loc] == 0)
1750					break;
1751				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
1752			}
1753			fs->fs_snapinum[loc - 1] = 0;
1754			snaploc--;
1755			continue;
1756		}
1757		/*
1758		 * If there already exist snapshots on this filesystem, grab a
1759		 * reference to their shared lock. If this is the first snapshot
1760		 * on this filesystem, we need to allocate a lock for the
1761		 * snapshots to share. In either case, acquire the snapshot
1762		 * lock and give up our original private lock.
1763		 */
1764		VI_LOCK(devvp);
1765		if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
1766			VI_LOCK(vp);
1767			vp->v_vnlock = ITOV(xp)->v_vnlock;
1768			VI_UNLOCK(devvp);
1769		} else {
1770			struct lock *lkp;
1771
1772			VI_UNLOCK(devvp);
1773			MALLOC(lkp, struct lock *, sizeof(struct lock),
1774			    M_UFSMNT, M_WAITOK);
1775			lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
1776			    LK_CANRECURSE | LK_NOPAUSE);
1777			VI_LOCK(vp);
1778			vp->v_vnlock = lkp;
1779		}
1780		vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
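		/*
		 * Any threads still queued on the vnode's private lock
		 * are migrated to the shared snapshot lock before the
		 * private lock, which is no longer the vnode's
		 * v_vnlock, is released.
		 */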
1781		transferlockers(&vp->v_lock, vp->v_vnlock);
1782		lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
1783		/*
1784		 * Link it onto the active snapshot list.
1785		 */
1786		VI_LOCK(devvp);
1787		if (ip->i_nextsnap.tqe_prev != 0)
1788			panic("ffs_snapshot_mount: %d already on list",
1789			    ip->i_number);
1790		else
1791			TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
1792		vp->v_vflag |= VV_SYSTEM;
1793		VI_UNLOCK(devvp);
1794		VOP_UNLOCK(vp, 0, td);
1795	}
1796	/*
1797	 * No usable snapshots found.
1798	 */
1799	if (vp == NULL)
1800		return;
1801	/*
1802	 * Allocate the space for the block hints list. We always want to
1803	 * use the list from the newest snapshot.
1804	 */
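	/*
	 * The hint list lives in the snapshot file just past the last
	 * block that maps the filesystem, and its first element is the
	 * list length itself, roughly:
	 *
	 *	[ snaplistsize ][ blkno ][ blkno ] ... (sorted ascending)
	 *
	 * The length word is read first; the second read below then
	 * backs up over it and pulls in the entire list, length word
	 * included, which is why the binary search in ffs_copyonwrite()
	 * starts at index 1.
	 */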
1805	auio.uio_iov = &aiov;
1806	auio.uio_iovcnt = 1;
1807	aiov.iov_base = (void *)&snaplistsize;
1808	aiov.iov_len = sizeof(snaplistsize);
1809	auio.uio_resid = aiov.iov_len;
1810	auio.uio_offset =
1811	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
1812	auio.uio_segflg = UIO_SYSSPACE;
1813	auio.uio_rw = UIO_READ;
1814	auio.uio_td = td;
1815	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1816	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
1817		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
1818		VOP_UNLOCK(vp, 0, td);
1819		return;
1820	}
1821	MALLOC(snapblklist, void *, snaplistsize * sizeof(daddr_t),
1822	    M_UFSMNT, M_WAITOK);
1823	auio.uio_iovcnt = 1;
1824	aiov.iov_base = snapblklist;
1825	aiov.iov_len = snaplistsize * sizeof(daddr_t);
1826	auio.uio_resid = aiov.iov_len;
1827	auio.uio_offset -= sizeof(snaplistsize);
1828	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
1829		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
1830		VOP_UNLOCK(vp, 0, td);
1831		FREE(snapblklist, M_UFSMNT);
1832		return;
1833	}
1834	VOP_UNLOCK(vp, 0, td);
1835	VI_LOCK(devvp);
1836	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
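	/*
	 * Publish the hint list and hook the copy-on-write handler
	 * into the device: with VV_COPYONWRITE set, writes to devvp
	 * are passed to si_copyonwrite (ffs_copyonwrite) before they
	 * reach the disk.
	 */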
1837	devvp->v_rdev->si_snaplistsize = snaplistsize;
1838	devvp->v_rdev->si_snapblklist = (daddr_t *)snapblklist;
1839	devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
1840	devvp->v_vflag |= VV_COPYONWRITE;
1841	VI_UNLOCK(devvp);
1842}
1843
1844/*
1845 * Disassociate snapshot files when unmounting.
1846 */
1847void
1848ffs_snapshot_unmount(mp)
1849	struct mount *mp;
1850{
1851	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
1852	struct snaphead *snaphead = &devvp->v_rdev->si_snapshots;
1853	struct lock *lkp = NULL;
1854	struct inode *xp;
1855	struct vnode *vp;
1856
1857	VI_LOCK(devvp);
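	/*
	 * Give each snapshot back its private lock and unhook it from
	 * the active list. A snapshot still linked into the filesystem
	 * (i_effnlink > 0) holds the vnode reference taken when it was
	 * found at mount time, so that reference is dropped here;
	 * unlinked snapshots are presumed to be released by the
	 * inactive/reclaim path instead.
	 */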
1858	while ((xp = TAILQ_FIRST(snaphead)) != 0) {
1859		vp = ITOV(xp);
1860		lkp = vp->v_vnlock;
1861		vp->v_vnlock = &vp->v_lock;
1862		TAILQ_REMOVE(snaphead, xp, i_nextsnap);
1863		xp->i_nextsnap.tqe_prev = 0;
1864		if (xp->i_effnlink > 0) {
1865			VI_UNLOCK(devvp);
1866			vrele(vp);
1867			VI_LOCK(devvp);
1868		}
1869	}
1870	if (devvp->v_rdev->si_snapblklist != NULL) {
1871		FREE(devvp->v_rdev->si_snapblklist, M_UFSMNT);
1872		devvp->v_rdev->si_snapblklist = NULL;
1873		devvp->v_rdev->si_snaplistsize = 0;
1874	}
1875	if (lkp != NULL) {
1876		lockdestroy(lkp);
1877		FREE(lkp, M_UFSMNT);
1878	}
1879	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
1880	devvp->v_rdev->si_copyonwrite = 0;
1881	devvp->v_vflag &= ~VV_COPYONWRITE;
1882	VI_UNLOCK(devvp);
1883}
1884
1885/*
1886 * Check whether a block that is about to be written needs to be
1887 * copied to the snapshots, and copy it if so.
1888 */
1889static int
1890ffs_copyonwrite(devvp, bp)
1891	struct vnode *devvp;
1892	struct buf *bp;
1893{
1894	struct snaphead *snaphead;
1895	struct buf *ibp, *cbp, *savedcbp = 0;
1896	struct thread *td = curthread;
1897	struct fs *fs;
1898	struct inode *ip;
1899	struct vnode *vp = 0;
1900	ufs2_daddr_t lbn, blkno, *snapblklist;
1901	int lower, upper, mid, indiroff, snapshot_locked = 0, error = 0;
1902
1903	if (td->td_proc->p_flag & P_COWINPROGRESS)
1904		panic("ffs_copyonwrite: recursive call");
1905	/*
1906	 * First check whether the block is in the preallocated list;
1907	 * checking there first avoids several potential deadlocks.
1908	 */
1909	VI_LOCK(devvp);
1910	snaphead = &devvp->v_rdev->si_snapshots;
1911	ip = TAILQ_FIRST(snaphead);
1912	fs = ip->i_fs;
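	/*
	 * Map the physical block number being written back to the
	 * logical filesystem block it backs: device block to fragment,
	 * then fragment to logical block number.
	 */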
1913	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
1914	snapblklist = devvp->v_rdev->si_snapblklist;
1915	upper = devvp->v_rdev->si_snaplistsize - 1;
1916	lower = 1;
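	/*
	 * Binary search the sorted hint list; entry 0 holds the list
	 * length (see ffs_snapshot_mount() above), so only indices
	 * 1 through snaplistsize - 1 are searched.
	 */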
1917	while (lower <= upper) {
1918		mid = (lower + upper) / 2;
1919		if (snapblklist[mid] == lbn)
1920			break;
1921		if (snapblklist[mid] < lbn)
1922			lower = mid + 1;
1923		else
1924			upper = mid - 1;
1925	}
1926	if (lower <= upper) {
1927		VI_UNLOCK(devvp);
1928		return (0);
1929	}
1930	/*
1931	 * Not in the preallocated list, so check each snapshot in turn.
1932	 */
1933retry:
1934	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
1935		vp = ITOV(ip);
1936		/*
1937		 * Everything belonging to the snapshot itself that needs
1938		 * to be copied is copied when ffs_snapshot() is called,
1939		 * so writes to a snapshot file can be skipped here; the
1940		 * check could deadlock in the lookup done by UFS_BALLOC().
1941		 */
1942		if (bp->b_vp == vp)
1943			continue;
1944		/*
1945		 * Check to see if block needs to be copied. We do not have
1946		 * to hold the snapshot lock while doing this lookup as it
1947		 * will never require any additional allocations for the
1948		 * snapshot inode.
1949		 */
1950		if (lbn < NDADDR) {
1951			blkno = DIP(ip, i_db[lbn]);
1952		} else {
1953			if (snapshot_locked == 0 &&
1954			    lockmgr(vp->v_vnlock,
1955			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
1956			      VI_MTX(devvp), td) != 0) {
1957				VI_LOCK(devvp);
1958				goto retry;
1959			}
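			/*
			 * LK_SLEEPFAIL makes the lockmgr() call above
			 * fail rather than return with the lock held
			 * if it had to sleep, so any sleep forces a
			 * rescan of the snapshot list from the top.
			 */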
1960			snapshot_locked = 1;
1961			td->td_proc->p_flag |= P_COWINPROGRESS;
1962			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
1963			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
1964			td->td_proc->p_flag &= ~P_COWINPROGRESS;
1965			if (error)
1966				break;
1967			indiroff = (lbn - NDADDR) % NINDIR(fs);
1968			if (ip->i_ump->um_fstype == UFS1)
1969				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
1970			else
1971				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
1972			bqrelse(ibp);
1973		}
1974#ifdef DIAGNOSTIC
1975		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
1976			panic("ffs_copyonwrite: bad copy block");
1977#endif
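		/*
		 * A non-zero entry means that this snapshot already has
		 * its own copy of the block or carries a sentinel (such
		 * as BLK_NOCOPY or BLK_SNAP) marking it as never
		 * needing one; only a zero entry requires a copy.
		 */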
1978		if (blkno != 0)
1979			continue;
1980		/*
1981		 * Allocate the block into which to do the copy. Since
1982		 * multiple processes may all try to copy the same block,
1983		 * we have to recheck our need to do a copy if we sleep
1984		 * waiting for the lock.
1985		 *
1986		 * Because all snapshots on a filesystem share a single
1987		 * lock, we ensure that we will never be in competition
1988		 * with another process to allocate a block.
1989		 */
1990		if (snapshot_locked == 0 &&
1991		    lockmgr(vp->v_vnlock,
1992		      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
1993		      VI_MTX(devvp), td) != 0) {
1994			VI_LOCK(devvp);
1995			goto retry;
1996		}
1997		snapshot_locked = 1;
1998		td->td_proc->p_flag |= P_COWINPROGRESS;
1999		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
2000		    fs->fs_bsize, KERNCRED, 0, &cbp);
2001		td->td_proc->p_flag &= ~P_COWINPROGRESS;
2002		if (error)
2003			break;
2004#ifdef DEBUG
2005		if (snapdebug) {
2006			printf("Copyonwrite: snapino %d lbn %jd for ",
2007			    ip->i_number, (intmax_t)lbn);
2008			if (bp->b_vp == devvp)
2009				printf("fs metadata");
2010			else
2011				printf("inum %d", VTOI(bp->b_vp)->i_number);
2012			printf(" lblkno %jd to blkno %jd\n",
2013			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
2014		}
2015#endif
2016		/*
2017		 * If we have already read the old block contents, then
2018		 * simply copy them to the new block. Note that we need
2019		 * to synchronously write snapshots that have not been
2020		 * unlinked, and hence will be visible after a crash,
2021		 * to ensure their integrity.
2022		 */
2023		if (savedcbp != 0) {
2024			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
2025			bawrite(cbp);
2026			if (dopersistence && ip->i_effnlink > 0)
2027				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
2028			continue;
2029		}
2030		/*
2031		 * Otherwise, read the old block contents into the buffer.
2032		 */
2033		if ((error = readblock(cbp, lbn)) != 0) {
2034			bzero(cbp->b_data, fs->fs_bsize);
2035			bawrite(cbp);
2036			if (dopersistence && ip->i_effnlink > 0)
2037				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
2038			break;
2039		}
2040		savedcbp = cbp;
2041	}
2042	/*
2043	 * Note that we need to synchronously write snapshots that
2044	 * have not been unlinked, and hence will be visible after
2045	 * a crash, to ensure their integrity.
2046	 */
2047	if (savedcbp) {
2048		vp = savedcbp->b_vp;
2049		bawrite(savedcbp);
2050		if (dopersistence && VTOI(vp)->i_effnlink > 0)
2051			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
2052	}
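	/*
	 * If the snapshot lock was taken, the lockmgr() call that took
	 * it also consumed the devvp interlock (LK_INTERLOCK);
	 * otherwise the interlock acquired at entry, or on the last
	 * retry, is still held and must be dropped here.
	 */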
2053	if (snapshot_locked)
2054		VOP_UNLOCK(vp, 0, td);
2055	else
2056		VI_UNLOCK(devvp);
2057	return (error);
2058}
2059
2060/*
2061 * Read the specified block into the given buffer, bypassing the buffer
2062 * cache via physio(). Much of this boiler-plate comes from bwrite().
2063 */
2064static int
2065readblock(bp, lbn)
2066	struct buf *bp;
2067	ufs2_daddr_t lbn;
2068{
2069	struct uio auio;
2070	struct iovec aiov;
2071	struct thread *td = curthread;
2072	struct inode *ip = VTOI(bp->b_vp);
2073
2074	aiov.iov_base = bp->b_data;
2075	aiov.iov_len = bp->b_bcount;
2076	auio.uio_iov = &aiov;
2077	auio.uio_iovcnt = 1;
2078	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
2079	auio.uio_resid = bp->b_bcount;
2080	auio.uio_rw = UIO_READ;
2081	auio.uio_segflg = UIO_SYSSPACE;
2082	auio.uio_td = td;
2083	return (physio(ip->i_devvp->v_rdev, &auio, 0));
2084}
2085