/*	$NetBSD: ffs_snapshot.c,v 1.32 2006/09/29 19:37:11 christos Exp $	*/

/*
 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
 *
 * Further information about snapshots can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
 *
 *	from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.32 2006/09/29 19:37:11 christos Exp $");

#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#endif

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/sched.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/kauth.h>

#include <miscfs/specfs/specdev.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

/* FreeBSD -> NetBSD conversion */
#define KERNCRED	lwp0.l_cred
#define ufs1_daddr_t	int32_t
#define ufs2_daddr_t	int64_t
#define ufs_lbn_t	daddr_t
#define VI_MTX(v)	(&(v)->v_interlock)
#define VI_LOCK(v)	simple_lock(&(v)->v_interlock)
#define VI_UNLOCK(v)	simple_unlock(&(v)->v_interlock)
#define MNT_ILOCK(v)	simple_lock(&mntvnode_slock)
#define MNT_IUNLOCK(v)	simple_unlock(&mntvnode_slock)

#if !defined(FFS_NO_SNAPSHOT)
static int cgaccount(int, struct vnode *, caddr_t, int);
static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int indiracct_ufs1(struct vnode *, struct vnode *, int,
    ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int indiracct_ufs2(struct vnode *, struct vnode *, int,
    ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
#endif /* !defined(FFS_NO_SNAPSHOT) */

static int ffs_copyonwrite(void *, struct buf *);
static int readfsblk(struct vnode *, caddr_t, ufs2_daddr_t);
static int __unused readvnblk(struct vnode *, caddr_t, ufs2_daddr_t);
static int writevnblk(struct vnode *, caddr_t, ufs2_daddr_t);
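/*
 * cow_enter/cow_leave bracket block allocations done on behalf of a
 * snapshot so that they do not recurse back into the copy-on-write
 * handler (ffs_copyonwrite) below.
 */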
static inline int cow_enter(void);
static inline void cow_leave(int);
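/*
 * Accessors for an inode's direct (db) and indirect (idb) block
 * pointers that hide the UFS1/UFS2 and byte-order differences.
 */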
static inline ufs2_daddr_t db_get(struct inode *, int);
static inline void db_assign(struct inode *, int, ufs2_daddr_t);
static inline ufs2_daddr_t idb_get(struct inode *, caddr_t, int);
static inline void idb_assign(struct inode *, caddr_t, int, ufs2_daddr_t);

#ifdef DEBUG
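/* Non-zero enables the verbose snapshot diagnostics below. */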
static int snapdebug = 0;
#endif

/*
 * Create a snapshot file and initialize it for the filesystem.
 * Vnode is locked on entry and return.
 */
int
ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime)
{
#if defined(FFS_NO_SNAPSHOT)
	return EOPNOTSUPP;
}
#else /* defined(FFS_NO_SNAPSHOT) */
	ufs2_daddr_t numblks, blkno, *blkp, snaplistsize = 0, *snapblklist;
	int error, ns, cg, snaploc;
	int i, s, size, len, loc;
	int flag = mp->mnt_flag;
	struct timeval starttime;
#ifdef DEBUG
	struct timeval endtime;
#endif
	struct timespec ts;
	long redo = 0;
	int32_t *lp;
	void *space;
	caddr_t sbbuf = NULL;
	struct ufsmount *ump = VFSTOUFS(mp);
	struct fs *copy_fs = NULL, *fs = ump->um_fs;
	struct lwp *l = curlwp;
	struct inode *ip, *xp;
	struct buf *bp, *ibp, *nbp;
	struct vattr vat;
	struct vnode *xvp, *nvp, *devvp;

	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * Need to serialize access to snapshot code per filesystem.
	 */
	/*
	 * If the vnode already is a snapshot, return.
	 */
	if (VTOI(vp)->i_flags & SF_SNAPSHOT) {
		if (ctime) {
			ctime->tv_sec = DIP(VTOI(vp), mtime);
			ctime->tv_nsec = DIP(VTOI(vp), mtimensec);
		}
		return 0;
	}
	/*
	 * Check mount, exclusive reference and owner.
	 */
	if (vp->v_mount != mp)
		return EXDEV;
	if (vp->v_usecount != 1 || vp->v_writecount != 0)
		return EBUSY;
	if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
	    &l->l_acflag) != 0 &&
	    VTOI(vp)->i_uid != kauth_cred_geteuid(l->l_cred))
		return EACCES;

	if (vp->v_size != 0) {
		error = ffs_truncate(vp, 0, 0, NOCRED, l);
		if (error)
			return error;
	}
	/*
	 * Assign a snapshot slot in the superblock.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	ip = VTOI(vp);
	devvp = ip->i_devvp;
	/*
	 * Write an empty list of preallocated blocks to the end of
	 * the snapshot to set size to at least that of the filesystem.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	blkno = 1;
	blkno = ufs_rw64(blkno, ns);
	error = vn_rdwr(UIO_WRITE, vp,
	    (caddr_t)&blkno, sizeof(blkno), lblktosize(fs, (off_t)numblks),
	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
	if (error)
		goto out;
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks and mark all of them as not
	 * needing to be copied.
	 */
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
		if (error)
			goto out;
		bawrite(ibp);
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
	    0, &nbp);
	if (error)
		goto out;
	bawrite(nbp);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = ffs_balloc(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Copy all the cylinder group maps. Although the
	 * filesystem is still active, we hope that only a few
	 * cylinder groups will change between now and when we
	 * suspend operations. Thus, we will be able to quickly
	 * touch up the few cylinder groups that changed during
	 * the suspension period.
	 */
	len = howmany(fs->fs_ncg, NBBY);
	fs->fs_active = malloc(len, M_DEVBUF, M_WAITOK | M_ZERO);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if ((error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp)) != 0)
			goto out;
		error = cgaccount(cg, vp, nbp->b_data, 1);
		bawrite(nbp);
		if (error)
			goto out;
	}
	/*
	 * Change inode to snapshot type file.
	 */
	ip->i_flags |= SF_SNAPSHOT;
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Ensure that the snapshot is completely on disk.
	 * Since we have marked it as a snapshot it is safe to
	 * unlock it as no process will be allowed to write to it.
	 */
	if ((error = VOP_FSYNC(vp, KERNCRED, FSYNC_WAIT, 0, 0, l)) != 0)
		goto out;
	VOP_UNLOCK(vp, 0);
	/*
	 * All allocations are done, so we can now snapshot the system.
	 *
	 * Suspend operation on filesystem.
	 */
	if ((error = vfs_write_suspend(vp->v_mount, PUSER|PCATCH, 0)) != 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		goto out;
	}
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	getmicrotime(&starttime);
	/*
	 * First, copy all the cylinder group maps that have changed.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if (ACTIVECG_ISSET(fs, cg))
			continue;
		redo++;
		if ((error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp)) != 0)
			goto out1;
		error = cgaccount(cg, vp, nbp->b_data, 2);
		bawrite(nbp);
		if (error)
			goto out1;
	}
	/*
	 * Grab a copy of the superblock and its summary information.
	 * We delay writing it until the suspension is released below.
	 */
	sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
	loc = blkoff(fs, fs->fs_sblockloc);
	if (loc > 0)
		bzero(&sbbuf[0], loc);
	copy_fs = (struct fs *)(sbbuf + loc);
	bcopy(fs, copy_fs, fs->fs_sbsize);
	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
	if (fs->fs_sbsize < size)
		bzero(&sbbuf[loc + fs->fs_sbsize], size - fs->fs_sbsize);
	size = blkroundup(fs, fs->fs_cssize);
	if (fs->fs_contigsumsize > 0)
		size += fs->fs_ncg * sizeof(int32_t);
	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
	copy_fs->fs_csp = space;
	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
	space = (char *)space + fs->fs_cssize;
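	/*
	 * The summary information need not end on a block boundary;
	 * pad the in-core copy out to a full block with the remaining
	 * fragments read directly from the device.
	 */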
	loc = howmany(fs->fs_cssize, fs->fs_fsize);
	i = fs->fs_frag - loc % fs->fs_frag;
	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
	if (len > 0) {
		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
		    len, KERNCRED, &bp)) != 0) {
			brelse(bp);
			free(copy_fs->fs_csp, M_UFSMNT);
			goto out1;
		}
		bcopy(bp->b_data, space, (u_int)len);
		space = (char *)space + len;
		bp->b_flags |= B_INVAL | B_NOCACHE;
		brelse(bp);
	}
	if (fs->fs_contigsumsize > 0) {
		copy_fs->fs_maxcluster = lp = space;
		for (i = 0; i < fs->fs_ncg; i++)
			*lp++ = fs->fs_contigsumsize;
	}
	/*
	 * We must check for active files that have been unlinked
	 * (e.g., with a zero link count). We have to expunge all
	 * trace of these files from the snapshot so that they are
	 * not reclaimed prematurely by fsck or unnecessarily dumped.
	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
	 * spec_strategy about writing on a suspended filesystem.
	 * Note that we skip unlinked snapshot files as they will
	 * be handled separately below.
	 *
	 * We also calculate the needed size for the snapshot list.
	 */
	snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
	MNT_ILOCK(mp);
loop:
	for (xvp = LIST_FIRST(&mp->mnt_vnodelist); xvp; xvp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (xvp->v_mount != mp)
			goto loop;
		nvp = LIST_NEXT(xvp, v_mntvnodes);
		VI_LOCK(xvp);
		MNT_IUNLOCK(mp);
		if ((xvp->v_flag & VXLOCK) ||
		    xvp->v_usecount == 0 || xvp->v_type == VNON ||
		    (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
			VI_UNLOCK(xvp);
			MNT_ILOCK(mp);
			continue;
		}
		if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) {
			MNT_ILOCK(mp);
			goto loop;
		}
#ifdef DEBUG
		if (snapdebug)
			vprint("ffs_snapshot: busy vnode", xvp);
#endif
		if (VOP_GETATTR(xvp, &vat, l->l_cred, l) == 0 &&
		    vat.va_nlink > 0) {
			VOP_UNLOCK(xvp, 0);
			MNT_ILOCK(mp);
			continue;
		}
		xp = VTOI(xvp);
		if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
			VOP_UNLOCK(xvp, 0);
			MNT_ILOCK(mp);
			continue;
		}
		/*
		 * If there is a fragment, clear it here.
		 */
		blkno = 0;
		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
		if (loc < NDADDR) {
			len = fragroundup(fs, blkoff(fs, xp->i_size));
			if (len > 0 && len < fs->fs_bsize) {
				ffs_blkfree(copy_fs, vp, db_get(xp, loc),
				    len, xp->i_number);
				blkno = db_get(xp, loc);
				db_assign(xp, loc, 0);
			}
		}
		snaplistsize += 1;
		if (xp->i_ump->um_fstype == UFS1)
			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
			    BLK_NOCOPY);
		else
			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
			    BLK_NOCOPY);
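		/* Restore the fragment address saved before the expunge. */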
		if (blkno)
			db_assign(xp, loc, blkno);
		if (!error)
			error = ffs_freefile(copy_fs, vp, xp->i_number,
			    xp->i_mode);
		VOP_UNLOCK(xvp, 0);
		if (error) {
			free(copy_fs->fs_csp, M_UFSMNT);
			goto out1;
		}
		MNT_ILOCK(mp);
	}
	MNT_IUNLOCK(mp);
	/*
	 * If there already exist snapshots on this filesystem, grab a
	 * reference to their shared lock. If this is the first snapshot
	 * on this filesystem, we need to allocate a lock for the snapshots
	 * to share. In either case, acquire the snapshot lock and give
	 * up our original private lock.
	 */
	VI_LOCK(devvp);
	if ((xp = TAILQ_FIRST(&ump->um_snapshots)) != NULL) {
		struct lock *lkp;

		lkp = ITOV(xp)->v_vnlock;
		VI_UNLOCK(devvp);
		VI_LOCK(vp);
		vp->v_vnlock = lkp;
	} else {
		struct lock *lkp;

		VI_UNLOCK(devvp);
		MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT,
		    M_WAITOK);
		lockinit(lkp, PVFS, "snaplk", 0, LK_CANRECURSE);
		VI_LOCK(vp);
		vp->v_vnlock = lkp;
	}
	vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY);
	transferlockers(&vp->v_lock, vp->v_vnlock);
	lockmgr(&vp->v_lock, LK_RELEASE, NULL);
	/*
	 * If this is the first snapshot on this filesystem, then we need
	 * to allocate the space for the list of preallocated snapshot blocks.
	 * This list will be refined below, but this preliminary one will
	 * keep us out of deadlock until the full one is ready.
	 */
	if (xp == NULL) {
		snapblklist = malloc(
		    snaplistsize * sizeof(ufs2_daddr_t), M_UFSMNT, M_WAITOK);
		blkp = &snapblklist[1];
		*blkp++ = lblkno(fs, fs->fs_sblockloc);
		blkno = fragstoblks(fs, fs->fs_csaddr);
		for (cg = 0; cg < fs->fs_ncg; cg++) {
			if (fragstoblks(fs, cgtod(fs, cg)) > blkno)
				break;
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
		}
		len = howmany(fs->fs_cssize, fs->fs_bsize);
		for (loc = 0; loc < len; loc++)
			*blkp++ = blkno + loc;
		for (; cg < fs->fs_ncg; cg++)
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
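		/* Entry 0 records the length of the list, itself included. */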
		snapblklist[0] = blkp - snapblklist;
		VI_LOCK(devvp);
		if (ump->um_snapblklist != NULL)
			panic("ffs_snapshot: non-empty list");
		ump->um_snapblklist = snapblklist;
		VI_UNLOCK(devvp);
	}
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	VI_LOCK(devvp);
	fs->fs_snapinum[snaploc] = ip->i_number;
	if (ip->i_nextsnap.tqe_prev != 0)
		panic("ffs_snapshot: %llu already on list",
		    (unsigned long long)ip->i_number);
	TAILQ_INSERT_TAIL(&ump->um_snapshots, ip, i_nextsnap);
	VI_UNLOCK(devvp);
	if (xp == NULL)
		vn_cow_establish(devvp, ffs_copyonwrite, devvp);
	vp->v_flag |= VSYSTEM;
out1:
	/*
	 * Resume operation on filesystem.
	 */
	vfs_write_resume(vp->v_mount);
	/*
	 * Set the mtime to the time the snapshot has been taken.
	 */
	TIMEVAL_TO_TIMESPEC(&starttime, &ts);
	if (ctime)
		*ctime = ts;
	DIP_ASSIGN(ip, mtime, ts.tv_sec);
	DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;

#ifdef DEBUG
	if (starttime.tv_sec > 0) {
		getmicrotime(&endtime);
		timersub(&endtime, &starttime, &endtime);
		printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
		    vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec,
		    endtime.tv_usec / 1000, redo, fs->fs_ncg);
	}
#endif
	if (error)
		goto out;
	/*
	 * Copy allocation information from all the snapshots in
	 * this snapshot and then expunge them from its view.
	 */
	TAILQ_FOREACH(xp, &ump->um_snapshots, i_nextsnap) {
		if (xp == ip)
			break;
		if (xp->i_ump->um_fstype == UFS1)
			error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
			    BLK_SNAP);
		else
			error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
			    BLK_SNAP);
		if (error) {
			fs->fs_snapinum[snaploc] = 0;
			goto done;
		}
	}
	/*
	 * Allocate space for the full list of preallocated snapshot blocks.
	 */
	snapblklist = malloc(snaplistsize * sizeof(ufs2_daddr_t),
	    M_UFSMNT, M_WAITOK);
	ip->i_snapblklist = &snapblklist[1];
	/*
	 * Expunge the blocks used by the snapshots from the set of
	 * blocks marked as used in the snapshot bitmaps. Also, collect
	 * the list of allocated blocks in i_snapblklist.
	 */
	if (ip->i_ump->um_fstype == UFS1)
		error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP);
	else
		error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP);
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
		FREE(snapblklist, M_UFSMNT);
		goto done;
	}
	if (snaplistsize < ip->i_snapblklist - snapblklist)
		panic("ffs_snapshot: list too small");
	snaplistsize = ip->i_snapblklist - snapblklist;
	snapblklist[0] = snaplistsize;
	ip->i_snapblklist = &snapblklist[0];
	/*
	 * Write out the list of allocated blocks to the end of the snapshot.
	 */
	for (i = 0; i < snaplistsize; i++)
		snapblklist[i] = ufs_rw64(snapblklist[i], ns);
	error = vn_rdwr(UIO_WRITE, vp, (caddr_t)snapblklist,
	    snaplistsize*sizeof(ufs2_daddr_t), lblktosize(fs, (off_t)numblks),
	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
	for (i = 0; i < snaplistsize; i++)
		snapblklist[i] = ufs_rw64(snapblklist[i], ns);
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
		FREE(snapblklist, M_UFSMNT);
		goto done;
	}
	/*
	 * Write the superblock and its summary information
	 * to the snapshot.
	 */
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	space = copy_fs->fs_csp;
#ifdef FFS_EI
	if (ns) {
		ffs_sb_swap(copy_fs, copy_fs);
		ffs_csum_swap(space, space, fs->fs_cssize);
	}
#endif
	for (loc = 0; loc < len; loc++) {
		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
		if (error) {
			brelse(nbp);
			fs->fs_snapinum[snaploc] = 0;
			FREE(snapblklist, M_UFSMNT);
			goto done;
		}
		bcopy(space, nbp->b_data, fs->fs_bsize);
		space = (char *)space + fs->fs_bsize;
		bawrite(nbp);
	}
	/*
	 * As this is the newest list, it is the most inclusive, so
	 * should replace the previous list. If this is the first snapshot
	 * free the preliminary list.
	 */
	VI_LOCK(devvp);
	space = ump->um_snapblklist;
	ump->um_snapblklist = snapblklist;
	VI_UNLOCK(devvp);
	if (TAILQ_FIRST(&ump->um_snapshots) == ip)
		FREE(space, M_UFSMNT);
done:
	free(copy_fs->fs_csp, M_UFSMNT);
	if (!error) {
		error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize,
		    KERNCRED, &nbp);
		if (error) {
			brelse(nbp);
			fs->fs_snapinum[snaploc] = 0;
		} else {
			bcopy(sbbuf, nbp->b_data, fs->fs_bsize);
			bawrite(nbp);
		}
	}
out:
	/*
	 * Invalidate and free all pages on the snapshot vnode.
	 * All metadata has been written through the buffer cache.
	 * Clean all dirty buffers now to avoid UBC inconsistencies.
	 */
	if (!error) {
		simple_lock(&vp->v_interlock);
		error = VOP_PUTPAGES(vp, 0, 0,
		    PGO_ALLPAGES|PGO_CLEANIT|PGO_SYNCIO|PGO_FREE);
	}
	if (!error) {
		s = splbio();
		for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = LIST_NEXT(bp, b_vnbufs);
			simple_lock(&bp->b_interlock);
			splx(s);
			if ((bp->b_flags & (B_DELWRI|B_BUSY)) != B_DELWRI)
				panic("ffs_snapshot: not dirty or busy, bp %p",
				    bp);
			bp->b_flags |= B_BUSY|B_VFLUSH;
			if (LIST_FIRST(&bp->b_dep) == NULL)
				bp->b_flags |= B_NOCACHE;
			simple_unlock(&bp->b_interlock);
			bwrite(bp);
			s = splbio();
		}
		simple_lock(&global_v_numoutput_slock);
		while (vp->v_numoutput) {
			vp->v_flag |= VBWAIT;
			ltsleep((caddr_t)&vp->v_numoutput, PRIBIO+1,
			    "snapflushbuf", 0, &global_v_numoutput_slock);
		}
		simple_unlock(&global_v_numoutput_slock);
		splx(s);
	}
	if (sbbuf)
		free(sbbuf, M_UFSMNT);
	if (fs->fs_active != 0) {
		FREE(fs->fs_active, M_DEVBUF);
		fs->fs_active = 0;
	}
	mp->mnt_flag = flag;
	if (error)
		(void) ffs_truncate(vp, (off_t)0, 0, NOCRED, l);
	else
		vref(vp);
	return (error);
}

/*
 * Copy a cylinder group map. All the unallocated blocks are marked
 * BLK_NOCOPY so that the snapshot knows that it need not copy them
 * if they are later written. If passno is one, then this is a first
 * pass, so only setting needs to be done. If passno is 2, then this
 * is a revision to a previous pass which must be undone as the
 * replacement pass is done.
 */
static int
cgaccount(int cg, struct vnode *vp, caddr_t data, int passno)
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	ufs2_daddr_t base, numblks;
	int error, len, loc, ns, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	ns = UFS_FSNEEDSWAP(fs);
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
		(int)fs->fs_cgsize, KERNCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, ns)) {
		brelse(bp);
		return (EIO);
	}
	ACTIVECG_SET(fs, cg);

	bcopy(bp->b_data, data, fs->fs_cgsize);
	brelse(bp);
	if (fs->fs_cgsize < fs->fs_bsize)
		bzero(&data[fs->fs_cgsize],
		    fs->fs_bsize - fs->fs_cgsize);
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cg * fs->fs_fpg / fs->fs_frag;
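	/*
	 * The last cylinder group may extend past the end of the
	 * filesystem, so clamp len to the blocks actually present.
	 */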
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	if (base < NDADDR) {
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
				db_assign(ip, loc, BLK_NOCOPY);
			else if (db_get(ip, loc) == BLK_NOCOPY) {
				if (passno == 2)
					db_assign(ip, loc, 0);
				else if (passno == 1)
					panic("ffs_snapshot: lost direct block");
			}
		}
	}
	if ((error = ffs_balloc(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0)
		return (error);
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			bawrite(ibp);
			if ((error = ffs_balloc(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0)
				return (error);
			indiroff = 0;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
			idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY);
		else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) {
			if (passno == 2)
				idb_assign(ip, ibp->b_data, indiroff, 0);
			else if (passno == 1)
				panic("ffs_snapshot: lost indirect block");
		}
	}
	bdwrite(ibp);
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs1(struct vnode *snapvp, struct inode *cancelip, struct fs *fs,
    int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
		    struct fs *, ufs_lbn_t, int),
    int expungetype)
{
	int i, s, error, ns, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs1_dinode *dip;
	struct buf *bp;
	caddr_t bf;

	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = db_get(VTOI(snapvp), lbn);
	} else {
		s = cow_enter();
		error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn),
		   fs->fs_bsize, KERNCRED, B_METAONLY, &bp);
		cow_leave(s);
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = idb_get(VTOI(snapvp), bp->b_data, indiroff);
		brelse(bp);
	}
	bf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
	if (blkno != 0)
		error = readvnblk(snapvp, bf, lbn);
	else
		error = readfsblk(snapvp, bf, lbn);
	if (error) {
		free(bf, M_UFSMNT);
		return error;
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs1_dinode *)bf + ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags =
	    ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns);
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
	error = writevnblk(snapvp, bf, lbn);
	free(bf, M_UFSMNT);
	if (error)
		return error;
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_db[0],
	    &cancelip->i_ffs1_db[NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_ib[0],
	    &cancelip->i_ffs1_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
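	/*
	 * Walk the single, double and triple indirect trees in turn,
	 * accounting for the blocks each level maps.
	 */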
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
		    ufs_rw32(cancelip->i_ffs1_ib[i], ns), lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct_ufs1(struct vnode *snapvp, struct vnode *cancelvp, int level,
    ufs1_daddr_t blkno, ufs_lbn_t lbn, ufs_lbn_t rlbn, ufs_lbn_t remblks,
    ufs_lbn_t blksperindir, struct fs *fs,
    int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
		    struct fs *, ufs_lbn_t, int),
    int expungetype)
{
	int error, ns, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs1_daddr_t last, *bap;
	struct buf *bp;

	ns = UFS_FSNEEDSWAP(fs);

	if (blkno == 0) {
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs1: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs1: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readfsblk(bp->b_vp, bp->b_data, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	brelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs1(snapvp, cancelvp, level,
		    ufs_rw32(bap[i], ns), lbn, rlbn, remblks, subblksperindir,
		    fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp,
    struct fs *fs, ufs_lbn_t lblkno,
    int exptype /* BLK_SNAP or BLK_NOCOPY */)
{
	int error;

	if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp,
    struct fs *fs, ufs_lbn_t lblkno,
    int expungetype /* BLK_SNAP or BLK_NOCOPY */)
{
	struct inode *ip = VTOI(vp);
	ufs1_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error, ns;

	ns = UFS_FSNEEDSWAP(fs);

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = ufs_rw32(*oldblkp, ns);
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_ffs1_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs1_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		blkno = ufs_rw32(*blkp, ns);
		if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (blkno != 0)
				panic("snapacct_ufs1: bad block");
			*blkp = ufs_rw32(expungetype, ns);
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp,
    struct fs *fs, ufs_lbn_t lblkno, int expungetype)
{
	ufs1_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit, ns;

	ns = UFS_FSNEEDSWAP(fs);
	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = ufs_rw32(*oldblkp, ns);
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs2(struct vnode *snapvp, struct inode *cancelip, struct fs *fs,
    int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
		    struct fs *, ufs_lbn_t, int),
    int expungetype)
{
	int i, s, error, ns, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs2_dinode *dip;
	struct buf *bp;
	caddr_t bf;

	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = db_get(VTOI(snapvp), lbn);
	} else {
		s = cow_enter();
		error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn),
		   fs->fs_bsize, KERNCRED, B_METAONLY, &bp);
		cow_leave(s);
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = idb_get(VTOI(snapvp), bp->b_data, indiroff);
		brelse(bp);
	}
	bf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
	if (blkno != 0)
		error = readvnblk(snapvp, bf, lbn);
	else
		error = readfsblk(snapvp, bf, lbn);
	if (error) {
		free(bf, M_UFSMNT);
		return error;
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs2_dinode *)bf + ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags =
	    ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns);
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
	error = writevnblk(snapvp, bf, lbn);
	free(bf, M_UFSMNT);
	if (error)
		return error;
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_db[0],
	    &cancelip->i_ffs2_db[NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_ib[0],
	    &cancelip->i_ffs2_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
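	/*
	 * Walk the single, double and triple indirect trees in turn,
	 * accounting for the blocks each level maps.
	 */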
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
		    ufs_rw64(cancelip->i_ffs2_ib[i], ns), lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct_ufs2(struct vnode *snapvp, struct vnode *cancelvp, int level,
    ufs2_daddr_t blkno, ufs_lbn_t lbn, ufs_lbn_t rlbn, ufs_lbn_t remblks,
    ufs_lbn_t blksperindir, struct fs *fs,
    int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
		    struct fs *, ufs_lbn_t, int),
    int expungetype)
{
	int error, ns, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t last, *bap;
	struct buf *bp;

	ns = UFS_FSNEEDSWAP(fs);

	if (blkno == 0) {
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs2: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs2: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readfsblk(bp->b_vp, bp->b_data, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	brelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs2(snapvp, cancelvp, level,
		    ufs_rw64(bap[i], ns), lbn, rlbn, remblks, subblksperindir,
		    fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp,
    struct fs *fs, ufs_lbn_t lblkno,
    int exptype /* BLK_SNAP or BLK_NOCOPY */)
{
	int error;

	if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp,
    struct fs *fs, ufs_lbn_t lblkno,
    int expungetype /* BLK_SNAP or BLK_NOCOPY */)
{
	struct inode *ip = VTOI(vp);
	ufs2_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error, ns;

	ns = UFS_FSNEEDSWAP(fs);

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = ufs_rw64(*oldblkp, ns);
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_ffs2_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs2_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		blkno = ufs_rw64(*blkp, ns);
		if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (blkno != 0)
				panic("snapacct_ufs2: bad block");
			*blkp = ufs_rw64(expungetype, ns);
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp,
    struct fs *fs, ufs_lbn_t lblkno, int expungetype)
{
	ufs2_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit, ns;

	ns = UFS_FSNEEDSWAP(fs);
	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = ufs_rw64(*oldblkp, ns);
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}
#endif /* defined(FFS_NO_SNAPSHOT) */

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(struct inode *ip)
{
	struct ufsmount *ump = VFSTOUFS(ip->i_devvp->v_specmountpoint);
	struct inode *xp;
	struct fs *fs;
	int snaploc;

	/*
	 * Find snapshot in incore list.
	 */
	TAILQ_FOREACH(xp, &ump->um_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	if (xp != NULL)
		vrele(ITOV(ip));
#ifdef DEBUG
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %llu\n",
		    (unsigned long long)ip->i_number);
#endif
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(struct vnode *vp)
{
	struct inode *ip = VTOI(vp), *xp;
	struct vnode *devvp = ip->i_devvp;
	struct fs *fs = ip->i_fs;
	struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint);
	struct lock *lkp;
	struct buf *ibp;
	ufs2_daddr_t numblks, blkno, dblk;
	int error, ns, loc, last;

	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		VI_LOCK(devvp);
		lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE,
		    VI_MTX(devvp));
		VI_LOCK(devvp);
		TAILQ_REMOVE(&ump->um_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		lockmgr(lkp, LK_RELEASE, NULL);
		if (TAILQ_FIRST(&ump->um_snapshots) != 0) {
			/* Roll back the list of preallocated blocks. */
			xp = TAILQ_LAST(&ump->um_snapshots, inodelst);
			ump->um_snapblklist = xp->i_snapblklist;
			VI_UNLOCK(devvp);
		} else {
			ump->um_snapblklist = 0;
			lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp));
			lockmgr(lkp, LK_RELEASE, NULL);
			vn_cow_disestablish(devvp, ffs_copyonwrite, devvp);
			FREE(lkp, M_UFSMNT);
		}
		FREE(ip->i_snapblklist, M_UFSMNT);
		ip->i_snapblklist = NULL;
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = db_get(ip, blkno);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			db_assign(ip, blkno, 0);
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
			db_assign(ip, blkno, 0);
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			dblk = idb_get(ip, ibp->b_data, loc);
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				idb_assign(ip, ibp->b_data, loc, 0);
			else if (dblk == blkstofrags(fs, blkno) &&
			    ffs_snapblkfree(fs, ip->i_devvp, dblk,
			    fs->fs_bsize, ip->i_number)) {
				DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
				idb_assign(ip, ibp->b_data, loc, 0);
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(struct fs *fs, struct vnode *devvp, ufs2_daddr_t bno,
    long size, ino_t inum)
{
	struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint);
	struct buf *ibp;
	struct inode *ip;
	struct vnode *vp = NULL, *saved_vp = NULL;
	caddr_t saved_data = NULL;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	int s, indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0;

	lbn = fragstoblks(fs, bno);
retry:
	VI_LOCK(devvp);
	TAILQ_FOREACH(ip, &ump->um_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			      VI_MTX(devvp)) != 0)
				goto retry;
			snapshot_locked = 1;
			s = cow_enter();
			error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			cow_leave(s);
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = idb_get(ip, ibp->b_data, indiroff);
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
			      VI_MTX(devvp)) != 0) {
#if 0 /* CID-2949: dead code */
				if (lbn >= NDADDR)
					brelse(ibp);
#endif
				vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL);
				goto retry;
			}
			snapshot_locked = 1;
			if (lbn < NDADDR) {
				db_assign(ip, lbn, BLK_NOCOPY);
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else {
				idb_assign(ip, ibp->b_data, indiroff,
				    BLK_NOCOPY);
				bwrite(ibp);
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= NDADDR)
				brelse(ibp);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
		      VI_MTX(devvp)) != 0) {
			vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL);
			goto retry;
		}
		snapshot_locked = 1;
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %llu lbn %" PRId64
				    " from inum %llu\n",
1570				    "Grabonremove: snapino",
1571				    (unsigned long long)ip->i_number,
1572				    lbn, (unsigned long long)inum);
1573#endif
1574			if (lbn < NDADDR) {
1575				db_assign(ip, lbn, bno);
1576			} else {
1577				idb_assign(ip, ibp->b_data, indiroff, bno);
1578				bwrite(ibp);
1579			}
1580			DIP_ADD(ip, blocks, btodb(size));
1581			ip->i_flag |= IN_CHANGE | IN_UPDATE;
1582			VOP_UNLOCK(vp, 0);
1583			return (1);
1584		}
1585		if (lbn >= NDADDR)
1586			brelse(ibp);
1587#ifdef DEBUG
1588		if (snapdebug)
1589			printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n",
1590			    "Copyonremove: snapino ",
1591			    (unsigned long long)ip->i_number,
1592			    lbn, "for inum", (unsigned long long)inum, size);
1593#endif
1594		/*
1595		 * If we have already read the old block contents, then
1596		 * simply copy them to the new block. Note that we need
1597		 * to synchronously write snapshots that have not been
1598		 * unlinked, and hence will be visible after a crash,
1599		 * to ensure their integrity.
1600		 */
1601		if (saved_data) {
1602			error = writevnblk(vp, saved_data, lbn);
1603			if (error)
1604				break;
1605			continue;
1606		}
1607		/*
1608		 * Otherwise, read the old block contents into the buffer.
1609		 */
1610		saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
1611		saved_vp = vp;
1612		if ((error = readfsblk(vp, saved_data, lbn)) != 0) {
1613			free(saved_data, M_UFSMNT);
1614			saved_data = NULL;
1615			break;
1616		}
1617	}
1618	/*
1619	 * Note that we need to synchronously write snapshots that
1620	 * have not been unlinked, and hence will be visible after
1621	 * a crash, to ensure their integrity.
1622	 */
1623	if (saved_data) {
1624		error = writevnblk(saved_vp, saved_data, lbn);
1625		free(saved_data, M_UFSMNT);
1626	}
1627	/*
1628	 * If we have been unable to allocate a block in which to do
1629	 * the copy, then return non-zero so that the fragment will
1630	 * not be freed. Although space will be lost, the snapshot
1631	 * will stay consistent.
1632	 */
1633	if (snapshot_locked)
1634		VOP_UNLOCK(vp, 0);
1635	else
1636		VI_UNLOCK(devvp);
1637	return (error);
1638}
1639
1640/*
1641 * Associate snapshot files when mounting.
1642 */
1643void
1644ffs_snapshot_mount(struct mount *mp)
1645{
1646	struct ufsmount *ump = VFSTOUFS(mp);
1647	struct vnode *devvp = ump->um_devvp;
1648	struct fs *fs = ump->um_fs;
1649	struct lwp *l = curlwp;
1650	struct vnode *vp;
1651	struct inode *ip, *xp;
1652	ufs2_daddr_t snaplistsize, *snapblklist;
1653	int i, error, ns, snaploc, loc;
1654
1655	ns = UFS_FSNEEDSWAP(fs);
1656	/*
1657	 * XXX The following needs to be set before ffs_truncate or
1658	 * VOP_READ can be called.
1659	 */
1660	mp->mnt_stat.f_iosize = fs->fs_bsize;
1661	/*
1662	 * Process each snapshot listed in the superblock.
1663	 */
1664	vp = NULL;
1665	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
1666		if (fs->fs_snapinum[snaploc] == 0)
1667			break;
1668		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
1669		    &vp)) != 0) {
1670			printf("ffs_snapshot_mount: vget failed %d\n", error);
1671			continue;
1672		}
1673		ip = VTOI(vp);
1674		if ((ip->i_flags & SF_SNAPSHOT) == 0) {
1675			printf("ffs_snapshot_mount: non-snapshot inode %d\n",
1676			    fs->fs_snapinum[snaploc]);
1677			vput(vp);
1678			vp = NULL;
1679			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
1680				if (fs->fs_snapinum[loc] == 0)
1681					break;
1682				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
1683			}
1684			fs->fs_snapinum[loc - 1] = 0;
1685			snaploc--;
1686			continue;
1687		}
1688
1689		/*
1690		 * Read the block hints list. Use an empty list on
1691		 * read errors.
1692		 */
1693		error = vn_rdwr(UIO_READ, vp,
1694		    (caddr_t)&snaplistsize, sizeof(snaplistsize),
1695		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
1696		    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT,
1697		    l->l_cred, NULL, NULL);
1698		if (error) {
1699			printf("ffs_snapshot_mount: read_1 failed %d\n", error);
1700			snaplistsize = 1;
1701		} else
1702			snaplistsize = ufs_rw64(snaplistsize, ns);
1703		snapblklist = malloc(
1704		    snaplistsize * sizeof(ufs2_daddr_t), M_UFSMNT, M_WAITOK);
1705		if (error)
1706			snapblklist[0] = 1;
1707		else {
1708			error = vn_rdwr(UIO_READ, vp, (caddr_t)snapblklist,
1709			    snaplistsize * sizeof(ufs2_daddr_t),
1710			    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
1711			    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT,
1712			    l->l_cred, NULL, NULL);
1713			for (i = 0; i < snaplistsize; i++)
1714				snapblklist[i] = ufs_rw64(snapblklist[i], ns);
1715			if (error) {
1716				printf("ffs_snapshot_mount: read_2 failed %d\n",
1717				    error);
1718				snapblklist[0] = 1;
1719			}
1720		}
1721		ip->i_snapblklist = &snapblklist[0];
1722
1723		/*
1724		 * If there already exist snapshots on this filesystem, grab a
1725		 * reference to their shared lock. If this is the first snapshot
1726		 * on this filesystem, we need to allocate a lock for the
1727		 * snapshots to share. In either case, acquire the snapshot
1728		 * lock and give up our original private lock.
1729		 */
1730		VI_LOCK(devvp);
1731		if ((xp = TAILQ_FIRST(&ump->um_snapshots)) != NULL) {
1732			struct lock *lkp;
1733
1734			lkp = ITOV(xp)->v_vnlock;
1735			VI_UNLOCK(devvp);
1736			VI_LOCK(vp);
1737			vp->v_vnlock = lkp;
1738		} else {
1739			struct lock *lkp;
1740
1741			VI_UNLOCK(devvp);
1742			MALLOC(lkp, struct lock *, sizeof(struct lock),
1743			    M_UFSMNT, M_WAITOK);
1744			lockinit(lkp, PVFS, "snaplk", 0, LK_CANRECURSE);
1745			VI_LOCK(vp);
1746			vp->v_vnlock = lkp;
1747		}
1748		vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY);
1749		transferlockers(&vp->v_lock, vp->v_vnlock);
1750		lockmgr(&vp->v_lock, LK_RELEASE, NULL);
1751		/*
1752		 * Link it onto the active snapshot list.
1753		 */
1754		VI_LOCK(devvp);
1755		if (ip->i_nextsnap.tqe_prev != 0)
1756			panic("ffs_snapshot_mount: %llu already on list",
1757			    (unsigned long long)ip->i_number);
1758		else
1759			TAILQ_INSERT_TAIL(&ump->um_snapshots, ip, i_nextsnap);
1760		vp->v_flag |= VSYSTEM;
1761		VI_UNLOCK(devvp);
1762		VOP_UNLOCK(vp, 0);
1763	}
1764	/*
1765	 * No usable snapshots found.
1766	 */
1767	if (vp == NULL)
1768		return;
1769	/*
1770	 * Attach the block hints list. We always want to
1771	 * use the list from the newest snapshot.
1772	 */
1773	xp = TAILQ_LAST(&ump->um_snapshots, inodelst);
1774	VI_LOCK(devvp);
1775	ump->um_snapblklist = xp->i_snapblklist;
1776	VI_UNLOCK(devvp);
1777	vn_cow_establish(devvp, ffs_copyonwrite, devvp);
1778}
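
/*
 * A minimal user-space sketch of the hint-list layout read above,
 * assuming native byte order and a plain file: one 64-bit count
 * (which includes the count word itself) followed by count-1 sorted
 * logical block numbers.  "fd" and "listoff" stand in for the
 * snapshot vnode and the lblktosize(...) offset; read_hintlist is
 * an invented name.
 */
#include <sys/types.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>

int64_t *
read_hintlist(int fd, off_t listoff)
{
	int64_t n, *list;

	if (pread(fd, &n, sizeof(n), listoff) != sizeof(n) || n < 1)
		return NULL;	/* unreadable: caller falls back to "empty" */
	if ((list = malloc(n * sizeof(*list))) == NULL)
		return NULL;
	if (pread(fd, list, n * sizeof(*list), listoff) !=
	    (ssize_t)(n * sizeof(*list))) {
		free(list);
		return NULL;
	}
	return list;	/* list[0] == n, entries at 1..n-1 */
}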
1779
1780/*
1781 * Disassociate snapshot files when unmounting.
1782 */
1783void
1784ffs_snapshot_unmount(struct mount *mp)
1785{
1786	struct ufsmount *ump = VFSTOUFS(mp);
1787	struct vnode *devvp = ump->um_devvp;
1788	struct lock *lkp = NULL;
1789	struct inode *xp;
1790	struct vnode *vp;
1791
1792	VI_LOCK(devvp);
1793	while ((xp = TAILQ_FIRST(&ump->um_snapshots)) != 0) {
1794		vp = ITOV(xp);
1795		lkp = vp->v_vnlock;
1796		vp->v_vnlock = &vp->v_lock;
1797		TAILQ_REMOVE(&ump->um_snapshots, xp, i_nextsnap);
1798		xp->i_nextsnap.tqe_prev = 0;
1799		if (xp->i_snapblklist == ump->um_snapblklist)
1800			ump->um_snapblklist = NULL;
1801		VI_UNLOCK(devvp);
1802		FREE(xp->i_snapblklist, M_UFSMNT);
1803		if (xp->i_ffs_effnlink > 0)
1804			vrele(vp);
1805		VI_LOCK(devvp);
1806	}
1807	VI_UNLOCK(devvp);
1808	if (lkp != NULL) {
1809		vn_cow_disestablish(devvp, ffs_copyonwrite, devvp);
1810		FREE(lkp, M_UFSMNT);
1811	}
1812}
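
/*
 * A user-space sketch of the unmount drain loop above, using the same
 * <sys/queue.h> TAILQ macros: repeatedly take the head element and
 * unlink it until the list is empty.  (The kernel version also drops
 * and retakes the device interlock around the per-entry work.)
 */
#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct snap {
	TAILQ_ENTRY(snap) entries;
	int ino;
};
TAILQ_HEAD(snaplist, snap);

int
main(void)
{
	struct snaplist head = TAILQ_HEAD_INITIALIZER(head);
	struct snap *sp;
	int i;

	for (i = 0; i < 3; i++) {
		sp = malloc(sizeof(*sp));
		sp->ino = 100 + i;
		TAILQ_INSERT_TAIL(&head, sp, entries);
	}
	while ((sp = TAILQ_FIRST(&head)) != NULL) {
		TAILQ_REMOVE(&head, sp, entries);
		printf("detached snapshot inode %d\n", sp->ino);
		free(sp);
	}
	return 0;
}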
1813
1814/*
1815 * Check for need to copy block that is about to be written,
1816 * copying the block if necessary.
1817 */
1818static int
1819ffs_copyonwrite(void *v, struct buf *bp)
1820{
1821	struct buf *ibp;
1822	struct fs *fs;
1823	struct inode *ip;
1824	struct vnode *devvp = v, *vp = NULL, *saved_vp = NULL;
1825	struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint);
1826	caddr_t saved_data = NULL;
1827	ufs2_daddr_t lbn, blkno, *snapblklist;
1828	int lower, upper, mid, s, ns, indiroff, snapshot_locked = 0, error = 0;
1829
1830	/*
1831	 * Check for valid snapshots.
1832	 */
1833	VI_LOCK(devvp);
1834	ip = TAILQ_FIRST(&ump->um_snapshots);
1835	if (ip == NULL) {
1836		VI_UNLOCK(devvp);
1837		return 0;
1838	}
1839	/*
1840	 * First check to see if it is in the preallocated list.
1841	 * By doing this check we avoid several potential deadlocks.
1842	 */
1843	fs = ip->i_fs;
1844	ns = UFS_FSNEEDSWAP(fs);
1845	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
1846	snapblklist = ump->um_snapblklist;
1847	upper = snapblklist[0] - 1;
1848	lower = 1;
1849	while (lower <= upper) {
1850		mid = (lower + upper) / 2;
1851		if (snapblklist[mid] == lbn)
1852			break;
1853		if (snapblklist[mid] < lbn)
1854			lower = mid + 1;
1855		else
1856			upper = mid - 1;
1857	}
1858	if (lower <= upper) {
1859		VI_UNLOCK(devvp);
1860		return 0;
1861	}
1862	/*
1863	 * Not in the precomputed list, so check the snapshots.
1864	 */
1865retry:
1866	TAILQ_FOREACH(ip, &ump->um_snapshots, i_nextsnap) {
1867		vp = ITOV(ip);
1868		/*
1869		 * We ensure that everything of our own that needs to be
1870		 * copied will be done at the time that ffs_snapshot is
1871		 * called. Thus we can skip the check here, which could
1872		 * deadlock doing the lookup in ffs_balloc.
1873		 */
1874		if (bp->b_vp == vp)
1875			continue;
1876		/*
1877		 * Check to see if block needs to be copied. We do not have
1878		 * to hold the snapshot lock while doing this lookup as it
1879		 * will never require any additional allocations for the
1880		 * snapshot inode.
1881		 */
1882		if (lbn < NDADDR) {
1883			blkno = db_get(ip, lbn);
1884		} else {
1885			if (snapshot_locked == 0 &&
1886			    lockmgr(vp->v_vnlock,
1887			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
1888			      VI_MTX(devvp)) != 0) {
1889				VI_LOCK(devvp);
1890				goto retry;
1891			}
1892			snapshot_locked = 1;
1893			s = cow_enter();
1894			error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
1895			   fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
1896			cow_leave(s);
1897			if (error)
1898				break;
1899			indiroff = (lbn - NDADDR) % NINDIR(fs);
1900			blkno = idb_get(ip, ibp->b_data, indiroff);
1901			brelse(ibp);
1902		}
1903#ifdef DIAGNOSTIC
1904		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
1905			panic("ffs_copyonwrite: bad copy block");
1906#endif
1907		if (blkno != 0)
1908			continue;
1909#ifdef DIAGNOSTIC
1910		if (curlwp->l_flag & L_COWINPROGRESS)
1911			printf("ffs_copyonwrite: recursive call\n");
1912#endif
1913		/*
1914		 * Allocate the block into which to do the copy. Since
1915		 * multiple processes may all try to copy the same block,
1916		 * we have to recheck our need to do a copy if we sleep
1917		 * waiting for the lock.
1918		 *
1919		 * Because all snapshots on a filesystem share a single
1920		 * lock, we ensure that we will never be in competition
1921		 * with another process to allocate a block.
1922		 */
1923		if (snapshot_locked == 0 &&
1924		    lockmgr(vp->v_vnlock,
1925		      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
1926		      VI_MTX(devvp)) != 0) {
1927			VI_LOCK(devvp);
1928			goto retry;
1929		}
1930		snapshot_locked = 1;
1931#ifdef DEBUG
1932		if (snapdebug) {
1933			printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
1934			    (unsigned long long)ip->i_number, lbn);
1935			if (bp->b_vp == devvp)
1936				printf("fs metadata");
1937			else
1938				printf("inum %llu", (unsigned long long)
1939				    VTOI(bp->b_vp)->i_number);
1940			printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
1941		}
1942#endif
1943		/*
1944		 * If we have already read the old block contents, then
1945		 * simply copy them to the new block. Note that we need
1946		 * to synchronously write snapshots that have not been
1947		 * unlinked, and hence will be visible after a crash,
1948		 * to ensure their integrity.
1949		 */
1950		if (saved_data) {
1951			error = writevnblk(vp, saved_data, lbn);
1952			if (error)
1953				break;
1954			continue;
1955		}
1956		/*
1957		 * Otherwise, read the old block contents into the buffer.
1958		 */
1959		saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
1960		saved_vp = vp;
1961		if ((error = readfsblk(vp, saved_data, lbn)) != 0) {
1962			free(saved_data, M_UFSMNT);
1963			saved_data = NULL;
1964			break;
1965		}
1966	}
1967	/*
1968	 * Note that we need to synchronously write snapshots that
1969	 * have not been unlinked, and hence will be visible after
1970	 * a crash, to ensure their integrity.
1971	 */
1972	if (saved_data) {
1973		error = writevnblk(saved_vp, saved_data, lbn);
1974		free(saved_data, M_UFSMNT);
1975	}
1976	if (snapshot_locked)
1977		VOP_UNLOCK(vp, 0);
1978	else
1979		VI_UNLOCK(devvp);
1980	return error;
1981}
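
/*
 * A minimal user-space sketch of the preallocated-list check above,
 * assuming the same layout: element 0 holds the number of list
 * elements (count word included) and elements 1..n-1 are sorted
 * logical block numbers.  Returns 1 if "lbn" is already covered by
 * a snapshot and therefore needs no copy-on-write.
 */
#include <stdint.h>
#include <stdio.h>

static int
in_snapblklist(const int64_t *list, int64_t lbn)
{
	int64_t lower = 1, upper = list[0] - 1, mid;

	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (list[mid] == lbn)
			return 1;
		if (list[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	return 0;
}

int
main(void)
{
	/* Count word 5, then four sorted block numbers. */
	int64_t list[] = { 5, 10, 42, 99, 1000 };

	printf("42 -> %d, 43 -> %d\n",
	    in_snapblklist(list, 42), in_snapblklist(list, 43));
	return 0;
}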
1982
1983/*
1984 * Read the specified block from disk. Vp is usually a snapshot vnode.
1985 */
1986static int
1987readfsblk(struct vnode *vp, caddr_t data, ufs2_daddr_t lbn)
1988{
1989	int error;
1990	struct inode *ip = VTOI(vp);
1991	struct fs *fs = ip->i_fs;
1992	struct buf *nbp;
1993
1994	nbp = getiobuf();
1995	nbp->b_flags = B_READ;
1996	nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
1997	nbp->b_error = 0;
1998	nbp->b_data = data;
1999	nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn));
2000	nbp->b_proc = NULL;
2001	nbp->b_dev = ip->i_devvp->v_rdev;
2002	nbp->b_vp = NULLVP;
2003
2004	DEV_STRATEGY(nbp);
2005
2006	error = biowait(nbp);
2007
2008	putiobuf(nbp);
2009
2010	return error;
2011}
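
/*
 * A user-space analogue of readfsblk() above: read one filesystem
 * block straight from the underlying device with pread(2), bypassing
 * the file's own cache.  DEV_BSIZE (512) and the block-to-byte
 * conversion mirror what fsbtodb()/b_blkno express in the kernel;
 * read_fs_block is an invented name.
 */
#include <stdint.h>
#include <unistd.h>

#define DEV_BSIZE 512

int
read_fs_block(int devfd, void *data, int64_t diskblk, size_t bsize)
{
	ssize_t n;

	n = pread(devfd, data, bsize, (off_t)diskblk * DEV_BSIZE);
	return (n == (ssize_t)bsize) ? 0 : -1;
}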
2012
2013/*
2014 * Read the specified block. Bypass UBC to prevent deadlocks.
2015 */
2016static int
2017readvnblk(struct vnode *vp, caddr_t data, ufs2_daddr_t lbn)
2018{
2019	int error;
2020	daddr_t bn;
2021	off_t offset;
2022	struct inode *ip = VTOI(vp);
2023	struct fs *fs = ip->i_fs;
2024
2025	error = VOP_BMAP(vp, lbn, NULL, &bn, NULL);
2026	if (error)
2027		return error;
2028
2029	if (bn != (daddr_t)-1) {
2030		offset = dbtob(bn);
2031		simple_lock(&vp->v_interlock);
2032		error = VOP_PUTPAGES(vp, trunc_page(offset),
2033		    round_page(offset+fs->fs_bsize),
2034		    PGO_CLEANIT|PGO_SYNCIO|PGO_FREE);
2035		if (error)
2036			return error;
2037
2038		return readfsblk(vp, data, fragstoblks(fs, dbtofsb(fs, bn)));
2039	}
2040
2041	bzero(data, fs->fs_bsize);
2042
2043	return 0;
2044}
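
/*
 * A sketch of the hole handling in readvnblk() above, using lseek(2)
 * with SEEK_DATA where available (an assumption; the kernel code uses
 * VOP_BMAP instead): an unmapped block reads back as zeroes, a mapped
 * one is read from the file.  Linux needs _GNU_SOURCE for SEEK_DATA.
 */
#define _GNU_SOURCE
#include <string.h>
#include <unistd.h>

int
read_block_or_zero(int fd, void *buf, off_t offset, size_t bsize)
{
	off_t dpos = lseek(fd, offset, SEEK_DATA);

	if (dpos == (off_t)-1 || dpos >= offset + (off_t)bsize) {
		memset(buf, 0, bsize);	/* whole block is a hole */
		return 0;
	}
	return pread(fd, buf, bsize, offset) == (ssize_t)bsize ? 0 : -1;
}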
2045
2046/*
2047 * Write the specified block. Bypass UBC to prevent deadlocks.
2048 */
2049static int
2050writevnblk(struct vnode *vp, caddr_t data, ufs2_daddr_t lbn)
2051{
2052	int s, error;
2053	off_t offset;
2054	struct buf *bp;
2055	struct inode *ip = VTOI(vp);
2056	struct fs *fs = ip->i_fs;
2057
2058	offset = lblktosize(fs, (off_t)lbn);
2059	s = cow_enter();
2060	simple_lock(&vp->v_interlock);
2061	error = VOP_PUTPAGES(vp, trunc_page(offset),
2062	    round_page(offset+fs->fs_bsize), PGO_CLEANIT|PGO_SYNCIO|PGO_FREE);
2063	if (error == 0)
2064		error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
2065		    fs->fs_bsize, KERNCRED, B_SYNC, &bp);
2066	cow_leave(s);
2067	if (error)
2068		return error;
2069
2070	bcopy(data, bp->b_data, fs->fs_bsize);
2071	bp->b_flags |= B_NOCACHE;
2072
2073	return bwrite(bp);
2074}
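
/*
 * A loose user-space analogue of writevnblk() above: write a full,
 * aligned block synchronously while keeping the page cache out of the
 * way.  O_DIRECT plus posix_memalign stands in for the kernel's
 * VOP_PUTPAGES-then-bwrite() sequence; this illustrates the intent,
 * not the same mechanism, and assumes bsize is a power of two.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
write_block_sync(const char *path, off_t offset, size_t bsize)
{
	void *buf;
	int fd, rv = -1;

	if (posix_memalign(&buf, bsize, bsize) != 0)
		return -1;
	memset(buf, 0, bsize);	/* a real caller copies its data in */
	fd = open(path, O_WRONLY | O_DIRECT | O_SYNC);
	if (fd >= 0) {
		if (pwrite(fd, buf, bsize, offset) == (ssize_t)bsize)
			rv = 0;
		close(fd);
	}
	free(buf);
	return rv;
}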
2075
2076/*
2077 * Set/reset lwp's L_COWINPROGRESS flag.
2078 * May be called recursively.
2079 */
2080static inline int
2081cow_enter(void)
2082{
2083	struct lwp *l = curlwp;
2084
2085	if (l->l_flag & L_COWINPROGRESS) {
2086		return 0;
2087	} else {
2088		l->l_flag |= L_COWINPROGRESS;
2089		return L_COWINPROGRESS;
2090	}
2091}
2092
2093static inline void
2094cow_leave(int flag)
2095{
2096	struct lwp *l = curlwp;
2097
2098	l->l_flag &= ~flag;
2099}
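
/*
 * A user-space sketch of the cow_enter()/cow_leave() idiom above,
 * assuming a per-thread flag word.  Only the outermost caller gets
 * the flag bit back from enter(), so only it clears the bit on
 * leave(); nested calls get 0 and their leave() is a no-op.
 */
#include <stdio.h>

#define F_COWINPROGRESS 0x1
static int thread_flags;

static int
guard_enter(void)
{
	if (thread_flags & F_COWINPROGRESS)
		return 0;		/* nested: nothing to undo */
	thread_flags |= F_COWINPROGRESS;
	return F_COWINPROGRESS;		/* outermost: undo token */
}

static void
guard_leave(int token)
{
	thread_flags &= ~token;
}

int
main(void)
{
	int outer = guard_enter();
	int inner = guard_enter();	/* recursion: returns 0 */

	guard_leave(inner);		/* no-op */
	printf("still set: %d\n", thread_flags & F_COWINPROGRESS);
	guard_leave(outer);		/* clears the flag */
	printf("cleared: %d\n", thread_flags & F_COWINPROGRESS);
	return 0;
}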
2100
2101/*
2102 * Get/put a disk block address from an inode's direct block array or from
2103 * a buffer of indirect block addresses, handling the fs type (UFS1/UFS2)
2104 * and byte swapping. These functions should go into a global include.
2105 */
2106static inline ufs2_daddr_t
2107db_get(struct inode *ip, int loc)
2108{
2109	if (ip->i_ump->um_fstype == UFS1)
2110		return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
2111	else
2112		return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
2113}
2114
2115static inline void
2116db_assign(struct inode *ip, int loc, ufs2_daddr_t val)
2117{
2118	if (ip->i_ump->um_fstype == UFS1)
2119		ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2120	else
2121		ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2122}
2123
2124static inline ufs2_daddr_t
2125idb_get(struct inode *ip, caddr_t bf, int loc)
2126{
2127	if (ip->i_ump->um_fstype == UFS1)
2128		return ufs_rw32(((ufs1_daddr_t *)(bf))[loc],
2129		    UFS_IPNEEDSWAP(ip));
2130	else
2131		return ufs_rw64(((ufs2_daddr_t *)(bf))[loc],
2132		    UFS_IPNEEDSWAP(ip));
2133}
2134
2135static inline void
2136idb_assign(struct inode *ip, caddr_t bf, int loc, ufs2_daddr_t val)
2137{
2138	if (ip->i_ump->um_fstype == UFS1)
2139		((ufs1_daddr_t *)(bf))[loc] =
2140		    ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2141	else
2142		((ufs2_daddr_t *)(bf))[loc] =
2143		    ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2144}
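
/*
 * A standalone sketch of the accessor pattern above: one call site,
 * two on-disk widths.  "needswap" stands in for UFS_IPNEEDSWAP();
 * byte swapping is modelled with the GCC/Clang __builtin_bswap*
 * builtins so the example runs in user space.
 */
#include <stdint.h>
#include <stdio.h>

enum fstype { UFS1, UFS2 };

static int64_t
addr_get(enum fstype t, const void *bf, int loc, int needswap)
{
	if (t == UFS1) {
		uint32_t v = ((const uint32_t *)bf)[loc];
		return (int32_t)(needswap ? __builtin_bswap32(v) : v);
	} else {
		uint64_t v = ((const uint64_t *)bf)[loc];
		return (int64_t)(needswap ? __builtin_bswap64(v) : v);
	}
}

int
main(void)
{
	uint32_t blk1[] = { 7, 8, 9 };
	uint64_t blk2[] = { 7, 8, 9 };

	printf("%lld %lld\n",
	    (long long)addr_get(UFS1, blk1, 1, 0),
	    (long long)addr_get(UFS2, blk2, 2, 0));
	return 0;
}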
2145