/* ffs_snapshot.c revision 1.9 */
1/*
2 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
3 *
4 * Further information about snapshots can be obtained from:
5 *
6 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
7 *	1614 Oxford Street		mckusick@mckusick.com
8 *	Berkeley, CA 94709-1608		+1-510-843-9542
9 *	USA
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 *
15 * 1. Redistributions of source code must retain the above copyright
16 *    notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 *    notice, this list of conditions and the following disclaimer in the
19 *    documentation and/or other materials provided with the distribution.
20 *
21 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
22 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
34 *
35 *	from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
36 */
37
38#include <sys/cdefs.h>
39__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.9 2005/02/09 16:05:29 hannken Exp $");
40
41#if defined(_KERNEL_OPT)
42#include "opt_ffs.h"
43#endif
44
45#include <sys/param.h>
46#include <sys/kernel.h>
47#include <sys/systm.h>
48#include <sys/conf.h>
49#include <sys/buf.h>
50#include <sys/proc.h>
51#include <sys/namei.h>
52#include <sys/sched.h>
53#include <sys/stat.h>
54#include <sys/malloc.h>
55#include <sys/mount.h>
56#include <sys/resource.h>
57#include <sys/resourcevar.h>
58#include <sys/vnode.h>
59
60#include <miscfs/specfs/specdev.h>
61
62#include <ufs/ufs/quota.h>
63#include <ufs/ufs/ufsmount.h>
64#include <ufs/ufs/inode.h>
65#include <ufs/ufs/ufs_extern.h>
66#include <ufs/ufs/ufs_bswap.h>
67
68#include <ufs/ffs/fs.h>
69#include <ufs/ffs/ffs_extern.h>
70
71/* FreeBSD -> NetBSD conversion */
72#define KERNCRED	proc0.p_ucred
73#define ufs1_daddr_t	int32_t
74#define ufs2_daddr_t	int64_t
75#define ufs_lbn_t	daddr_t
76#define VI_MTX(v)	(&(v)->v_interlock)
77#define VI_LOCK(v)	simple_lock(&(v)->v_interlock)
78#define VI_UNLOCK(v)	simple_unlock(&(v)->v_interlock)
79#define MNT_ILOCK(v)	simple_lock(&mntvnode_slock)
80#define MNT_IUNLOCK(v)	simple_unlock(&mntvnode_slock)
81
82static int cgaccount(int, struct vnode *, caddr_t, int);
83static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
84    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
85    ufs_lbn_t, int), int);
86static int indiracct_ufs1(struct vnode *, struct vnode *, int,
87    ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
88    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
89    ufs_lbn_t, int), int);
90static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
91    struct fs *, ufs_lbn_t, int);
92static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
93    struct fs *, ufs_lbn_t, int);
94static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
95    struct fs *, ufs_lbn_t, int);
96static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
97    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
98    ufs_lbn_t, int), int);
99static int indiracct_ufs2(struct vnode *, struct vnode *, int,
100    ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
101    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
102    ufs_lbn_t, int), int);
103static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
104    struct fs *, ufs_lbn_t, int);
105static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
106    struct fs *, ufs_lbn_t, int);
107static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
108    struct fs *, ufs_lbn_t, int);
109static int ffs_copyonwrite(void *, struct buf *);
110static int readfsblk(struct vnode *, caddr_t, ufs2_daddr_t);
111static int readvnblk(struct vnode *, caddr_t, ufs2_daddr_t);
112static int writevnblk(struct vnode *, caddr_t, ufs2_daddr_t);
113static inline int cow_enter(void);
114static inline void cow_leave(int);
115static inline ufs2_daddr_t db_get(struct inode *, int);
116static inline void db_assign(struct inode *, int, ufs2_daddr_t);
117static inline ufs2_daddr_t idb_get(struct inode *, caddr_t, int);
118static inline void idb_assign(struct inode *, caddr_t, int, ufs2_daddr_t);
119
120#ifdef DEBUG
121static int snapdebug = 0;
122#endif
123
124/*
125 * Create a snapshot file and initialize it for the filesystem.
126 * Vnode is locked on entry and return.
127 */
128int
129ffs_snapshot(mp, vp, ctime)
130	struct mount *mp;
131	struct vnode *vp;
132	struct timespec *ctime;
133{
134	ufs2_daddr_t numblks, blkno, *blkp, snaplistsize = 0, *snapblklist;
135	int error, ns, cg, snaploc;
136	int i, size, len, loc;
137	int flag = mp->mnt_flag;
138	struct timeval starttime;
139#ifdef DEBUG
140	struct timeval endtime;
141#endif
142	struct timespec ts;
143	long redo = 0;
144	int32_t *lp;
145	void *space;
146	caddr_t cgbuf;
147	struct ufsmount *ump = VFSTOUFS(mp);
148	struct fs *copy_fs = NULL, *fs = ump->um_fs;
149	struct proc *p = curproc;
150	struct inode *ip, *xp;
151	struct buf *bp, *ibp;
152	struct vattr vat;
153	struct vnode *xvp, *nvp, *devvp;
154
155	ns = UFS_FSNEEDSWAP(fs);
156	/*
157	 * Need to serialize access to snapshot code per filesystem.
158	 */
159	/*
160	 * If the vnode already is a snapshot, return.
161	 */
162	if (VTOI(vp)->i_flags & SF_SNAPSHOT) {
163		if (ctime) {
164			ctime->tv_sec = DIP(VTOI(vp), mtime);
165			ctime->tv_nsec = DIP(VTOI(vp), mtimensec);
166		}
167		return 0;
168	}
169	/*
170	 * Check mount, exclusive reference and owner.
171	 */
172	if (vp->v_mount != mp)
173		return EXDEV;
174	if (vp->v_usecount != 1 || vp->v_writecount != 0)
175		return EBUSY;
176	if (suser(p->p_ucred, &p->p_acflag) != 0 &&
177	    VTOI(vp)->i_uid != p->p_ucred->cr_uid)
178		return EACCES;
179
180	if (vp->v_size != 0) {
181		error = VOP_TRUNCATE(vp, 0, 0, NOCRED, p);
182		if (error)
183			return error;
184	}
185	/*
186	 * Assign a snapshot slot in the superblock.
187	 */
188	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
189		if (fs->fs_snapinum[snaploc] == 0)
190			break;
191	if (snaploc == FSMAXSNAP)
192		return (ENOSPC);
193	ip = VTOI(vp);
194	devvp = ip->i_devvp;
195	/*
196	 * Allocate and copy the last block contents so as to be able
197	 * to set size to that of the filesystem.
198	 */
199	numblks = howmany(fs->fs_size, fs->fs_frag);
200	cgbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
201	if ((error = readfsblk(vp, cgbuf, numblks - 1)) != 0)
202		goto out;
203	error = vn_rdwr(UIO_WRITE, vp,
204	    cgbuf, fs->fs_bsize, lblktosize(fs, (off_t)(numblks - 1)),
205	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, p->p_ucred, NULL, NULL);
206	if (error)
207		goto out;
208	/*
209	 * Preallocate critical data structures so that we can copy
210	 * them in without further allocation after we suspend all
211	 * operations on the filesystem. We would like to just release
212	 * the allocated buffers without writing them since they will
213	 * be filled in below once we are ready to go, but this upsets
214	 * the soft update code, so we go ahead and write the new buffers.
215	 *
216	 * Allocate all indirect blocks and mark all of them as not
217	 * needing to be copied.
218	 */
219	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
220		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
221		    fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp);
222		if (error)
223			goto out;
224		bwrite(ibp);
225	}
226	/*
227	 * Allocate copies for the superblock and its summary information.
228	 */
229	bzero(cgbuf, fs->fs_bsize);
230	blkno = lblkno(fs, fs->fs_sblockloc);
231	for (loc = 0; loc < howmany(fs->fs_sbsize, fs->fs_bsize); loc++)
232		if ((error = writevnblk(vp, cgbuf, blkno + loc)) != 0)
233			goto out;
234	blkno = fragstoblks(fs, fs->fs_csaddr);
235	for (loc = 0; loc < howmany(fs->fs_cssize, fs->fs_bsize); loc++)
236		if ((error = writevnblk(vp, cgbuf, blkno + loc)) != 0)
237			goto out;
238	/*
239	 * Allocate all cylinder group blocks.
240	 */
241	for (cg = 0; cg < fs->fs_ncg; cg++)
242		if ((error = writevnblk(vp, cgbuf,
243		    fragstoblks(fs, cgtod(fs, cg)))) != 0)
244			goto out;
245	/*
246	 * Copy all the cylinder group maps. Although the
247	 * filesystem is still active, we hope that only a few
248	 * cylinder groups will change between now and when we
249	 * suspend operations. Thus, we will be able to quickly
250	 * touch up the few cylinder groups that changed during
251	 * the suspension period.
252	 */
253	len = howmany(fs->fs_ncg, NBBY);
254	MALLOC(fs->fs_active, u_char *, len, M_DEVBUF, M_WAITOK | M_ZERO);
255	for (cg = 0; cg < fs->fs_ncg; cg++) {
256		if ((error = cgaccount(cg, vp, cgbuf, 1)) != 0)
257			goto out;
258		if ((error = writevnblk(vp, cgbuf,
259		    fragstoblks(fs, cgtod(fs, cg)))) != 0)
260			goto out;
261	}
262	/*
263	 * Change inode to snapshot type file.
264	 */
265	ip->i_flags |= SF_SNAPSHOT;
266	DIP_ASSIGN(ip, flags, ip->i_flags);
267	ip->i_flag |= IN_CHANGE | IN_UPDATE;
268	/*
269	 * Ensure that the snapshot is completely on disk.
270	 * Since we have marked it as a snapshot it is safe to
271	 * unlock it as no process will be allowed to write to it.
272	 */
273	if ((error = VOP_FSYNC(vp, KERNCRED, FSYNC_WAIT, 0, 0, p)) != 0)
274		goto out;
275	VOP_UNLOCK(vp, 0);
276	/*
277	 * All allocations are done, so we can now snapshot the system.
278	 *
279	 * Suspend operation on filesystem.
280	 */
281	if ((error = vfs_write_suspend(vp->v_mount, PUSER|PCATCH, 0)) != 0) {
282		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
283		goto out;
284	}
285	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
286	microtime(&starttime);
287	/*
288	 * First, copy all the cylinder group maps that have changed.
289	 */
290	for (cg = 0; cg < fs->fs_ncg; cg++) {
291		if (ACTIVECG_ISSET(fs, cg))
292			continue;
293		redo++;
294		if ((error = cgaccount(cg, vp, cgbuf, 2)) != 0)
295			goto out1;
296		if ((error = writevnblk(vp, cgbuf,
297		    fragstoblks(fs, cgtod(fs, cg)))) != 0)
298			goto out1;
299	}
300	/*
301	 * Grab a copy of the superblock and its summary information.
302	 * We delay writing it until the suspension is released below.
303	 */
304	loc = blkoff(fs, fs->fs_sblockloc);
305	if (loc > 0)
306		bzero(&cgbuf[0], loc);
307	copy_fs = (struct fs *)(cgbuf + loc);
308	bcopy(fs, copy_fs, fs->fs_sbsize);
309	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
310	if (fs->fs_sbsize < size)
311		bzero(&cgbuf[loc + fs->fs_sbsize], size - fs->fs_sbsize);
312	size = blkroundup(fs, fs->fs_cssize);
313	if (fs->fs_contigsumsize > 0)
314		size += fs->fs_ncg * sizeof(int32_t);
315	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
316	copy_fs->fs_csp = space;
317	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
318	(char *)space += fs->fs_cssize;
319	loc = howmany(fs->fs_cssize, fs->fs_fsize);
320	i = fs->fs_frag - loc % fs->fs_frag;
321	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
322	if (len > 0) {
323		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
324		    len, KERNCRED, &bp)) != 0) {
325			brelse(bp);
326			free(copy_fs->fs_csp, M_UFSMNT);
327			goto out1;
328		}
329		bcopy(bp->b_data, space, (u_int)len);
330		(char *)space += len;
331		bp->b_flags |= B_INVAL | B_NOCACHE;
332		brelse(bp);
333	}
334	if (fs->fs_contigsumsize > 0) {
335		copy_fs->fs_maxcluster = lp = space;
336		for (i = 0; i < fs->fs_ncg; i++)
337			*lp++ = fs->fs_contigsumsize;
338	}
339	/*
340	 * We must check for active files that have been unlinked
341	 * (e.g., with a zero link count). We have to expunge all
342	 * trace of these files from the snapshot so that they are
343	 * not reclaimed prematurely by fsck or unnecessarily dumped.
344	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
345	 * spec_strategy about writing on a suspended filesystem.
346	 * Note that we skip unlinked snapshot files as they will
347	 * be handled separately below.
348	 *
349	 * We also calculate the needed size for the snapshot list.
350	 */
351	snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
352	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
353	MNT_ILOCK(mp);
354loop:
355	for (xvp = LIST_FIRST(&mp->mnt_vnodelist); xvp; xvp = nvp) {
356		/*
357		 * Make sure this vnode wasn't reclaimed in getnewvnode().
358		 * Start over if it has (it won't be on the list anymore).
359		 */
360		if (xvp->v_mount != mp)
361			goto loop;
362		nvp = LIST_NEXT(xvp, v_mntvnodes);
363		VI_LOCK(xvp);
364		MNT_IUNLOCK(mp);
365		if ((xvp->v_flag & VXLOCK) ||
366		    xvp->v_usecount == 0 || xvp->v_type == VNON ||
367		    (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
368			VI_UNLOCK(xvp);
369			MNT_ILOCK(mp);
370			continue;
371		}
372		if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) {
373			MNT_ILOCK(mp);
374			goto loop;
375		}
376#ifdef DEBUG
377		if (snapdebug)
378			vprint("ffs_snapshot: busy vnode", xvp);
379#endif
380		if (VOP_GETATTR(xvp, &vat, p->p_ucred, p) == 0 &&
381		    vat.va_nlink > 0) {
382			VOP_UNLOCK(xvp, 0);
383			MNT_ILOCK(mp);
384			continue;
385		}
386		xp = VTOI(xvp);
387		if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
388			VOP_UNLOCK(xvp, 0);
389			MNT_ILOCK(mp);
390			continue;
391		}
392		/*
393		 * If there is a fragment, clear it here.
394		 */
395		blkno = 0;
396		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
397		if (loc < NDADDR) {
398			len = fragroundup(fs, blkoff(fs, xp->i_size));
399			if (len > 0 && len < fs->fs_bsize) {
400				ffs_blkfree(copy_fs, vp, db_get(xp, loc),
401				    len, xp->i_number);
402				blkno = db_get(xp, loc);
403				db_assign(xp, loc, 0);
404			}
405		}
406		snaplistsize += 1;
407		if (xp->i_ump->um_fstype == UFS1)
408			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
409			    BLK_NOCOPY);
410		else
411			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
412			    BLK_NOCOPY);
413		if (blkno)
414			db_assign(xp, loc, blkno);
415		if (!error)
416			error = ffs_freefile(copy_fs, vp, xp->i_number,
417			    xp->i_mode);
418		VOP_UNLOCK(xvp, 0);
419		if (error) {
420			free(copy_fs->fs_csp, M_UFSMNT);
421			goto out1;
422		}
423		MNT_ILOCK(mp);
424	}
425	MNT_IUNLOCK(mp);
426	/*
427	 * If there already exist snapshots on this filesystem, grab a
428	 * reference to their shared lock. If this is the first snapshot
429	 * on this filesystem, we need to allocate a lock for the snapshots
430	 * to share. In either case, acquire the snapshot lock and give
431	 * up our original private lock.
432	 */
433	VI_LOCK(devvp);
434	if ((xp = TAILQ_FIRST(&ump->um_snapshots)) != NULL) {
435		struct lock *lkp;
436
437		lkp = ITOV(xp)->v_vnlock;
438		VI_UNLOCK(devvp);
439		VI_LOCK(vp);
440		vp->v_vnlock = lkp;
441	} else {
442		struct lock *lkp;
443
444		VI_UNLOCK(devvp);
445		MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT,
446		    M_WAITOK);
447		lockinit(lkp, PVFS, "snaplk", 0, LK_CANRECURSE);
448		VI_LOCK(vp);
449		vp->v_vnlock = lkp;
450	}
451	vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY);
452	transferlockers(&vp->v_lock, vp->v_vnlock);
453	lockmgr(&vp->v_lock, LK_RELEASE, NULL);
454	/*
455	 * If this is the first snapshot on this filesystem, then we need
456	 * to allocate the space for the list of preallocated snapshot blocks.
457	 * This list will be refined below, but this preliminary one will
458	 * keep us out of deadlock until the full one is ready.
459	 */
460	if (xp == NULL) {
461		MALLOC(snapblklist, ufs2_daddr_t *,
462		    snaplistsize * sizeof(ufs2_daddr_t), M_UFSMNT, M_WAITOK);
463		blkp = &snapblklist[1];
464		*blkp++ = ufs_rw64(lblkno(fs, fs->fs_sblockloc), ns);
465		blkno = fragstoblks(fs, fs->fs_csaddr);
466		for (cg = 0; cg < fs->fs_ncg; cg++) {
467			if (fragstoblks(fs, cgtod(fs, cg) > blkno))
468				break;
469			*blkp++ = ufs_rw64(fragstoblks(fs, cgtod(fs, cg)), ns);
470		}
471		len = howmany(fs->fs_cssize, fs->fs_bsize);
472		for (loc = 0; loc < len; loc++)
473			*blkp++ = ufs_rw64(blkno + loc, ns);
474		for (; cg < fs->fs_ncg; cg++)
475			*blkp++ = ufs_rw64(fragstoblks(fs, cgtod(fs, cg)), ns);
476		snapblklist[0] = ufs_rw64(blkp - snapblklist, ns);
477		VI_LOCK(devvp);
478		if (ump->um_snapblklist != NULL)
479			panic("ffs_snapshot: non-empty list");
480		ump->um_snapblklist = snapblklist;
481		ump->um_snaplistsize = blkp - snapblklist;
482		VI_UNLOCK(devvp);
483	}
484	/*
485	 * Record snapshot inode. Since this is the newest snapshot,
486	 * it must be placed at the end of the list.
487	 */
488	VI_LOCK(devvp);
489	fs->fs_snapinum[snaploc] = ip->i_number;
490	if (ip->i_nextsnap.tqe_prev != 0)
491		panic("ffs_snapshot: %d already on list", ip->i_number);
492	TAILQ_INSERT_TAIL(&ump->um_snapshots, ip, i_nextsnap);
493	VI_UNLOCK(devvp);
494	if (xp == NULL)
495		vn_cow_establish(devvp, ffs_copyonwrite, devvp);
496	vp->v_flag |= VSYSTEM;
497out1:
498	/*
499	 * Resume operation on filesystem.
500	 */
501	vfs_write_resume(vp->v_mount);
502	/*
503	 * Set the mtime to the time the snapshot has been taken.
504	 */
505	TIMEVAL_TO_TIMESPEC(&starttime, &ts);
506	if (ctime)
507		*ctime = ts;
508	DIP_ASSIGN(ip, mtime, ts.tv_sec);
509	DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
510	ip->i_flag |= IN_CHANGE | IN_UPDATE;
511
512#ifdef DEBUG
513	if (starttime.tv_sec > 0) {
514		microtime(&endtime);
515		timersub(&endtime, &starttime, &endtime);
516		printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
517		    vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec,
518		    endtime.tv_usec / 1000, redo, fs->fs_ncg);
519	}
520#endif
521	if (error)
522		goto out;
523	/*
524	 * Copy allocation information from all the snapshots in
525	 * this snapshot and then expunge them from its view.
526	 */
527	TAILQ_FOREACH(xp, &ump->um_snapshots, i_nextsnap) {
528		if (xp == ip)
529			break;
530		if (xp->i_ump->um_fstype == UFS1)
531			error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
532			    BLK_SNAP);
533		else
534			error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
535			    BLK_SNAP);
536		if (error) {
537			fs->fs_snapinum[snaploc] = 0;
538			goto done;
539		}
540	}
541	/*
542	 * Allocate space for the full list of preallocated snapshot blocks.
543	 */
544	MALLOC(snapblklist, ufs2_daddr_t *, snaplistsize * sizeof(ufs2_daddr_t),
545	    M_UFSMNT, M_WAITOK);
546	ip->i_snapblklist = &snapblklist[1];
547	/*
548	 * Expunge the blocks used by the snapshots from the set of
549	 * blocks marked as used in the snapshot bitmaps. Also, collect
550	 * the list of allocated blocks in i_snapblklist.
551	 */
552	if (ip->i_ump->um_fstype == UFS1)
553		error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP);
554	else
555		error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP);
556	if (error) {
557		fs->fs_snapinum[snaploc] = 0;
558		FREE(snapblklist, M_UFSMNT);
559		goto done;
560	}
561	if (snaplistsize < ip->i_snapblklist - snapblklist)
562		panic("ffs_snapshot: list too small");
563	snaplistsize = ip->i_snapblklist - snapblklist;
564	snapblklist[0] = ufs_rw64(snaplistsize, ns);
565	ip->i_snapblklist = 0;
566	/*
567	 * Write out the list of allocated blocks to the end of the snapshot.
568	 */
569	error = vn_rdwr(UIO_WRITE, vp,
570	    (caddr_t)snapblklist, snaplistsize*sizeof(ufs2_daddr_t), ip->i_size,
571	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, p->p_ucred, NULL, NULL);
572	if (error) {
573		fs->fs_snapinum[snaploc] = 0;
574		FREE(snapblklist, M_UFSMNT);
575		goto done;
576	}
577	/*
578	 * Write the superblock and its summary information
579	 * to the snapshot.
580	 */
581	blkno = fragstoblks(fs, fs->fs_csaddr);
582	len = howmany(fs->fs_cssize, fs->fs_bsize);
583	space = copy_fs->fs_csp;
584#ifdef FFS_EI
585	if (ns) {
586		ffs_sb_swap(copy_fs, copy_fs);
587		ffs_csum_swap(space, space, fs->fs_cssize);
588	}
589#endif
590	for (loc = 0; loc < len; loc++) {
591		if ((error = writevnblk(vp, space, blkno + loc)) != 0) {
592			fs->fs_snapinum[snaploc] = 0;
593			FREE(snapblklist, M_UFSMNT);
594			goto done;
595		}
596		space = (char *)space + fs->fs_bsize;
597	}
598	/*
599	 * As this is the newest list, it is the most inclusive, so
600	 * should replace the previous list.
601	 */
602	VI_LOCK(devvp);
603	space = ump->um_snapblklist;
604	ump->um_snapblklist = snapblklist;
605	ump->um_snaplistsize = snaplistsize;
606	VI_UNLOCK(devvp);
607	if (space != NULL)
608		FREE(space, M_UFSMNT);
609done:
610	free(copy_fs->fs_csp, M_UFSMNT);
611	blkno = lblkno(fs, fs->fs_sblockloc);
612	if (error == 0 && (error = writevnblk(vp, cgbuf, blkno)) != 0)
613		fs->fs_snapinum[snaploc] = 0;
614out:
615	/*
616	 * All block address modifications are done. Invalidate and free
617	 * all pages on the snapshot vnode. Those coming from read ahead
618	 * are no longer valid.
619	 */
620	if (!error) {
621		simple_lock(&vp->v_interlock);
622		error = VOP_PUTPAGES(vp, 0, 0,
623		    PGO_ALLPAGES|PGO_CLEANIT|PGO_SYNCIO|PGO_FREE);
624	}
625	if (cgbuf)
626		free(cgbuf, M_UFSMNT);
627	if (fs->fs_active != 0) {
628		FREE(fs->fs_active, M_DEVBUF);
629		fs->fs_active = 0;
630	}
631	mp->mnt_flag = flag;
632	if (error)
633		(void) VOP_TRUNCATE(vp, (off_t)0, 0, NOCRED, p);
634	else
635		vref(vp);
636	return (error);
637}
638
639/*
640 * Copy a cylinder group map. All the unallocated blocks are marked
641 * BLK_NOCOPY so that the snapshot knows that it need not copy them
642 * if they are later written. If passno is one, then this is a first
643 * pass, so only setting needs to be done. If passno is 2, then this
644 * is a revision to a previous pass which must be undone as the
645 * replacement pass is done.
646 */
static int
cgaccount(cg, vp, data, passno)
	int cg;		/* cylinder group number to copy */
	struct vnode *vp;	/* snapshot vnode */
	caddr_t data;	/* caller-supplied fs_bsize scratch buffer */
	int passno;	/* 1 = initial pass, 2 = redo after suspension */
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	ufs2_daddr_t base, numblks;
	int error, len, loc, ns, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	ns = UFS_FSNEEDSWAP(fs);
	/* Read the on-disk cylinder group and validate its magic number. */
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
		(int)fs->fs_cgsize, KERNCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, ns)) {
		brelse(bp);
		return (EIO);
	}
	/* Mark this cg as accounted so the post-suspension pass skips it. */
	ACTIVECG_SET(fs, cg);

	/* Copy the cg map into the caller's buffer, zero-padding the tail. */
	bcopy(bp->b_data, data, fs->fs_cgsize);
	brelse(bp);
	if (fs->fs_cgsize < fs->fs_bsize)
		bzero(&data[fs->fs_cgsize],
		    fs->fs_bsize - fs->fs_cgsize);
	/*
	 * Walk the logical blocks covered by this cylinder group and mark
	 * every free block BLK_NOCOPY in the snapshot's block pointers so
	 * the copy-on-write code knows they need no copying.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cg * fs->fs_fpg / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	if (base < NDADDR) {
		/* Direct block pointers of the snapshot inode. */
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
				db_assign(ip, loc, BLK_NOCOPY);
			else if (db_get(ip, loc) == BLK_NOCOPY) {
				/*
				 * Block became allocated since pass 1:
				 * drop the NOCOPY mark on pass 2; on pass 1
				 * a stale mark indicates corruption.
				 */
				if (passno == 2)
					db_assign(ip, loc, 0);
				else if (passno == 1)
					panic("ffs_snapshot: lost direct block");
			}
		}
	}
	/* Continue through the indirect blocks, one buffer at a time. */
	if ((error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0)
		return (error);
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			/* Crossed into the next indirect block. */
			bwrite(ibp);
			if ((error = VOP_BALLOC(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0)
				return (error);
			indiroff = 0;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
			idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY);
		else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) {
			if (passno == 2)
				idb_assign(ip, ibp->b_data, indiroff, 0);
			else if (passno == 1)
				panic("ffs_snapshot: lost indirect block");
		}
	}
	bwrite(ibp);
	return (0);
}
725
726/*
727 * Before expunging a snapshot inode, note all the
728 * blocks that it claims with BLK_SNAP so that fsck will
729 * be able to account for those blocks properly and so
730 * that this snapshot knows that it need not copy them
731 * if the other snapshot holding them is freed. This code
732 * is reproduced once each for UFS1 and UFS2.
733 */
static int
expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;	/* snapshot being built */
	struct inode *cancelip;	/* inode whose blocks are expunged */
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);	/* per-pointer accounting */
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int i, s, error, ns, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs1_dinode *dip;
	struct buf *bp;
	caddr_t buf;

	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = db_get(VTOI(snapvp), lbn);
	} else {
		/* Suppress copy-on-write recursion while allocating. */
		s = cow_enter();
		error = VOP_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		   fs->fs_bsize, KERNCRED, B_METAONLY, &bp);
		cow_leave(s);
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = idb_get(VTOI(snapvp), bp->b_data, indiroff);
		brelse(bp);
	}
	/*
	 * Read the inode block: from the snapshot if already copied,
	 * otherwise straight from the filesystem device.
	 */
	buf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
	if (blkno != 0)
		error = readvnblk(snapvp, buf, lbn);
	else
		error = readfsblk(snapvp, buf, lbn);
	if (error) {
		free(buf, M_UFSMNT);
		return error;
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs1_dinode *)buf + ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags =
	    ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns);
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
	error = writevnblk(snapvp, buf, lbn);
	free(buf, M_UFSMNT);
	if (error)
		return error;
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	/* Direct blocks first, then the indirect pointer array itself. */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_db[0],
	    &cancelip->i_ffs1_db[NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_ib[0],
	    &cancelip->i_ffs1_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	/* Descend each level of indirection that the file size reaches. */
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
		    ufs_rw32(cancelip->i_ffs1_ib[i], ns), lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}
823
824/*
825 * Descend an indirect block chain for vnode cancelvp accounting for all
826 * its indirect blocks in snapvp.
827 */
static int
indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
	    blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;	/* snapshot being built */
	struct vnode *cancelvp;	/* vnode whose indirects are walked */
	int level;		/* levels of indirection below this block */
	ufs1_daddr_t blkno;	/* disk address of this indirect block */
	ufs_lbn_t lbn;		/* (negative) logical block of indirect */
	ufs_lbn_t rlbn;		/* first data lbn covered by this block */
	ufs_lbn_t remblks;	/* data blocks remaining to account */
	ufs_lbn_t blksperindir;	/* data blocks covered per pointer */
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error, ns, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs1_daddr_t last, *bap;
	struct buf *bp;

	ns = UFS_FSNEEDSWAP(fs);

	if (blkno == 0) {
		/* A hole is fine when expunging; otherwise it is corruption. */
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs1: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs1: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readfsblk(bp->b_vp, bp->b_data, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 * Copy them out first so the buffer can be released before the
	 * (potentially long) accounting walk.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	brelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs1(snapvp, cancelvp, level,
		    ufs_rw32(bap[i], ns), lbn, rlbn, remblks, subblksperindir,
		    fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}
904
905/*
906 * Do both snap accounting and map accounting.
907 */
908static int
909fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
910	struct vnode *vp;
911	ufs1_daddr_t *oldblkp, *lastblkp;
912	struct fs *fs;
913	ufs_lbn_t lblkno;
914	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
915{
916	int error;
917
918	if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
919		return (error);
920	return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
921}
922
923/*
924 * Identify a set of blocks allocated in a snapshot inode.
925 */
static int
snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;	/* snapshot vnode */
	ufs1_daddr_t *oldblkp, *lastblkp;	/* [oldblkp, lastblkp) range */
	struct fs *fs;
	ufs_lbn_t lblkno;	/* unused here; kept for acctfunc signature */
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs1_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error, ns;

	ns = UFS_FSNEEDSWAP(fs);

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = ufs_rw32(*oldblkp, ns);
		/* Skip holes and blocks already classified. */
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			/* Pointer lives in the inode itself. */
			blkp = &ip->i_ffs1_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			/* Pointer lives in an indirect block buffer. */
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs1_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		blkno = ufs_rw32(*blkp, ns);
		if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (blkno != 0)
				panic("snapacct_ufs1: bad block");
			*blkp = ufs_rw32(expungetype, ns);
			if (lbn >= NDADDR)
				bwrite(ibp);
		}
	}
	return (0);
}
978
979/*
980 * Account for a set of blocks allocated in a snapshot inode.
981 */
982static int
983mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
984	struct vnode *vp;
985	ufs1_daddr_t *oldblkp, *lastblkp;
986	struct fs *fs;
987	ufs_lbn_t lblkno;
988	int expungetype;
989{
990	ufs1_daddr_t blkno;
991	struct inode *ip;
992	ino_t inum;
993	int acctit, ns;
994
995	ns = UFS_FSNEEDSWAP(fs);
996	ip = VTOI(vp);
997	inum = ip->i_number;
998	if (lblkno == -1)
999		acctit = 0;
1000	else
1001		acctit = 1;
1002	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
1003		blkno = ufs_rw32(*oldblkp, ns);
1004		if (blkno == 0 || blkno == BLK_NOCOPY)
1005			continue;
1006		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
1007			*ip->i_snapblklist++ = ufs_rw64(lblkno, ns);
1008		if (blkno == BLK_SNAP)
1009			blkno = blkstofrags(fs, lblkno);
1010		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
1011	}
1012	return (0);
1013}
1014
/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;	/* snapshot receiving the expunged copy */
	struct inode *cancelip;	/* inode whose block claims are cancelled */
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);	/* per-range accounting hook */
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int i, s, error, ns, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs2_dinode *dip;
	struct buf *bp;
	caddr_t buf;

	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = db_get(VTOI(snapvp), lbn);
	} else {
		/*
		 * Suppress copy-on-write recursion while allocating the
		 * indirect block (see cow_enter/cow_leave below).
		 */
		s = cow_enter();
		error = VOP_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		   fs->fs_bsize, KERNCRED, B_METAONLY, &bp);
		cow_leave(s);
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = idb_get(VTOI(snapvp), bp->b_data, indiroff);
		brelse(bp);
	}
	/*
	 * Read the block holding the inode: through the snapshot if it
	 * has already been copied (blkno != 0), else raw from the device.
	 */
	buf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
	if (blkno != 0)
		error = readvnblk(snapvp, buf, lbn);
	else
		error = readfsblk(snapvp, buf, lbn);
	if (error) {
		free(buf, M_UFSMNT);
		return error;
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs2_dinode *)buf + ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags =
	    ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns);
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
	error = writevnblk(snapvp, buf, lbn);
	free(buf, M_UFSMNT);
	if (error)
		return error;
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	/* Direct block pointers first (logical blocks 0..NDADDR-1). */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_db[0],
	    &cancelip->i_ffs2_db[NDADDR], fs, 0, expungetype)))
		return (error);
	/* The indirect pointers themselves (lblkno -1 = no hint entries). */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_ib[0],
	    &cancelip->i_ffs2_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	/*
	 * Walk each level of indirection.  lbn tracks the negative
	 * metadata block numbers, rlbn the first data block covered
	 * by the indirect block at that level.
	 */
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
		    ufs_rw64(cancelip->i_ffs2_ib[i], ns), lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		/* The extra -1 accounts for the indirect block itself. */
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}
1112
/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
	    blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;	/* snapshot doing the accounting */
	struct vnode *cancelvp;	/* vnode whose indirect chain is walked */
	int level;		/* levels of indirection below this block */
	ufs2_daddr_t blkno;	/* disk address of this indirect block */
	ufs_lbn_t lbn;		/* (negative) metadata lbn of this block */
	ufs_lbn_t rlbn;		/* first data block this block covers */
	ufs_lbn_t remblks;	/* data blocks remaining in this subtree */
	ufs_lbn_t blksperindir;	/* data blocks covered per pointer here */
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);	/* per-range accounting hook */
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error, ns, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t last, *bap;
	struct buf *bp;

	ns = UFS_FSNEEDSWAP(fs);

	if (blkno == 0) {
		/*
		 * An unallocated indirect block is only tolerated when
		 * expunging with BLK_NOCOPY; otherwise it is corruption.
		 */
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs2: missing indir");
	}
	/* Cross-check the caller-supplied lbn against ufs_getlbns(). */
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs2: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readfsblk(bp->b_vp, bp->b_data, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	/* Work on a private copy so the buffer can be released early. */
	MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	brelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs2(snapvp, cancelvp, level,
		    ufs_rw64(bap[i], ns), lbn, rlbn, remblks, subblksperindir,
		    fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}
1193
1194/*
1195 * Do both snap accounting and map accounting.
1196 */
1197static int
1198fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
1199	struct vnode *vp;
1200	ufs2_daddr_t *oldblkp, *lastblkp;
1201	struct fs *fs;
1202	ufs_lbn_t lblkno;
1203	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
1204{
1205	int error;
1206
1207	if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
1208		return (error);
1209	return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
1210}
1211
/*
 * Identify a set of blocks allocated in a snapshot inode.
 * Every allocated block in [*oldblkp, *lastblkp) gets its pointer
 * slot inside the snapshot file rewritten with expungetype
 * (BLK_SNAP or BLK_NOCOPY).
 */
static int
snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;	/* snapshot vnode whose maps are updated */
	ufs2_daddr_t *oldblkp, *lastblkp;	/* block pointer range */
	struct fs *fs;
	ufs_lbn_t lblkno;	/* not referenced here; present to match
				   the acctfunc callback signature */
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs2_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error, ns;

	ns = UFS_FSNEEDSWAP(fs);

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = ufs_rw64(*oldblkp, ns);
		/* Skip holes and entries already holding a BLK_* cookie. */
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		/*
		 * Locate the snapshot's own pointer slot covering this
		 * block: directly in the inode, or inside an indirect
		 * block obtained with VOP_BALLOC.
		 */
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_ffs2_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs2_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		blkno = ufs_rw64(*blkp, ns);
		if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			/* The slot must still read as a hole here. */
			if (blkno != 0)
				panic("snapacct_ufs2: bad block");
			*blkp = ufs_rw64(expungetype, ns);
			/* Push indirect block updates out synchronously. */
			if (lbn >= NDADDR)
				bwrite(ibp);
		}
	}
	return (0);
}
1267
1268/*
1269 * Account for a set of blocks allocated in a snapshot inode.
1270 */
1271static int
1272mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
1273	struct vnode *vp;
1274	ufs2_daddr_t *oldblkp, *lastblkp;
1275	struct fs *fs;
1276	ufs_lbn_t lblkno;
1277	int expungetype;
1278{
1279	ufs2_daddr_t blkno;
1280	struct inode *ip;
1281	ino_t inum;
1282	int acctit, ns;
1283
1284	ns = UFS_FSNEEDSWAP(fs);
1285	ip = VTOI(vp);
1286	inum = ip->i_number;
1287	if (lblkno == -1)
1288		acctit = 0;
1289	else
1290		acctit = 1;
1291	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
1292		blkno = ufs_rw64(*oldblkp, ns);
1293		if (blkno == 0 || blkno == BLK_NOCOPY)
1294			continue;
1295		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
1296			*ip->i_snapblklist++ = ufs_rw64(lblkno, ns);
1297		if (blkno == BLK_SNAP)
1298			blkno = blkstofrags(fs, lblkno);
1299		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
1300	}
1301	return (0);
1302}
1303
1304/*
1305 * Decrement extra reference on snapshot when last name is removed.
1306 * It will not be freed until the last open reference goes away.
1307 */
1308void
1309ffs_snapgone(ip)
1310	struct inode *ip;
1311{
1312	struct ufsmount *ump = VFSTOUFS(ip->i_devvp->v_specmountpoint);
1313	struct inode *xp;
1314	struct fs *fs;
1315	int snaploc;
1316
1317	/*
1318	 * Find snapshot in incore list.
1319	 */
1320	TAILQ_FOREACH(xp, &ump->um_snapshots, i_nextsnap)
1321		if (xp == ip)
1322			break;
1323	if (xp != NULL)
1324		vrele(ITOV(ip));
1325#ifdef DEBUG
1326	else if (snapdebug)
1327		printf("ffs_snapgone: lost snapshot vnode %d\n",
1328		    ip->i_number);
1329#endif
1330	/*
1331	 * Delete snapshot inode from superblock. Keep list dense.
1332	 */
1333	fs = ip->i_fs;
1334	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
1335		if (fs->fs_snapinum[snaploc] == ip->i_number)
1336			break;
1337	if (snaploc < FSMAXSNAP) {
1338		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
1339			if (fs->fs_snapinum[snaploc] == 0)
1340				break;
1341			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
1342		}
1343		fs->fs_snapinum[snaploc - 1] = 0;
1344	}
1345}
1346
/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;	/* snapshot vnode being removed */
{
	struct inode *ip = VTOI(vp);
	struct vnode *devvp = ip->i_devvp;
	struct fs *fs = ip->i_fs;
	struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint);
	struct lock *lkp;
	struct buf *ibp;
	ufs2_daddr_t numblks, blkno, dblk, *snapblklist;
	int error, ns, loc, last;

	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		VI_LOCK(devvp);
		lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE,
		    VI_MTX(devvp));
		/*
		 * lockmgr() consumed the interlock; retake it to
		 * protect the snapshot list manipulation below.
		 */
		VI_LOCK(devvp);
		TAILQ_REMOVE(&ump->um_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		/* Revert from the shared snapshot lock to our own. */
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		lockmgr(lkp, LK_RELEASE, NULL);
		if (TAILQ_FIRST(&ump->um_snapshots) != 0) {
			VI_UNLOCK(devvp);
		} else {
			/*
			 * Last snapshot is gone: drain and free the
			 * shared lock, free the preallocated hint list
			 * and remove the copy-on-write hook.
			 */
			snapblklist = ump->um_snapblklist;
			ump->um_snapblklist = 0;
			ump->um_snaplistsize = 0;
			lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp));
			lockmgr(lkp, LK_RELEASE, NULL);
			vn_cow_disestablish(devvp, ffs_copyonwrite, devvp);
			FREE(lkp, M_UFSMNT);
			FREE(snapblklist, M_UFSMNT);
		}
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 *
	 * A claimed block is recognized by dblk == blkstofrags(fs, blkno)
	 * (see the comment above ffs_snapblkfree).  NOTE(review): the
	 * direct-block loop starts at 1, presumably because lbn 0 is
	 * always a real copied block — confirm.
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = db_get(ip, blkno);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			db_assign(ip, blkno, 0);
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			/* Another snapshot claimed the block. */
			DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
			db_assign(ip, blkno, 0);
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	/* Repeat the same cleanup for every indirect block. */
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			dblk = idb_get(ip, ibp->b_data, loc);
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				idb_assign(ip, ibp->b_data, loc, 0);
			else if (dblk == blkstofrags(fs, blkno) &&
			    ffs_snapblkfree(fs, ip->i_devvp, dblk,
			    fs->fs_bsize, ip->i_number)) {
				DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
				idb_assign(ip, ibp->b_data, loc, 0);
			}
		}
		bwrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}
1439
/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assurred that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(fs, devvp, bno, size, inum)
	struct fs *fs;
	struct vnode *devvp;	/* device vnode carrying the snapshot list */
	ufs2_daddr_t bno;	/* first fragment of the block being freed */
	long size;		/* amount being freed (may be a fragment) */
	ino_t inum;		/* inode the block is being freed from */
{
	struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint);
	struct buf *ibp;
	struct inode *ip;
	struct vnode *vp = NULL, *saved_vp = NULL;
	caddr_t saved_data = NULL;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	int s, indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0;

	lbn = fragstoblks(fs, bno);
	/* Restarted whenever taking the snapshot lock required sleeping. */
retry:
	VI_LOCK(devvp);
	TAILQ_FOREACH(ip, &ump->um_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			/* Need the snapshot lock to walk indirect blocks. */
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			      VI_MTX(devvp)) != 0)
				goto retry;
			snapshot_locked = 1;
			/* Suppress copy-on-write recursion in VOP_BALLOC. */
			s = cow_enter();
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			cow_leave(s);
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = idb_get(ip, ibp->b_data, indiroff);
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
			      VI_MTX(devvp)) != 0) {
				if (lbn >= NDADDR)
					brelse(ibp);
				/* Wait for the holder, then start over. */
				vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL);
				goto retry;
			}
			snapshot_locked = 1;
			if (lbn < NDADDR) {
				db_assign(ip, lbn, BLK_NOCOPY);
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else {
				idb_assign(ip, ibp->b_data, indiroff,
				    BLK_NOCOPY);
				bwrite(ibp);
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= NDADDR)
				brelse(ibp);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
		      VI_MTX(devvp)) != 0) {
			if (lbn >= NDADDR)
				brelse(ibp);
			vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL);
			goto retry;
		}
		snapshot_locked = 1;
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %d lbn %" PRId64 " from inum %d\n",
				    "Grabonremove: snapino", ip->i_number,
				    lbn, inum);
#endif
			/* Claim the block: point the snapshot at it. */
			if (lbn < NDADDR) {
				db_assign(ip, lbn, bno);
			} else {
				idb_assign(ip, ibp->b_data, indiroff, bno);
				bwrite(ibp);
			}
			DIP_ADD(ip, blocks, btodb(size));
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			VOP_UNLOCK(vp, 0);
			/* Non-zero: tell the caller not to free the block. */
			return (1);
		}
		if (lbn >= NDADDR)
			brelse(ibp);
#ifdef DEBUG
		if (snapdebug)
			printf("%s%d lbn %" PRId64 " %s %d size %ld\n",
			    "Copyonremove: snapino ", ip->i_number,
			    lbn, "for inum", inum, size);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (saved_data) {
			error = writevnblk(vp, saved_data, lbn);
			if (error)
				break;
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
		saved_vp = vp;
		if ((error = readfsblk(vp, saved_data, lbn)) != 0) {
			free(saved_data, M_UFSMNT);
			saved_data = NULL;
			break;
		}
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (saved_data) {
		error = writevnblk(saved_vp, saved_data, lbn);
		free(saved_data, M_UFSMNT);
	}
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0);
	else
		VI_UNLOCK(devvp);
	return (error);
}
1635
/*
 * Associate snapshot files when mounting.
 */
void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct vnode *devvp = ump->um_devvp;
	struct fs *fs = ump->um_fs;
	struct proc *p = curproc;
	struct vnode *vp;
	struct inode *ip, *xp;
	ufs2_daddr_t snaplistsize, *snapblklist;
	int error, ns, snaploc, loc;

	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * XXX The following needs to be set before VOP_TRUNCATE or
	 * VOP_READ can be called.
	 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	/*
	 * Process each snapshot listed in the superblock.
	 */
	vp = NULL;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		/* A zero entry terminates the list. */
		if (fs->fs_snapinum[snaploc] == 0)
			break;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
		    &vp)) != 0) {
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		if ((ip->i_flags & SF_SNAPSHOT) == 0) {
			/*
			 * Stale superblock entry: drop it and compact
			 * the remainder of the list over it.
			 */
			printf("ffs_snapshot_mount: non-snapshot inode %d\n",
			    fs->fs_snapinum[snaploc]);
			vput(vp);
			vp = NULL;
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			/* Re-examine the slot we just shifted into. */
			snaploc--;
			continue;
		}
		/*
		 * If there already exist snapshots on this filesystem, grab a
		 * reference to their shared lock. If this is the first snapshot
		 * on this filesystem, we need to allocate a lock for the
		 * snapshots to share. In either case, acquire the snapshot
		 * lock and give up our original private lock.
		 */
		VI_LOCK(devvp);
		if ((xp = TAILQ_FIRST(&ump->um_snapshots)) != NULL) {
			struct lock *lkp;

			lkp = ITOV(xp)->v_vnlock;
			VI_UNLOCK(devvp);
			VI_LOCK(vp);
			vp->v_vnlock = lkp;
		} else {
			struct lock *lkp;

			VI_UNLOCK(devvp);
			MALLOC(lkp, struct lock *, sizeof(struct lock),
			    M_UFSMNT, M_WAITOK);
			lockinit(lkp, PVFS, "snaplk", 0, LK_CANRECURSE);
			VI_LOCK(vp);
			vp->v_vnlock = lkp;
		}
		vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY);
		/* Move waiters from the private lock to the shared one. */
		transferlockers(&vp->v_lock, vp->v_vnlock);
		lockmgr(&vp->v_lock, LK_RELEASE, NULL);
		/*
		 * Link it onto the active snapshot list.
		 */
		VI_LOCK(devvp);
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %d already on list",
			    ip->i_number);
		else
			TAILQ_INSERT_TAIL(&ump->um_snapshots, ip, i_nextsnap);
		vp->v_flag |= VSYSTEM;
		VI_UNLOCK(devvp);
		VOP_UNLOCK(vp, 0);
	}
	/*
	 * No usable snapshots found.
	 */
	if (vp == NULL)
		return;
	/*
	 * Allocate the space for the block hints list. We always want to
	 * use the list from the newest snapshot.
	 *
	 * The list is stored in the snapshot file just past the last
	 * filesystem block; its first word is its own length.
	 */
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = vn_rdwr(UIO_READ, vp,
	    (caddr_t)&snaplistsize, sizeof(snaplistsize),
	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, p->p_ucred, NULL, NULL);
	if (error) {
		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
		VOP_UNLOCK(vp, 0);
		return;
	}
	snaplistsize = ufs_rw64(snaplistsize, ns);
	MALLOC(snapblklist, ufs2_daddr_t *, snaplistsize * sizeof(ufs2_daddr_t),
	    M_UFSMNT, M_WAITOK);
	error = vn_rdwr(UIO_READ, vp,
	    (caddr_t)snapblklist, snaplistsize * sizeof(ufs2_daddr_t),
	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, p->p_ucred, NULL, NULL);
	if (error) {
		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
		VOP_UNLOCK(vp, 0);
		FREE(snapblklist, M_UFSMNT);
		return;
	}
	VOP_UNLOCK(vp, 0);
	/* Publish the list and activate copy-on-write on the device. */
	VI_LOCK(devvp);
	ump->um_snaplistsize = snaplistsize;
	ump->um_snapblklist = snapblklist;
	VI_UNLOCK(devvp);
	vn_cow_establish(devvp, ffs_copyonwrite, devvp);
}
1765
/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct vnode *devvp = ump->um_devvp;
	struct lock *lkp = NULL;
	struct inode *xp;
	struct vnode *vp;

	VI_LOCK(devvp);
	/* Unhook every snapshot from the incore list. */
	while ((xp = TAILQ_FIRST(&ump->um_snapshots)) != 0) {
		vp = ITOV(xp);
		/* Give the vnode back its private lock. */
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		TAILQ_REMOVE(&ump->um_snapshots, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		/*
		 * Drop the list's reference for snapshots that still
		 * have a name (effnlink > 0); vrele() may sleep, so
		 * the interlock is dropped around it.
		 */
		if (xp->i_ffs_effnlink > 0) {
			VI_UNLOCK(devvp);
			vrele(vp);
			VI_LOCK(devvp);
		}
	}
	/* Release the preallocated block hint list. */
	if (ump->um_snapblklist != NULL) {
		FREE(ump->um_snapblklist, M_UFSMNT);
		ump->um_snapblklist = NULL;
		ump->um_snaplistsize = 0;
	}
	VI_UNLOCK(devvp);
	if (lkp != NULL) {
		/*
		 * At least one snapshot existed: remove the
		 * copy-on-write hook and free the shared lock.
		 */
		vn_cow_disestablish(devvp, ffs_copyonwrite, devvp);
		FREE(lkp, M_UFSMNT);
	}
}
1803
/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 * Installed on the device vnode via vn_cow_establish(); returns 0
 * when the write may proceed, or an error number.
 */
static int
ffs_copyonwrite(v, bp)
	void *v;		/* the device vnode (devvp) */
	struct buf *bp;		/* the write about to happen */
{
	struct buf *ibp;
	struct fs *fs;
	struct inode *ip;
	struct vnode *devvp = v, *vp = 0, *saved_vp = NULL;
	struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint);
	caddr_t saved_data = NULL;
	ufs2_daddr_t lbn, blkno, *snapblklist;
	int lower, upper, mid, s, ns, indiroff, snapshot_locked = 0, error = 0;

	/*
	 * Check for valid snapshots.
	 */
	VI_LOCK(devvp);
	ip = TAILQ_FIRST(&ump->um_snapshots);
	if (ip == NULL) {
		VI_UNLOCK(devvp);
		return 0;
	}
	/*
	 * First check to see if it is in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
	 *
	 * Binary search; the scan starts at index 1 because entry 0 of
	 * the list holds its length (see ffs_snapshot_mount).
	 */
	fs = ip->i_fs;
	ns = UFS_FSNEEDSWAP(fs);
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	snapblklist = ump->um_snapblklist;
	upper = ump->um_snaplistsize - 1;
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (ufs_rw64(snapblklist[mid], ns) == lbn)
			break;
		if (ufs_rw64(snapblklist[mid], ns) < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	if (lower <= upper) {
		/* Found: block is already taken care of. */
		VI_UNLOCK(devvp);
		return 0;
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
retry:
	TAILQ_FOREACH(ip, &ump->um_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in VOP_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied. We do not have
		 * to hold the snapshot lock while doing this lookup as it
		 * will never require any additional allocations for the
		 * snapshot inode.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			      VI_MTX(devvp)) != 0) {
				VI_LOCK(devvp);
				goto retry;
			}
			snapshot_locked = 1;
			/* Suppress recursion into this routine. */
			s = cow_enter();
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			   fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			cow_leave(s);
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = idb_get(ip, ibp->b_data, indiroff);
			brelse(ibp);
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		/* Non-zero means already copied or deliberately ignored. */
		if (blkno != 0)
			continue;
#ifdef DIAGNOSTIC
		if (curlwp->l_flag & L_COWINPROGRESS)
			printf("ffs_copyonwrite: recursive call\n");
#endif
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
		      VI_MTX(devvp)) != 0) {
			VI_LOCK(devvp);
			goto retry;
		}
		snapshot_locked = 1;
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %" PRId64 " for ",
			    ip->i_number, lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (saved_data) {
			error = writevnblk(vp, saved_data, lbn);
			if (error)
				break;
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
		saved_vp = vp;
		if ((error = readfsblk(vp, saved_data, lbn)) != 0) {
			free(saved_data, M_UFSMNT);
			saved_data = NULL;
			break;
		}
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (saved_data) {
		error = writevnblk(saved_vp, saved_data, lbn);
		free(saved_data, M_UFSMNT);
	}
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0);
	else
		VI_UNLOCK(devvp);
	return error;
}
1973
1974/*
1975 * Read the specified block from disk. Vp is usually a snapshot vnode.
1976 */
1977static int
1978readfsblk(vp, data, lbn)
1979	struct vnode *vp;
1980	caddr_t data;
1981	ufs2_daddr_t lbn;
1982{
1983	int s, error;
1984	struct inode *ip = VTOI(vp);
1985	struct fs *fs = ip->i_fs;
1986	struct buf *nbp;
1987
1988	s = splbio();
1989	nbp = pool_get(&bufpool, PR_WAITOK);
1990	splx(s);
1991
1992	BUF_INIT(nbp);
1993	nbp->b_flags = B_READ;
1994	nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
1995	nbp->b_error = 0;
1996	nbp->b_data = data;
1997	nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn));
1998	nbp->b_proc = NULL;
1999	nbp->b_dev = ip->i_devvp->v_rdev;
2000	nbp->b_vp = NULLVP;
2001
2002	DEV_STRATEGY(nbp);
2003
2004	error = biowait(nbp);
2005
2006	s = splbio();
2007	pool_put(&bufpool, nbp);
2008	splx(s);
2009
2010	return error;
2011}
2012
2013/*
2014 * Read the specified block. Bypass UBC to prevent deadlocks.
2015 */
2016static int
2017readvnblk(vp, data, lbn)
2018	struct vnode *vp;
2019	caddr_t data;
2020	ufs2_daddr_t lbn;
2021{
2022	int error;
2023	daddr_t bn;
2024	off_t offset;
2025	struct inode *ip = VTOI(vp);
2026	struct fs *fs = ip->i_fs;
2027
2028	error = VOP_BMAP(vp, lbn, NULL, &bn, NULL);
2029	if (error)
2030		return error;
2031
2032	if (bn != (daddr_t)-1) {
2033		offset = dbtob(bn);
2034		simple_lock(&vp->v_interlock);
2035		error = VOP_PUTPAGES(vp, trunc_page(offset),
2036		    round_page(offset+fs->fs_bsize),
2037		    PGO_CLEANIT|PGO_SYNCIO|PGO_FREE);
2038		if (error)
2039			return error;
2040
2041		return readfsblk(vp, data, fragstoblks(fs, dbtofsb(fs, bn)));
2042	}
2043
2044	bzero(data, fs->fs_bsize);
2045
2046	return 0;
2047}
2048
2049/*
2050 * Write the specified block. Bypass UBC to prevent deadlocks.
2051 */
2052static int
2053writevnblk(vp, data, lbn)
2054	struct vnode *vp;
2055	caddr_t data;
2056	ufs2_daddr_t lbn;
2057{
2058	int s, error;
2059	off_t offset;
2060	struct buf *bp;
2061	struct inode *ip = VTOI(vp);
2062	struct fs *fs = ip->i_fs;
2063
2064	offset = lblktosize(fs, (off_t)lbn);
2065	s = cow_enter();
2066	simple_lock(&vp->v_interlock);
2067	error = VOP_PUTPAGES(vp, trunc_page(offset),
2068	    round_page(offset+fs->fs_bsize), PGO_CLEANIT|PGO_SYNCIO|PGO_FREE);
2069	if (error == 0)
2070		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
2071		    fs->fs_bsize, KERNCRED, B_SYNC, &bp);
2072	cow_leave(s);
2073	if (error)
2074		return error;
2075
2076	bcopy(data, bp->b_data, fs->fs_bsize);
2077	bp->b_flags |= B_NOCACHE;
2078
2079	return bwrite(bp);
2080}
2081
2082/*
2083 * Set/reset lwp's L_COWINPROGRESS flag.
2084 * May be called recursive.
2085 */
2086static inline int
2087cow_enter(void)
2088{
2089	struct lwp *l = curlwp;
2090
2091	if (l->l_flag & L_COWINPROGRESS) {
2092		return 0;
2093	} else {
2094		l->l_flag |= L_COWINPROGRESS;
2095		return L_COWINPROGRESS;
2096	}
2097}
2098
2099static inline void
2100cow_leave(int flag)
2101{
2102	struct lwp *l = curlwp;
2103
2104	l->l_flag &= ~flag;
2105}
2106
2107/*
2108 * Get/Put direct block from inode or buffer containing disk addresses. Take
2109 * care for fs type (UFS1/UFS2) and byte swapping. These functions should go
2110 * into a global include.
2111 */
2112static inline ufs2_daddr_t
2113db_get(struct inode *ip, int loc)
2114{
2115	if (ip->i_ump->um_fstype == UFS1)
2116		return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
2117	else
2118		return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
2119}
2120
2121static inline void
2122db_assign(struct inode *ip, int loc, ufs2_daddr_t val)
2123{
2124	if (ip->i_ump->um_fstype == UFS1)
2125		ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2126	else
2127		ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2128}
2129
2130static inline ufs2_daddr_t
2131idb_get(struct inode *ip, caddr_t buf, int loc)
2132{
2133	if (ip->i_ump->um_fstype == UFS1)
2134		return ufs_rw32(((ufs1_daddr_t *)(buf))[loc],
2135		    UFS_IPNEEDSWAP(ip));
2136	else
2137		return ufs_rw64(((ufs2_daddr_t *)(buf))[loc],
2138		    UFS_IPNEEDSWAP(ip));
2139}
2140
2141static inline void
2142idb_assign(struct inode *ip, caddr_t buf, int loc, ufs2_daddr_t val)
2143{
2144	if (ip->i_ump->um_fstype == UFS1)
2145		((ufs1_daddr_t *)(buf))[loc] =
2146		    ufs_rw32(val, UFS_IPNEEDSWAP(ip));
2147	else
2148		((ufs2_daddr_t *)(buf))[loc] =
2149		    ufs_rw64(val, UFS_IPNEEDSWAP(ip));
2150}
2151