/*	$NetBSD: ffs_snapshot.c,v 1.76 2008/08/24 09:51:47 hannken Exp $	*/

/*
 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
 *
 * Further information about snapshots can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
 *
 *	from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.76 2008/08/24 09:51:47 hannken Exp $");

#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#include "opt_wapbl.h"
#endif

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/sched.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/fstrans.h>
#include <sys/wapbl.h>

#include <miscfs/specfs/specdev.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_wapbl.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#include <uvm/uvm.h>

/* FreeBSD -> NetBSD conversion */
#define KERNCRED	lwp0.l_cred
#define ufs1_daddr_t	int32_t
#define ufs2_daddr_t	int64_t
#define ufs_lbn_t	daddr_t
#define VI_MTX(v)	(&(v)->v_interlock)
#define VI_LOCK(v)	mutex_enter(&(v)->v_interlock)
#define VI_UNLOCK(v)	mutex_exit(&(v)->v_interlock)
#define MNT_ILOCK(v)	mutex_enter(&mntvnode_lock)
#define MNT_IUNLOCK(v)	mutex_exit(&mntvnode_lock)

#if !defined(FFS_NO_SNAPSHOT)
typedef int (*acctfunc_t)
    (struct vnode *, void *, int, int, struct fs *, daddr_t, int);

static int cgaccount(int, struct vnode *, void *, int);
static int expunge(struct vnode *, struct inode *, struct fs *,
    acctfunc_t, int);
static int indiracct(struct vnode *, struct vnode *, int, daddr_t,
    daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int);
static int fullacct(struct vnode *, void *, int, int, struct fs *,
    daddr_t, int);
static int snapacct(struct vnode *, void *, int, int, struct fs *,
    daddr_t, int);
static int mapacct(struct vnode *, void *, int, int, struct fs *,
    daddr_t, int);
#endif /* !defined(FFS_NO_SNAPSHOT) */

static int ffs_copyonwrite(void *, struct buf *, bool);
static int snapblkaddr(struct vnode *, daddr_t, daddr_t *);
static int rwfsblk(struct vnode *, int, void *, ufs2_daddr_t);
static int syncsnap(struct vnode *);
static int wrsnapblk(struct vnode *, void *, ufs2_daddr_t);

static inline ufs2_daddr_t db_get(struct inode *, int);
static inline void db_assign(struct inode *, int, ufs2_daddr_t);
static inline ufs2_daddr_t ib_get(struct inode *, int);
static inline void ib_assign(struct inode *, int, ufs2_daddr_t);
static inline ufs2_daddr_t idb_get(struct inode *, void *, int);
static inline void idb_assign(struct inode *, void *, int, ufs2_daddr_t);

struct snap_info {
	kmutex_t si_lock;			/* Lock this snapinfo */
	kmutex_t si_snaplock;			/* Snapshot vnode common lock */
	TAILQ_HEAD(inodelst, inode) si_snapshots; /* List of active snapshots */
	daddr_t *si_snapblklist;		/* Snapshot block hints list */
	uint32_t si_gen;			/* Incremented on change */
};
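
/*
 * Lock order, as used below: si_snaplock is taken before si_lock.
 * si_lock is dropped around operations that may sleep and retaken
 * afterwards; si_gen detects list changes made while it was dropped.
 */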

#ifdef DEBUG
static int snapdebug = 0;
#endif

int
ffs_snapshot_init(struct ufsmount *ump)
{
	struct snap_info *si;

	si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP);
	if (si == NULL)
		return ENOMEM;

	TAILQ_INIT(&si->si_snapshots);
	mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE);
	si->si_gen = 0;
	si->si_snapblklist = NULL;

	return 0;
}

void
ffs_snapshot_fini(struct ufsmount *ump)
{
	struct snap_info *si;

	si = ump->um_snapinfo;
	ump->um_snapinfo = NULL;

	KASSERT(TAILQ_EMPTY(&si->si_snapshots));
	mutex_destroy(&si->si_lock);
	mutex_destroy(&si->si_snaplock);
	KASSERT(si->si_snapblklist == NULL);
	kmem_free(si, sizeof(*si));
}

/*
 * Create a snapshot file and initialize it for the filesystem.
 * Vnode is locked on entry and return.
 */
int
ffs_snapshot(struct mount *mp, struct vnode *vp,
    struct timespec *ctime)
{
#if defined(FFS_NO_SNAPSHOT)
	return EOPNOTSUPP;
}
#else /* defined(FFS_NO_SNAPSHOT) */
	ufs2_daddr_t numblks, blkno, *blkp, snaplistsize = 0, *snapblklist;
	int error, ns, cg, snaploc;
	int i, size, len, loc;
	int flag = mp->mnt_flag;
	struct timeval starttime;
#ifdef DEBUG
	struct timeval endtime;
#endif
	struct timespec ts;
	long redo = 0;
	int32_t *lp;
	void *space;
	void *sbbuf = NULL;
	struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs;
	struct lwp *l = curlwp;
	struct inode *ip, *xp;
	struct buf *bp, *ibp, *nbp;
	struct vattr vat;
	struct vnode *xvp, *mvp, *logvp, *devvp;
	struct snap_info *si;
	bool suspended = false;
	bool snapshot_locked = false;

	ns = UFS_FSNEEDSWAP(fs);
	si = VFSTOUFS(mp)->um_snapinfo;

	/*
	 * Need to serialize access to snapshot code per filesystem.
	 */
	/*
	 * If the vnode already is a snapshot, return.
	 */
	if (VTOI(vp)->i_flags & SF_SNAPSHOT) {
		if (ctime) {
			ctime->tv_sec = DIP(VTOI(vp), mtime);
			ctime->tv_nsec = DIP(VTOI(vp), mtimensec);
		}
		return 0;
	}
	/*
	 * Check mount, exclusive reference and owner.
	 */
	if (vp->v_mount != mp)
		return EXDEV;
	if (vp->v_usecount != 1 || vp->v_writecount != 0)
		return EBUSY;
	if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER,
	    NULL) != 0 &&
	    VTOI(vp)->i_uid != kauth_cred_geteuid(l->l_cred))
		return EACCES;

	if (vp->v_size != 0) {
		error = ffs_truncate(vp, 0, 0, NOCRED);
		if (error)
			return error;
	}
	/*
	 * Assign a snapshot slot in the superblock.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	ip = VTOI(vp);
	devvp = ip->i_devvp;
	if ((fs->fs_flags & FS_DOWAPBL) &&
	    fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
		error = VFS_VGET(mp,
		    fs->fs_journallocs[UFS_WAPBL_INFS_INO], &logvp);
		if (error)
			return error;
	} else
		logvp = NULL;
	/*
	 * Write an empty list of preallocated blocks to the end of
	 * the snapshot to set size to at least that of the filesystem.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	blkno = 1;
	blkno = ufs_rw64(blkno, ns);
	error = vn_rdwr(UIO_WRITE, vp,
	    (void *)&blkno, sizeof(blkno), lblktosize(fs, (off_t)numblks),
	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
	if (error)
		goto out;
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks and mark all of them as not
	 * needing to be copied.
	 */
	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		goto out;
	for (blkno = NDADDR, i = 0; blkno < numblks; blkno += NINDIR(fs)) {
		error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
		if (error) {
			UFS_WAPBL_END(mp);
			goto out;
		}
		if (DOINGSOFTDEP(vp))
			bawrite(ibp);
		else
			brelse(ibp, 0);
		if ((++i % 16) == 0) {
			UFS_WAPBL_END(mp);
			error = UFS_WAPBL_BEGIN(mp);
			if (error)
				goto out;
		}
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
	    0, &nbp);
	if (error) {
		UFS_WAPBL_END(mp);
		goto out;
	}
	bawrite(nbp);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = ffs_balloc(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error) {
			UFS_WAPBL_END(mp);
			goto out;
		}
		bawrite(nbp);
	}
	/*
	 * Copy all the cylinder group maps. Although the
	 * filesystem is still active, we hope that only a few
	 * cylinder groups will change between now and when we
	 * suspend operations. Thus, we will be able to quickly
	 * touch up the few cylinder groups that changed during
	 * the suspension period.
	 */
	len = howmany(fs->fs_ncg, NBBY);
	fs->fs_active = malloc(len, M_DEVBUF, M_WAITOK | M_ZERO);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if ((error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp)) != 0)
			break;
		error = cgaccount(cg, vp, nbp->b_data, 1);
		bawrite(nbp);
		if (error)
			break;
	}
	UFS_WAPBL_END(mp);
	if (error)
		goto out;
	/*
	 * Change inode to snapshot type file.
	 */
	ip->i_flags |= SF_SNAPSHOT;
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Ensure that the snapshot is completely on disk.
	 * Since we have marked it as a snapshot it is safe to
	 * unlock it as no process will be allowed to write to it.
	 */
	if ((error = VOP_FSYNC(vp, KERNCRED, FSYNC_WAIT, 0, 0)) != 0)
		goto out;
	VOP_UNLOCK(vp, 0);
	/*
	 * All allocations are done, so we can now snapshot the system.
	 *
	 * Suspend operation on filesystem.
	 */
	if ((error = vfs_suspend(vp->v_mount, 0)) != 0) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		goto out;
	}
	suspended = true;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	getmicrotime(&starttime);
	error = UFS_WAPBL_BEGIN(mp);
	if (error)
		goto out;
	/*
	 * First, copy all the cylinder group maps that have changed.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if (ACTIVECG_ISSET(fs, cg))
			continue;
		redo++;
		if ((error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp)) != 0)
			break;
		error = cgaccount(cg, vp, nbp->b_data, 2);
		bawrite(nbp);
		if (error)
			break;
	}
	if (error) {
		UFS_WAPBL_END(mp);
		goto out;
	}
	/*
	 * Grab a copy of the superblock and its summary information.
	 * We delay writing it until the suspension is released below.
	 */
	sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
	loc = blkoff(fs, fs->fs_sblockloc);
	if (loc > 0)
		memset(sbbuf, 0, loc);
	copy_fs = (struct fs *)((char *)sbbuf + loc);
	bcopy(fs, copy_fs, fs->fs_sbsize);
	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
	if (fs->fs_sbsize < size)
		memset((char *)sbbuf + loc + fs->fs_sbsize, 0,
		    size - fs->fs_sbsize);
	size = blkroundup(fs, fs->fs_cssize);
	if (fs->fs_contigsumsize > 0)
		size += fs->fs_ncg * sizeof(int32_t);
	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
	copy_fs->fs_csp = space;
	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
	space = (char *)space + fs->fs_cssize;
	loc = howmany(fs->fs_cssize, fs->fs_fsize);
	i = fs->fs_frag - loc % fs->fs_frag;
	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
	if (len > 0) {
		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
		    len, KERNCRED, 0, &bp)) != 0) {
			brelse(bp, 0);
			free(copy_fs->fs_csp, M_UFSMNT);
			goto out;
		}
		bcopy(bp->b_data, space, (u_int)len);
		space = (char *)space + len;
		brelse(bp, BC_INVAL | BC_NOCACHE);
	}
	if (fs->fs_contigsumsize > 0) {
		copy_fs->fs_maxcluster = lp = space;
		for (i = 0; i < fs->fs_ncg; i++)
			*lp++ = fs->fs_contigsumsize;
	}
	/*
	 * We must check for active files that have been unlinked
	 * (e.g., with a zero link count). We have to expunge all
	 * trace of these files from the snapshot so that they are
	 * not reclaimed prematurely by fsck or unnecessarily dumped.
	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
	 * spec_strategy about writing on a suspended filesystem.
	 * Note that we skip unlinked snapshot files as they will
	 * be handled separately below.
	 *
	 * We also calculate the needed size for the snapshot list.
	 */
	snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
	/* Allocate a marker vnode */
	if ((mvp = vnalloc(mp)) == NULL) {
		error = ENOMEM;
		goto out;
	}
	MNT_ILOCK(mp);
	/*
	 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
	 * and vclean() can be called indirectly
	 */
	for (xvp = TAILQ_FIRST(&mp->mnt_vnodelist); xvp; xvp = vunmark(mvp)) {
		vmark(mvp, xvp);
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (xvp->v_mount != mp || vismarker(xvp))
			continue;
		VI_LOCK(xvp);
		if ((xvp->v_iflag & VI_XLOCK) ||
		    xvp->v_usecount == 0 || xvp->v_type == VNON ||
		    VTOI(xvp) == NULL ||
		    (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
			VI_UNLOCK(xvp);
			continue;
		}
		MNT_IUNLOCK(mp);
		/*
		 * XXXAD should increase vnode ref count to prevent it
		 * disappearing or being recycled.
		 */
		VI_UNLOCK(xvp);
#ifdef DEBUG
		if (snapdebug)
			vprint("ffs_snapshot: busy vnode", xvp);
#endif
		if (xvp != logvp && VOP_GETATTR(xvp, &vat, l->l_cred) == 0 &&
		    vat.va_nlink > 0) {
			MNT_ILOCK(mp);
			continue;
		}
		xp = VTOI(xvp);
		if (xvp != logvp &&
		    ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
			MNT_ILOCK(mp);
			continue;
		}
		/*
		 * If there is a fragment, clear it here.
		 */
		blkno = 0;
		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
		if (loc < NDADDR) {
			len = fragroundup(fs, blkoff(fs, xp->i_size));
			if (len > 0 && len < fs->fs_bsize) {
				ffs_blkfree(copy_fs, vp, db_get(xp, loc),
				    len, xp->i_number);
				blkno = db_get(xp, loc);
				db_assign(xp, loc, 0);
			}
		}
		snaplistsize += 1;
		error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY);
		if (blkno)
			db_assign(xp, loc, blkno);
		if (!error)
			error = ffs_freefile(copy_fs, vp, xp->i_number,
			    xp->i_mode);
		if (error) {
			free(copy_fs->fs_csp, M_UFSMNT);
			(void)vunmark(mvp);
			goto out;
		}
		MNT_ILOCK(mp);
	}
	MNT_IUNLOCK(mp);
	vnfree(mvp);
	UFS_WAPBL_END(mp);
	/*
	 * Acquire the snapshot lock.
	 */
	mutex_enter(&si->si_snaplock);
	snapshot_locked = true;
	/*
	 * If this is the first snapshot on this filesystem, then we need
	 * to allocate the space for the list of preallocated snapshot blocks.
	 * This list will be refined below, but this preliminary one will
	 * keep us out of deadlock until the full one is ready.
	 */
	mutex_enter(&si->si_lock);
	if ((xp = TAILQ_FIRST(&si->si_snapshots)) == NULL) {
		mutex_exit(&si->si_lock);
		snapblklist = malloc(
		    snaplistsize * sizeof(ufs2_daddr_t), M_UFSMNT, M_WAITOK);
		blkp = &snapblklist[1];
		*blkp++ = lblkno(fs, fs->fs_sblockloc);
		blkno = fragstoblks(fs, fs->fs_csaddr);
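		/*
		 * Enter the cylinder group maps that precede the summary
		 * area, then the summary blocks themselves, then the
		 * remaining maps, so that the list stays sorted by logical
		 * block number for the binary search in ffs_copyonwrite().
		 */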
		for (cg = 0; cg < fs->fs_ncg; cg++) {
			if (fragstoblks(fs, cgtod(fs, cg)) > blkno)
				break;
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
		}
		len = howmany(fs->fs_cssize, fs->fs_bsize);
		for (loc = 0; loc < len; loc++)
			*blkp++ = blkno + loc;
		for (; cg < fs->fs_ncg; cg++)
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
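		/* Entry 0 records the length of the list, itself included. */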
		snapblklist[0] = blkp - snapblklist;
		mutex_enter(&si->si_lock);
		if (si->si_snapblklist != NULL)
			panic("ffs_snapshot: non-empty list");
		si->si_snapblklist = snapblklist;
	}
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	fs->fs_snapinum[snaploc] = ip->i_number;
	if (ip->i_nextsnap.tqe_prev != 0)
		panic("ffs_snapshot: %llu already on list",
		    (unsigned long long)ip->i_number);
	TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
	if (xp == NULL)
		fscow_establish(mp, ffs_copyonwrite, devvp);
	si->si_gen++;
	mutex_exit(&si->si_lock);
	vp->v_vflag |= VV_SYSTEM;
	/*
	 * Set the mtime to the time the snapshot has been taken.
	 */
	TIMEVAL_TO_TIMESPEC(&starttime, &ts);
	if (ctime)
		*ctime = ts;
	DIP_ASSIGN(ip, mtime, ts.tv_sec);
	DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Copy allocation information from all the snapshots in
	 * this snapshot and then expunge them from its view.
	 */
	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) {
		if (xp == ip)
			break;
		if ((error = UFS_WAPBL_BEGIN(mp)) == 0) {
			error = expunge(vp, xp, fs, snapacct, BLK_SNAP);
			if (error == 0 && xp->i_ffs_effnlink == 0)
				error = ffs_freefile(copy_fs, vp,
				    xp->i_number, xp->i_mode);
			UFS_WAPBL_END(mp);
		}
		if (error) {
			fs->fs_snapinum[snaploc] = 0;
			goto done;
		}
	}
	/*
	 * Allocate space for the full list of preallocated snapshot blocks.
	 */
	snapblklist = malloc(snaplistsize * sizeof(ufs2_daddr_t),
	    M_UFSMNT, M_WAITOK);
	ip->i_snapblklist = &snapblklist[1];
	/*
	 * Expunge the blocks used by the snapshots from the set of
	 * blocks marked as used in the snapshot bitmaps. Also, collect
	 * the list of allocated blocks in i_snapblklist.
	 */
	if ((error = UFS_WAPBL_BEGIN(mp)) == 0) {
		error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP);
		UFS_WAPBL_END(mp);
	}
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
		FREE(snapblklist, M_UFSMNT);
		goto done;
	}
	if (snaplistsize < ip->i_snapblklist - snapblklist)
		panic("ffs_snapshot: list too small");
	snaplistsize = ip->i_snapblklist - snapblklist;
	snapblklist[0] = snaplistsize;
	ip->i_snapblklist = &snapblklist[0];
	/*
	 * Write out the list of allocated blocks to the end of the snapshot.
	 */
	for (i = 0; i < snaplistsize; i++)
		snapblklist[i] = ufs_rw64(snapblklist[i], ns);
	error = vn_rdwr(UIO_WRITE, vp, (void *)snapblklist,
	    snaplistsize*sizeof(ufs2_daddr_t), lblktosize(fs, (off_t)numblks),
	    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
	for (i = 0; i < snaplistsize; i++)
		snapblklist[i] = ufs_rw64(snapblklist[i], ns);
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
		FREE(snapblklist, M_UFSMNT);
		goto done;
	}
	/*
	 * Write the superblock and its summary information
	 * to the snapshot.
	 */
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	space = copy_fs->fs_csp;
#ifdef FFS_EI
	if (ns) {
		ffs_sb_swap(copy_fs, copy_fs);
		ffs_csum_swap(space, space, fs->fs_cssize);
	}
#endif
	error = UFS_WAPBL_BEGIN(mp);
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
		FREE(snapblklist, M_UFSMNT);
		goto done;
	}
	for (loc = 0; loc < len; loc++) {
		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED,
		    B_MODIFY, &nbp);
		if (error) {
			brelse(nbp, 0);
			fs->fs_snapinum[snaploc] = 0;
			FREE(snapblklist, M_UFSMNT);
			goto done;
		}
		bcopy(space, nbp->b_data, fs->fs_bsize);
		space = (char *)space + fs->fs_bsize;
		bawrite(nbp);
	}
	/*
	 * Copy the first NDADDR blocks to the snapshot so ffs_copyonwrite()
	 * and ffs_snapblkfree() will always work on indirect blocks.
	 */
	for (loc = 0; loc < NDADDR; loc++) {
		if (db_get(ip, loc) != 0)
			continue;
		error = ffs_balloc(vp, lblktosize(fs, (off_t)loc),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			break;
		error = rwfsblk(vp, B_READ, nbp->b_data, loc);
		if (error) {
			brelse(nbp, 0);
			fs->fs_snapinum[snaploc] = 0;
			FREE(snapblklist, M_UFSMNT);
			goto done;
		}
		bawrite(nbp);
	}
	UFS_WAPBL_END(mp);
	/*
	 * As this is the newest list, it is the most inclusive, so
	 * should replace the previous list. If this is the first snapshot
	 * free the preliminary list.
	 */
	mutex_enter(&si->si_lock);
	space = si->si_snapblklist;
	si->si_snapblklist = snapblklist;
	if (TAILQ_FIRST(&si->si_snapshots) == ip)
		FREE(space, M_UFSMNT);
	si->si_gen++;
	mutex_exit(&si->si_lock);
done:
	if (mp->mnt_wapbl)
		copy_fs->fs_flags &= ~FS_DOWAPBL;
	free(copy_fs->fs_csp, M_UFSMNT);
	if (!error) {
		error = UFS_WAPBL_BEGIN(mp);
		if (!error) {
			error = bread(vp, lblkno(fs, fs->fs_sblockloc),
			    fs->fs_bsize, KERNCRED, B_MODIFY, &nbp);
			if (error) {
				brelse(nbp, 0);
			} else {
				bcopy(sbbuf, nbp->b_data, fs->fs_bsize);
				bawrite(nbp);
			}
			UFS_WAPBL_END(mp);
		}
		if (error)
			fs->fs_snapinum[snaploc] = 0;
	}
out:
	/*
	 * Invalidate and free all pages on the snapshot vnode.
	 * We will read and write through the buffercache.
	 */
	if (!error) {
		mutex_enter(&vp->v_interlock);
		error = VOP_PUTPAGES(vp, 0, 0,
		    PGO_ALLPAGES|PGO_CLEANIT|PGO_SYNCIO|PGO_FREE);
	}
#ifdef WAPBL
	if (!error && mp->mnt_wapbl)
		error = wapbl_flush(mp->mnt_wapbl, 1);
#endif
	if (suspended) {
		vfs_resume(vp->v_mount);
#ifdef DEBUG
		if (starttime.tv_sec > 0) {
			getmicrotime(&endtime);
			timersub(&endtime, &starttime, &endtime);
			printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
			    vp->v_mount->mnt_stat.f_mntonname,
			    (long)endtime.tv_sec, endtime.tv_usec / 1000,
			    redo, fs->fs_ncg);
		}
#endif
	}
	if (sbbuf)
		free(sbbuf, M_UFSMNT);
	if (fs->fs_active != 0) {
		FREE(fs->fs_active, M_DEVBUF);
		fs->fs_active = 0;
	}
	mp->mnt_flag = flag;
	if (error) {
		if (!UFS_WAPBL_BEGIN(mp)) {
			(void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
			UFS_WAPBL_END(mp);
		}
	} else
		vref(vp);
	if (snapshot_locked)
		mutex_exit(&si->si_snaplock);
	return (error);
}

/*
 * Copy a cylinder group map. All the unallocated blocks are marked
 * BLK_NOCOPY so that the snapshot knows that it need not copy them
 * if they are later written. If passno is 1, then this is a first
 * pass, so only setting needs to be done. If passno is 2, then this
 * is a revision to a previous pass which must be undone as the
 * replacement pass is done.
 */
static int
cgaccount(int cg, struct vnode *vp, void *data, int passno)
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	ufs2_daddr_t base, numblks;
	int error, len, loc, ns, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	ns = UFS_FSNEEDSWAP(fs);
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
		(int)fs->fs_cgsize, KERNCRED, 0, &bp);
	if (error) {
		brelse(bp, 0);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp, ns)) {
		brelse(bp, 0);
		return (EIO);
	}
	ACTIVECG_SET(fs, cg);

	bcopy(bp->b_data, data, fs->fs_cgsize);
	brelse(bp, 0);
	if (fs->fs_cgsize < fs->fs_bsize)
		memset((char *)data + fs->fs_cgsize, 0,
		    fs->fs_bsize - fs->fs_cgsize);
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cg * fs->fs_fpg / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
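	/*
	 * base is the first logical block mapped by this cylinder group;
	 * len is clamped so the loops below never run past the end of
	 * the filesystem.
	 */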
	loc = 0;
	if (base < NDADDR) {
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
				db_assign(ip, loc, BLK_NOCOPY);
			else if (db_get(ip, loc) == BLK_NOCOPY) {
				if (passno == 2)
					db_assign(ip, loc, 0);
				else if (passno == 1)
					panic("ffs_snapshot: lost direct block");
			}
		}
	}
	if ((error = ffs_balloc(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0)
		return (error);
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			bawrite(ibp);
			if ((error = ffs_balloc(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0)
				return (error);
			indiroff = 0;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
			idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY);
		else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) {
			if (passno == 2)
				idb_assign(ip, ibp->b_data, indiroff, 0);
			else if (passno == 1)
				panic("ffs_snapshot: lost indirect block");
		}
	}
	bdwrite(ibp);
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed.
 */
static int
expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs,
    acctfunc_t acctfunc, int expungetype)
{
	int i, error, ns;
	daddr_t lbn, rlbn;
	daddr_t len, blkno, numblks, blksperindir;
	struct ufs1_dinode *dip1;
	struct ufs2_dinode *dip2;
	void *bap;
	struct buf *bp;

	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
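	/* lbn: logical block in the snapshot holding this inode's inode block. */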
	error = snapblkaddr(snapvp, lbn, &blkno);
	if (error)
		return error;
	if (blkno != 0) {
		error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED,
		    B_MODIFY, &bp);
	} else {
		error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (! error)
			error = rwfsblk(snapvp, B_READ, bp->b_data, lbn);
	}
	if (error)
		return error;
	/*
	 * Set a snapshot inode to be a zero-length file; set regular
	 * files or unlinked snapshots to be completely unallocated.
	 */
	if (fs->fs_magic == FS_UFS1_MAGIC) {
		dip1 = (struct ufs1_dinode *)bp->b_data +
		    ino_to_fsbo(fs, cancelip->i_number);
		if (expungetype == BLK_NOCOPY || cancelip->i_ffs_effnlink == 0)
			dip1->di_mode = 0;
		dip1->di_size = 0;
		dip1->di_blocks = 0;
		dip1->di_flags =
		    ufs_rw32(ufs_rw32(dip1->di_flags, ns) & ~SF_SNAPSHOT, ns);
		bzero(&dip1->di_db[0], (NDADDR + NIADDR) * sizeof(int32_t));
	} else {
		dip2 = (struct ufs2_dinode *)bp->b_data +
		    ino_to_fsbo(fs, cancelip->i_number);
		if (expungetype == BLK_NOCOPY || cancelip->i_ffs_effnlink == 0)
			dip2->di_mode = 0;
		dip2->di_size = 0;
		dip2->di_blocks = 0;
		dip2->di_flags =
		    ufs_rw32(ufs_rw32(dip2->di_flags, ns) & ~SF_SNAPSHOT, ns);
		bzero(&dip2->di_db[0], (NDADDR + NIADDR) * sizeof(int64_t));
	}
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if (fs->fs_magic == FS_UFS1_MAGIC)
		bap = &cancelip->i_ffs1_db[0];
	else
		bap = &cancelip->i_ffs2_db[0];
	if ((error = (*acctfunc)(snapvp, bap, 0, NDADDR, fs, 0, expungetype)))
		return (error);
	if (fs->fs_magic == FS_UFS1_MAGIC)
		bap = &cancelip->i_ffs1_ib[0];
	else
		bap = &cancelip->i_ffs2_ib[0];
	if ((error = (*acctfunc)(snapvp, bap, 0, NIADDR, fs, -1, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct(snapvp, ITOV(cancelip), i,
		    ib_get(cancelip, i), lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level,
    daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks,
    daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype)
{
	int error, num, i;
	daddr_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	daddr_t last;
	void *bap;
	struct buf *bp;

	if (blkno == 0) {
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	error = ffs_getblk(cancelvp, lbn, fsbtodb(fs, blkno), fs->fs_bsize,
	    false, &bp);
	if (error)
		return error;
	if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error =
	    rwfsblk(bp->b_vp, B_READ, bp->b_data, fragstoblks(fs, blkno)))) {
		brelse(bp, 0);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
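	/*
	 * last: number of pointers in this indirect block that map
	 * blocks still to be accounted for.
	 */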
	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (void *)bap, fs->fs_bsize);
	brelse(bp, 0);
	error = (*acctfunc)(snapvp, bap, 0, last,
	    fs, level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct(snapvp, cancelvp, level,
		    idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks,
		    subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
    struct fs *fs, daddr_t lblkno,
    int exptype /* BLK_SNAP or BLK_NOCOPY */)
{
	int error;

	if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
    struct fs *fs, daddr_t lblkno,
    int expungetype /* BLK_SNAP or BLK_NOCOPY */)
{
	struct inode *ip = VTOI(vp);
	daddr_t blkno;
	daddr_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = idb_get(ip, bap, oldblkp);
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error)
				return (error);
			blkno = idb_get(ip, ibp->b_data,
			    (lbn - NDADDR) % NINDIR(fs));
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp, 0);
		} else {
			if (blkno != 0)
				panic("snapacct: bad block");
			if (lbn < NDADDR)
				db_assign(ip, lbn, expungetype);
			else {
				idb_assign(ip, ibp->b_data,
				    (lbn - NDADDR) % NINDIR(fs), expungetype);
				bdwrite(ibp);
			}
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
    struct fs *fs, daddr_t lblkno, int expungetype)
{
	daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = idb_get(ip, bap, oldblkp);
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
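		/*
		 * A BLK_SNAP entry is a block the snapshot claimed at its
		 * own logical address (see the ffs_snapblkfree comment);
		 * map it back to that address before freeing it.
		 */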
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}
#endif /* defined(FFS_NO_SNAPSHOT) */

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(struct inode *ip)
{
	struct mount *mp = ip->i_devvp->v_specmountpoint;
	struct inode *xp;
	struct fs *fs;
	struct snap_info *si;
	int snaploc;

	si = VFSTOUFS(mp)->um_snapinfo;

	/*
	 * Find snapshot in incore list.
	 */
	mutex_enter(&si->si_lock);
	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	mutex_exit(&si->si_lock);
	if (xp != NULL)
		vrele(ITOV(ip));
#ifdef DEBUG
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %llu\n",
		    (unsigned long long)ip->i_number);
#endif
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	mutex_enter(&si->si_lock);
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(struct vnode *vp)
{
	struct inode *ip = VTOI(vp), *xp;
	struct vnode *devvp = ip->i_devvp;
	struct fs *fs = ip->i_fs;
	struct mount *mp = devvp->v_specmountpoint;
	struct buf *ibp;
	struct snap_info *si;
	ufs2_daddr_t numblks, blkno, dblk;
	int error, loc, last;

	si = VFSTOUFS(mp)->um_snapinfo;
	mutex_enter(&si->si_snaplock);
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		mutex_enter(&si->si_lock);
		TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		if (TAILQ_FIRST(&si->si_snapshots) != 0) {
			/* Roll back the list of preallocated blocks. */
			xp = TAILQ_LAST(&si->si_snapshots, inodelst);
			si->si_snapblklist = xp->i_snapblklist;
		} else {
			si->si_snapblklist = 0;
			si->si_gen++;
			mutex_exit(&si->si_lock);
			fscow_disestablish(mp, ffs_copyonwrite, devvp);
			mutex_enter(&si->si_lock);
		}
		si->si_gen++;
		mutex_exit(&si->si_lock);
		FREE(ip->i_snapblklist, M_UFSMNT);
		ip->i_snapblklist = NULL;
	}
	mutex_exit(&si->si_snaplock);
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
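	/*
	 * A claimed block is recognized by a disk address equal to its
	 * own logical block number (see the ffs_snapblkfree comment);
	 * that is what the blkstofrags() comparison below tests.
	 */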
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = db_get(ip, blkno);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			db_assign(ip, blkno, 0);
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
			db_assign(ip, blkno, 0);
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			dblk = idb_get(ip, ibp->b_data, loc);
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				idb_assign(ip, ibp->b_data, loc, 0);
			else if (dblk == blkstofrags(fs, blkno) &&
			    ffs_snapblkfree(fs, ip->i_devvp, dblk,
			    fs->fs_bsize, ip->i_number)) {
				DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
				idb_assign(ip, ibp->b_data, loc, 0);
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because copied
 * blocks must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(struct fs *fs, struct vnode *devvp, ufs2_daddr_t bno,
    long size, ino_t inum)
{
	struct mount *mp = devvp->v_specmountpoint;
	struct buf *ibp;
	struct inode *ip;
	struct vnode *vp = NULL;
	struct snap_info *si;
	void *saved_data = NULL;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	uint32_t gen;
	int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0;

	si = VFSTOUFS(mp)->um_snapinfo;
	lbn = fragstoblks(fs, bno);
	mutex_enter(&si->si_lock);
retry:
	gen = si->si_gen;
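	/*
	 * si_gen is bumped whenever the snapshot list changes; any time
	 * si_lock has been dropped and retaken we compare against the
	 * saved value and restart the scan if it moved.
	 */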
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		if (snapshot_locked == 0) {
			if (!mutex_tryenter(&si->si_snaplock)) {
				mutex_exit(&si->si_lock);
				mutex_enter(&si->si_snaplock);
				mutex_enter(&si->si_lock);
			}
			snapshot_locked = 1;
			if (gen != si->si_gen)
				goto retry;
		}
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			mutex_exit(&si->si_lock);
			error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error) {
				mutex_enter(&si->si_lock);
				break;
			}
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = idb_get(ip, ibp->b_data, indiroff);
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen) {
				brelse(ibp, 0);
				goto retry;
			}
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (lbn < NDADDR) {
				db_assign(ip, lbn, BLK_NOCOPY);
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else {
				idb_assign(ip, ibp->b_data, indiroff,
				    BLK_NOCOPY);
				mutex_exit(&si->si_lock);
				if (ip->i_ffs_effnlink > 0)
					bwrite(ibp);
				else
					bdwrite(ibp);
				mutex_enter(&si->si_lock);
				if (gen != si->si_gen)
					goto retry;
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= NDADDR)
				brelse(ibp, 0);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %llu lbn %" PRId64
				    " from inum %llu\n",
				    "Grabonremove: snapino",
				    (unsigned long long)ip->i_number,
				    lbn, (unsigned long long)inum);
#endif
			mutex_exit(&si->si_lock);
			if (lbn < NDADDR) {
				db_assign(ip, lbn, bno);
			} else {
				idb_assign(ip, ibp->b_data, indiroff, bno);
				if (ip->i_ffs_effnlink > 0)
					bwrite(ibp);
				else
					bdwrite(ibp);
			}
			DIP_ADD(ip, blocks, btodb(size));
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (ip->i_ffs_effnlink > 0 && mp->mnt_wapbl)
				error = syncsnap(vp);
			else
				error = 0;
			mutex_exit(&si->si_snaplock);
			return (error == 0);
		}
		if (lbn >= NDADDR)
			brelse(ibp, 0);
#ifdef DEBUG
		if (snapdebug)
			printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n",
			    "Copyonremove: snapino ",
			    (unsigned long long)ip->i_number,
			    lbn, "for inum", (unsigned long long)inum, size);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		mutex_exit(&si->si_lock);
		if (saved_data == NULL) {
			saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
			error = rwfsblk(vp, B_READ, saved_data, lbn);
			if (error) {
				free(saved_data, M_UFSMNT);
				saved_data = NULL;
				mutex_enter(&si->si_lock);
				break;
			}
		}
		error = wrsnapblk(vp, saved_data, lbn);
		if (error == 0 && ip->i_ffs_effnlink > 0 && mp->mnt_wapbl)
			error = syncsnap(vp);
		mutex_enter(&si->si_lock);
		if (error)
			break;
		if (gen != si->si_gen)
			goto retry;
	}
	mutex_exit(&si->si_lock);
	if (saved_data)
		free(saved_data, M_UFSMNT);
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	if (snapshot_locked)
		mutex_exit(&si->si_snaplock);
	return (error);
}

/*
 * Associate snapshot files when mounting.
 */
void
ffs_snapshot_mount(struct mount *mp)
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct lwp *l = curlwp;
	struct vnode *vp;
	struct inode *ip, *xp;
	struct snap_info *si;
	ufs2_daddr_t snaplistsize, *snapblklist;
	int i, error, ns, snaploc, loc;

	/*
	 * No persistent snapshots on apple ufs file systems.
	 */
	if (UFS_MPISAPPLEUFS(VFSTOUFS(mp)))
		return;

	si = VFSTOUFS(mp)->um_snapinfo;
	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * XXX The following needs to be set before ffs_truncate or
	 * VOP_READ can be called.
	 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	/*
	 * Process each snapshot listed in the superblock.
	 */
	vp = NULL;
	mutex_enter(&si->si_lock);
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			break;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
		    &vp)) != 0) {
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		if ((ip->i_flags & SF_SNAPSHOT) == 0) {
			printf("ffs_snapshot_mount: non-snapshot inode %d\n",
			    fs->fs_snapinum[snaploc]);
			vput(vp);
			vp = NULL;
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}

		/*
		 * Read the block hints list. Use an empty list on
		 * read errors.
		 */
		error = vn_rdwr(UIO_READ, vp,
		    (void *)&snaplistsize, sizeof(snaplistsize),
		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
		    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT,
		    l->l_cred, NULL, NULL);
		if (error) {
			printf("ffs_snapshot_mount: read_1 failed %d\n", error);
			snaplistsize = 1;
		} else
			snaplistsize = ufs_rw64(snaplistsize, ns);
		snapblklist = malloc(
		    snaplistsize * sizeof(ufs2_daddr_t), M_UFSMNT, M_WAITOK);
		if (error)
			snapblklist[0] = 1;
		else {
			error = vn_rdwr(UIO_READ, vp, (void *)snapblklist,
			    snaplistsize * sizeof(ufs2_daddr_t),
			    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
			    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT,
			    l->l_cred, NULL, NULL);
			for (i = 0; i < snaplistsize; i++)
				snapblklist[i] = ufs_rw64(snapblklist[i], ns);
			if (error) {
				printf("ffs_snapshot_mount: read_2 failed %d\n",
				    error);
				snapblklist[0] = 1;
			}
		}
		ip->i_snapblklist = &snapblklist[0];

		/*
		 * Link it onto the active snapshot list.
		 */
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %llu already on list",
			    (unsigned long long)ip->i_number);
		else
			TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
		vp->v_vflag |= VV_SYSTEM;
		VOP_UNLOCK(vp, 0);
	}
	/*
	 * No usable snapshots found.
	 */
	if (vp == NULL) {
		mutex_exit(&si->si_lock);
		return;
	}
	/*
	 * Attach the block hints list. We always want to
	 * use the list from the newest snapshot.
	 */
	xp = TAILQ_LAST(&si->si_snapshots, inodelst);
	si->si_snapblklist = xp->i_snapblklist;
	fscow_establish(mp, ffs_copyonwrite, devvp);
	si->si_gen++;
	mutex_exit(&si->si_lock);
}

/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(struct mount *mp)
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct inode *xp;
	struct vnode *vp = NULL;
	struct snap_info *si;

	si = VFSTOUFS(mp)->um_snapinfo;
	mutex_enter(&si->si_lock);
	while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) {
		vp = ITOV(xp);
		vp->v_vnlock = &vp->v_lock;
		TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		if (xp->i_snapblklist == si->si_snapblklist)
			si->si_snapblklist = NULL;
		FREE(xp->i_snapblklist, M_UFSMNT);
		if (xp->i_ffs_effnlink > 0) {
			si->si_gen++;
			mutex_exit(&si->si_lock);
			vrele(vp);
			mutex_enter(&si->si_lock);
		}
	}
	if (vp)
		fscow_disestablish(mp, ffs_copyonwrite, devvp);
	si->si_gen++;
	mutex_exit(&si->si_lock);
}

/*
 * Look up a snapshot's data block address.
 * Simpler than UFS_BALLOC() as we know all metadata is already allocated
 * and safe even for the pagedaemon where we cannot bread().
 */
static int
snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res)
{
	struct indir indirs[NIADDR + 2];
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct buf *bp;
	int error, num;

	KASSERT(lbn >= 0);

	if (lbn < NDADDR) {
		*res = db_get(ip, lbn);
		return 0;
	}
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return error;
	if (curlwp == uvm.pagedaemon_lwp) {
		mutex_enter(&bufcache_lock);
		bp = incore(vp, indirs[num-1].in_lbn);
		if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) {
			*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
			error = 0;
		} else
			error = ENOMEM;
		mutex_exit(&bufcache_lock);
		return error;
	}
	error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, NOCRED, 0, &bp);
	if (error == 0)
		*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
	brelse(bp, 0);

	return error;
}

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 */
static int
ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
{
	struct fs *fs;
	struct inode *ip;
	struct vnode *devvp = v, *vp = NULL;
	struct mount *mp = devvp->v_specmountpoint;
	struct snap_info *si;
	void *saved_data = NULL;
	ufs2_daddr_t lbn, blkno, *snapblklist;
	uint32_t gen;
	int lower, upper, mid, snapshot_locked = 0, error = 0;

	/*
	 * Check for valid snapshots.
	 */
	si = VFSTOUFS(mp)->um_snapinfo;
	mutex_enter(&si->si_lock);
	ip = TAILQ_FIRST(&si->si_snapshots);
	if (ip == NULL) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * First check to see if it is after the file system or
	 * in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
	 */
	fs = ip->i_fs;
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	if (bp->b_blkno >= fsbtodb(fs, fs->fs_size)) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	snapblklist = si->si_snapblklist;
	upper = si->si_snapblklist[0] - 1;
	lower = 1;
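	/*
	 * Binary search the sorted block hint list; entry 0 holds the
	 * list length, so valid entries run from 1 to snapblklist[0] - 1.
	 */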
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	if (lower <= upper) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
	if (data_valid && bp->b_bcount == fs->fs_bsize)
		saved_data = bp->b_data;
retry:
	gen = si->si_gen;
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in ffs_balloc.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			mutex_exit(&si->si_lock);
			if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
				mutex_enter(&si->si_lock);
				break;
			}
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen)
				goto retry;
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		if (blkno != 0)
			continue;

		if (curlwp == uvm.pagedaemon_lwp) {
			error = ENOMEM;
			break;
		}

		if (snapshot_locked == 0) {
			if (!mutex_tryenter(&si->si_snaplock)) {
				mutex_exit(&si->si_lock);
				mutex_enter(&si->si_snaplock);
				mutex_enter(&si->si_lock);
			}
			snapshot_locked = 1;
			if (gen != si->si_gen)
				goto retry;

			/* Check again if block still needs to be copied */
			if (lbn < NDADDR) {
				blkno = db_get(ip, lbn);
			} else {
				mutex_exit(&si->si_lock);
				if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
					mutex_enter(&si->si_lock);
					break;
				}
				mutex_enter(&si->si_lock);
				if (gen != si->si_gen)
					goto retry;
			}

			if (blkno != 0)
				continue;
		}
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
			    (unsigned long long)ip->i_number, lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %llu", (unsigned long long)
				    VTOI(bp->b_vp)->i_number);
			printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		mutex_exit(&si->si_lock);
		if (saved_data == NULL) {
			saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
			error = rwfsblk(vp, B_READ, saved_data, lbn);
			if (error) {
				free(saved_data, M_UFSMNT);
				saved_data = NULL;
				mutex_enter(&si->si_lock);
				break;
			}
		}
		error = wrsnapblk(vp, saved_data, lbn);
		if (error == 0 && ip->i_ffs_effnlink > 0 && mp->mnt_wapbl)
			error = syncsnap(vp);
		mutex_enter(&si->si_lock);
		if (error)
			break;
		if (gen != si->si_gen)
			goto retry;
	}
	mutex_exit(&si->si_lock);
	if (saved_data && saved_data != bp->b_data)
		free(saved_data, M_UFSMNT);
	if (snapshot_locked)
		mutex_exit(&si->si_snaplock);
	return error;
}

/*
 * Read from a snapshot.
 */
int
ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo;
	struct buf *bp;
	daddr_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error;

	fstrans_start(vp->v_mount, FSTRANS_SHARED);
	mutex_enter(&si->si_snaplock);

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		bytesinfile = ip->i_size - uio->uio_offset;
		if (bytesinfile <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
		    bytesinfile);

		if (lblktosize(fs, nextlbn) >= ip->i_size)
			error = bread(vp, lbn, size, NOCRED, 0, &bp);
		else {
			int nextsize = blksize(fs, ip, nextlbn);
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, 0, &bp);
		}
		if (error)
			break;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}
		error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
		if (error)
			break;
		brelse(bp, BC_AGE);
	}
	if (bp != NULL)
		brelse(bp, BC_AGE);

	mutex_exit(&si->si_snaplock);
	fstrans_done(vp->v_mount);
	return error;
}

/*
 * Read or write the specified block of the filesystem vp resides on
 * from or to the disk bypassing the buffer cache.
 */
static int
rwfsblk(struct vnode *vp, int flags, void *data, ufs2_daddr_t lbn)
{
	int error;
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct buf *nbp;

	nbp = getiobuf(NULL, true);
	nbp->b_flags = flags;
	nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
	nbp->b_error = 0;
	nbp->b_data = data;
	nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn));
	nbp->b_proc = NULL;
	nbp->b_dev = ip->i_devvp->v_rdev;
	SET(nbp->b_cflags, BC_BUSY);	/* mark buffer busy */

	bdev_strategy(nbp);

	error = biowait(nbp);

	putiobuf(nbp);

	return error;
}

/*
 * Write all dirty buffers to disk and invalidate them.
 */
static int
syncsnap(struct vnode *vp)
{
	int error;
	buf_t *bp;
	struct fs *fs = VTOI(vp)->i_fs;

	mutex_enter(&bufcache_lock);
	while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
		KASSERT((bp->b_cflags & BC_BUSY) == 0);
		KASSERT(bp->b_bcount == fs->fs_bsize);
		bp->b_cflags |= BC_BUSY;
		mutex_exit(&bufcache_lock);
		error = rwfsblk(vp, B_WRITE, bp->b_data,
		    fragstoblks(fs, dbtofsb(fs, bp->b_blkno)));
		brelse(bp, BC_INVAL | BC_VFLUSH);
		if (error)
			return error;
		mutex_enter(&bufcache_lock);
	}
	mutex_exit(&bufcache_lock);

	return 0;
}

/*
 * Write the specified block to a snapshot.
 */
static int
wrsnapblk(struct vnode *vp, void *data, ufs2_daddr_t lbn)
{
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct buf *bp;
	int error;

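	/*
	 * A snapshot that is still linked must reach the disk
	 * synchronously so that it stays consistent after a crash;
	 * an unlinked snapshot may be written lazily.
	 */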
	error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize,
	    KERNCRED, (ip->i_ffs_effnlink > 0 ? B_SYNC : 0), &bp);
	if (error)
		return error;
	bcopy(data, bp->b_data, fs->fs_bsize);
	if (ip->i_ffs_effnlink > 0)
		error = bwrite(bp);
	else
		bawrite(bp);

	return error;
}

/*
 * Get/put a direct block from an inode or a buffer containing disk
 * addresses. Takes care of fs type (UFS1/UFS2) and byte swapping.
 * These functions should go into a global include.
 */
static inline ufs2_daddr_t
db_get(struct inode *ip, int loc)
{
	if (ip->i_ump->um_fstype == UFS1)
		return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
	else
		return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
}

static inline void
db_assign(struct inode *ip, int loc, ufs2_daddr_t val)
{
	if (ip->i_ump->um_fstype == UFS1)
		ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
	else
		ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
}

static inline ufs2_daddr_t
ib_get(struct inode *ip, int loc)
{
	if (ip->i_ump->um_fstype == UFS1)
		return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip));
	else
		return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip));
}

static inline void
ib_assign(struct inode *ip, int loc, ufs2_daddr_t val)
{
	if (ip->i_ump->um_fstype == UFS1)
		ip->i_ffs1_ib[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
	else
		ip->i_ffs2_ib[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
}

static inline ufs2_daddr_t
idb_get(struct inode *ip, void *bf, int loc)
{
	if (ip->i_ump->um_fstype == UFS1)
		return ufs_rw32(((ufs1_daddr_t *)(bf))[loc],
		    UFS_IPNEEDSWAP(ip));
	else
		return ufs_rw64(((ufs2_daddr_t *)(bf))[loc],
		    UFS_IPNEEDSWAP(ip));
}

static inline void
idb_assign(struct inode *ip, void *bf, int loc, ufs2_daddr_t val)
{
	if (ip->i_ump->um_fstype == UFS1)
		((ufs1_daddr_t *)(bf))[loc] =
		    ufs_rw32(val, UFS_IPNEEDSWAP(ip));
	else
		((ufs2_daddr_t *)(bf))[loc] =
		    ufs_rw64(val, UFS_IPNEEDSWAP(ip));
}