ffs_snapshot.c revision 298527
1/*-
2 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
3 *
4 * Further information about snapshots can be obtained from:
5 *
6 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
7 *	1614 Oxford Street		mckusick@mckusick.com
8 *	Berkeley, CA 94709-1608		+1-510-843-9542
9 *	USA
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 *
15 * 1. Redistributions of source code must retain the above copyright
16 *    notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 *    notice, this list of conditions and the following disclaimer in the
19 *    documentation and/or other materials provided with the distribution.
20 *
21 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
22 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
34 */
35
36#include <sys/cdefs.h>
37__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_snapshot.c 298527 2016-04-24 03:11:52Z pfg $");
38
39#include "opt_quota.h"
40
41#include <sys/param.h>
42#include <sys/kernel.h>
43#include <sys/systm.h>
44#include <sys/conf.h>
45#include <sys/bio.h>
46#include <sys/buf.h>
47#include <sys/fcntl.h>
48#include <sys/proc.h>
49#include <sys/namei.h>
50#include <sys/sched.h>
51#include <sys/stat.h>
52#include <sys/malloc.h>
53#include <sys/mount.h>
54#include <sys/resource.h>
55#include <sys/resourcevar.h>
56#include <sys/rwlock.h>
57#include <sys/vnode.h>
58
59#include <geom/geom.h>
60
61#include <ufs/ufs/extattr.h>
62#include <ufs/ufs/quota.h>
63#include <ufs/ufs/ufsmount.h>
64#include <ufs/ufs/inode.h>
65#include <ufs/ufs/ufs_extern.h>
66
67#include <ufs/ffs/fs.h>
68#include <ufs/ffs/ffs_extern.h>
69
70#define KERNCRED thread0.td_ucred
71#define DEBUG 1
72
73#include "opt_ffs.h"
74
75#ifdef NO_FFS_SNAPSHOT
/*
 * Stub used when the kernel is built with NO_FFS_SNAPSHOT: snapshot
 * creation is not available, so every request fails with EINVAL.
 * Both arguments are ignored.
 */
int
ffs_snapshot(struct mount *mp, char *snapfile)
{

	return (EINVAL);
}
83
84int
85ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd)
86	struct fs *fs;
87	struct vnode *devvp;
88	ufs2_daddr_t bno;
89	long size;
90	ino_t inum;
91	enum vtype vtype;
92	struct workhead *wkhd;
93{
94	return (EINVAL);
95}
96
/*
 * Stub used when the kernel is built with NO_FFS_SNAPSHOT: there is
 * nothing to remove, so this is a no-op.
 */
void
ffs_snapremove(struct vnode *vp)
{
}
102
/*
 * Stub used when the kernel is built with NO_FFS_SNAPSHOT: no snapshot
 * state is set up at mount time, so this is a no-op.
 */
void
ffs_snapshot_mount(struct mount *mp)
{
}
108
/*
 * Stub used when the kernel is built with NO_FFS_SNAPSHOT: no snapshot
 * state exists to tear down at unmount time, so this is a no-op.
 */
void
ffs_snapshot_unmount(struct mount *mp)
{
}
114
/*
 * Stub used when the kernel is built with NO_FFS_SNAPSHOT: the inode
 * argument is ignored and nothing is done.
 */
void
ffs_snapgone(struct inode *ip)
{
}
120
/*
 * Stub used when the kernel is built with NO_FFS_SNAPSHOT.  Both
 * arguments are ignored and EINVAL is returned unconditionally.
 */
int
ffs_copyonwrite(struct vnode *devvp, struct buf *bp)
{

	return (EINVAL);
}
128
/*
 * Stub used when the kernel is built with NO_FFS_SNAPSHOT: there are
 * no snapshots to sync, so both arguments are ignored.
 */
void
ffs_sync_snap(struct mount *mp, int waitfor)
{
}
135
136#else
137FEATURE(ffs_snapshot, "FFS snapshot support");
138
139LIST_HEAD(, snapdata) snapfree;
140static struct mtx snapfree_lock;
141MTX_SYSINIT(ffs_snapfree, &snapfree_lock, "snapdata free list", MTX_DEF);
142
143static int cgaccount(int, struct vnode *, struct buf *, int);
144static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
145    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
146    ufs_lbn_t, int), int, int);
147static int indiracct_ufs1(struct vnode *, struct vnode *, int,
148    ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
149    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
150    ufs_lbn_t, int), int);
151static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
152    struct fs *, ufs_lbn_t, int);
153static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
154    struct fs *, ufs_lbn_t, int);
155static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
156    struct fs *, ufs_lbn_t, int);
157static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
158    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
159    ufs_lbn_t, int), int, int);
160static int indiracct_ufs2(struct vnode *, struct vnode *, int,
161    ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
162    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
163    ufs_lbn_t, int), int);
164static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
165    struct fs *, ufs_lbn_t, int);
166static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
167    struct fs *, ufs_lbn_t, int);
168static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
169    struct fs *, ufs_lbn_t, int);
170static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t);
171static void try_free_snapdata(struct vnode *devvp);
172static struct snapdata *ffs_snapdata_acquire(struct vnode *devvp);
173static int ffs_bp_snapblk(struct vnode *, struct buf *);
174
/*
 * To ensure the consistency of snapshots across crashes, we must
 * synchronously write out copied blocks before allowing the
 * originals to be modified. Because of the rather severe speed
 * penalty that this imposes, the code normally only ensures
 * persistence for the filesystem metadata contained within a
 * snapshot. Setting the following flag allows this crash
 * persistence to be enabled for file contents.
 */
int dopersistence = 0;

#ifdef DEBUG
#include <sys/sysctl.h>
/* Run-time switch for the dopersistence flag described above. */
SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
/* When non-zero, ffs_snapshot() prints each busy vnode it examines. */
static int snapdebug = 0;
SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
/*
 * When non-zero, ffs_snapshot() times the period during which the
 * filesystem is suspended and prints the result.
 */
int collectsnapstats = 0;
SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
	0, "");
#endif /* DEBUG */
195
196/*
197 * Create a snapshot file and initialize it for the filesystem.
198 */
199int
200ffs_snapshot(mp, snapfile)
201	struct mount *mp;
202	char *snapfile;
203{
204	ufs2_daddr_t numblks, blkno, *blkp, *snapblklist;
205	int error, cg, snaploc;
206	int i, size, len, loc;
207	ufs2_daddr_t blockno;
208	uint64_t flag;
209	struct timespec starttime = {0, 0}, endtime;
210	char saved_nice = 0;
211	long redo = 0, snaplistsize = 0;
212	int32_t *lp;
213	void *space;
214	struct fs *copy_fs = NULL, *fs;
215	struct thread *td = curthread;
216	struct inode *ip, *xp;
217	struct buf *bp, *nbp, *ibp;
218	struct nameidata nd;
219	struct mount *wrtmp;
220	struct vattr vat;
221	struct vnode *vp, *xvp, *mvp, *devvp;
222	struct uio auio;
223	struct iovec aiov;
224	struct snapdata *sn;
225	struct ufsmount *ump;
226
227	ump = VFSTOUFS(mp);
228	fs = ump->um_fs;
229	sn = NULL;
230	/*
231	 * At the moment, journaled soft updates cannot support
232	 * taking snapshots.
233	 */
234	if (MOUNTEDSUJ(mp)) {
235		vfs_mount_error(mp, "%s: Snapshots are not yet supported when "
236		    "running with journaled soft updates", fs->fs_fsmnt);
237		return (EOPNOTSUPP);
238	}
239	MNT_ILOCK(mp);
240	flag = mp->mnt_flag;
241	MNT_IUNLOCK(mp);
242	/*
243	 * Need to serialize access to snapshot code per filesystem.
244	 */
245	/*
246	 * Assign a snapshot slot in the superblock.
247	 */
248	UFS_LOCK(ump);
249	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
250		if (fs->fs_snapinum[snaploc] == 0)
251			break;
252	UFS_UNLOCK(ump);
253	if (snaploc == FSMAXSNAP)
254		return (ENOSPC);
255	/*
256	 * Create the snapshot file.
257	 */
258restart:
259	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF | NOCACHE, UIO_SYSSPACE,
260	    snapfile, td);
261	if ((error = namei(&nd)) != 0)
262		return (error);
263	if (nd.ni_vp != NULL) {
264		vput(nd.ni_vp);
265		error = EEXIST;
266	}
267	if (nd.ni_dvp->v_mount != mp)
268		error = EXDEV;
269	if (error) {
270		NDFREE(&nd, NDF_ONLY_PNBUF);
271		if (nd.ni_dvp == nd.ni_vp)
272			vrele(nd.ni_dvp);
273		else
274			vput(nd.ni_dvp);
275		return (error);
276	}
277	VATTR_NULL(&vat);
278	vat.va_type = VREG;
279	vat.va_mode = S_IRUSR;
280	vat.va_vaflags |= VA_EXCLUSIVE;
281	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
282		wrtmp = NULL;
283	if (wrtmp != mp)
284		panic("ffs_snapshot: mount mismatch");
285	vfs_rel(wrtmp);
286	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
287		NDFREE(&nd, NDF_ONLY_PNBUF);
288		vput(nd.ni_dvp);
289		if ((error = vn_start_write(NULL, &wrtmp,
290		    V_XSLEEP | PCATCH)) != 0)
291			return (error);
292		goto restart;
293	}
294	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
295	VOP_UNLOCK(nd.ni_dvp, 0);
296	if (error) {
297		NDFREE(&nd, NDF_ONLY_PNBUF);
298		vn_finished_write(wrtmp);
299		vrele(nd.ni_dvp);
300		return (error);
301	}
302	vp = nd.ni_vp;
303	vp->v_vflag |= VV_SYSTEM;
304	ip = VTOI(vp);
305	devvp = ip->i_devvp;
306	/*
307	 * Allocate and copy the last block contents so as to be able
308	 * to set size to that of the filesystem.
309	 */
310	numblks = howmany(fs->fs_size, fs->fs_frag);
311	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
312	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
313	if (error)
314		goto out;
315	ip->i_size = lblktosize(fs, (off_t)numblks);
316	DIP_SET(ip, i_size, ip->i_size);
317	ip->i_flag |= IN_CHANGE | IN_UPDATE;
318	error = readblock(vp, bp, numblks - 1);
319	bawrite(bp);
320	if (error != 0)
321		goto out;
322	/*
323	 * Preallocate critical data structures so that we can copy
324	 * them in without further allocation after we suspend all
325	 * operations on the filesystem. We would like to just release
326	 * the allocated buffers without writing them since they will
327	 * be filled in below once we are ready to go, but this upsets
328	 * the soft update code, so we go ahead and write the new buffers.
329	 *
330	 * Allocate all indirect blocks and mark all of them as not
331	 * needing to be copied.
332	 */
333	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
334		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
335		    fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp);
336		if (error)
337			goto out;
338		bawrite(ibp);
339	}
340	/*
341	 * Allocate copies for the superblock and its summary information.
342	 */
343	error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
344	    0, &nbp);
345	if (error)
346		goto out;
347	bawrite(nbp);
348	blkno = fragstoblks(fs, fs->fs_csaddr);
349	len = howmany(fs->fs_cssize, fs->fs_bsize);
350	for (loc = 0; loc < len; loc++) {
351		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
352		    fs->fs_bsize, KERNCRED, 0, &nbp);
353		if (error)
354			goto out;
355		bawrite(nbp);
356	}
357	/*
358	 * Allocate all cylinder group blocks.
359	 */
360	for (cg = 0; cg < fs->fs_ncg; cg++) {
361		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
362		    fs->fs_bsize, KERNCRED, 0, &nbp);
363		if (error)
364			goto out;
365		bawrite(nbp);
366		if (cg % 10 == 0)
367			ffs_syncvnode(vp, MNT_WAIT, 0);
368	}
369	/*
370	 * Copy all the cylinder group maps. Although the
371	 * filesystem is still active, we hope that only a few
372	 * cylinder groups will change between now and when we
373	 * suspend operations. Thus, we will be able to quickly
374	 * touch up the few cylinder groups that changed during
375	 * the suspension period.
376	 */
377	len = howmany(fs->fs_ncg, NBBY);
378	space = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
379	UFS_LOCK(ump);
380	fs->fs_active = space;
381	UFS_UNLOCK(ump);
382	for (cg = 0; cg < fs->fs_ncg; cg++) {
383		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
384		    fs->fs_bsize, KERNCRED, 0, &nbp);
385		if (error)
386			goto out;
387		error = cgaccount(cg, vp, nbp, 1);
388		bawrite(nbp);
389		if (cg % 10 == 0)
390			ffs_syncvnode(vp, MNT_WAIT, 0);
391		if (error)
392			goto out;
393	}
394	/*
395	 * Change inode to snapshot type file.
396	 */
397	ip->i_flags |= SF_SNAPSHOT;
398	DIP_SET(ip, i_flags, ip->i_flags);
399	ip->i_flag |= IN_CHANGE | IN_UPDATE;
400	/*
401	 * Ensure that the snapshot is completely on disk.
402	 * Since we have marked it as a snapshot it is safe to
403	 * unlock it as no process will be allowed to write to it.
404	 */
405	if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0)
406		goto out;
407	VOP_UNLOCK(vp, 0);
408	/*
409	 * All allocations are done, so we can now snapshot the system.
410	 *
411	 * Recind nice scheduling while running with the filesystem suspended.
412	 */
413	if (td->td_proc->p_nice > 0) {
414		struct proc *p;
415
416		p = td->td_proc;
417		PROC_LOCK(p);
418		saved_nice = p->p_nice;
419		sched_nice(p, 0);
420		PROC_UNLOCK(p);
421	}
422	/*
423	 * Suspend operation on filesystem.
424	 */
425	for (;;) {
426		vn_finished_write(wrtmp);
427		if ((error = vfs_write_suspend(vp->v_mount, 0)) != 0) {
428			vn_start_write(NULL, &wrtmp, V_WAIT);
429			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
430			goto out;
431		}
432		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
433			break;
434		vn_start_write(NULL, &wrtmp, V_WAIT);
435	}
436	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
437	if (ip->i_effnlink == 0) {
438		error = ENOENT;		/* Snapshot file unlinked */
439		goto out1;
440	}
441	if (collectsnapstats)
442		nanotime(&starttime);
443
444	/* The last block might have changed.  Copy it again to be sure. */
445	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
446	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
447	if (error != 0)
448		goto out1;
449	error = readblock(vp, bp, numblks - 1);
450	bp->b_flags |= B_VALIDSUSPWRT;
451	bawrite(bp);
452	if (error != 0)
453		goto out1;
454	/*
455	 * First, copy all the cylinder group maps that have changed.
456	 */
457	for (cg = 0; cg < fs->fs_ncg; cg++) {
458		if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
459			continue;
460		redo++;
461		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
462		    fs->fs_bsize, KERNCRED, 0, &nbp);
463		if (error)
464			goto out1;
465		error = cgaccount(cg, vp, nbp, 2);
466		bawrite(nbp);
467		if (error)
468			goto out1;
469	}
470	/*
471	 * Grab a copy of the superblock and its summary information.
472	 * We delay writing it until the suspension is released below.
473	 */
474	copy_fs = malloc((u_long)fs->fs_bsize, M_UFSMNT, M_WAITOK);
475	bcopy(fs, copy_fs, fs->fs_sbsize);
476	if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
477		copy_fs->fs_clean = 1;
478	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
479	if (fs->fs_sbsize < size)
480		bzero(&((char *)copy_fs)[fs->fs_sbsize],
481		    size - fs->fs_sbsize);
482	size = blkroundup(fs, fs->fs_cssize);
483	if (fs->fs_contigsumsize > 0)
484		size += fs->fs_ncg * sizeof(int32_t);
485	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
486	copy_fs->fs_csp = space;
487	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
488	space = (char *)space + fs->fs_cssize;
489	loc = howmany(fs->fs_cssize, fs->fs_fsize);
490	i = fs->fs_frag - loc % fs->fs_frag;
491	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
492	if (len > 0) {
493		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
494		    len, KERNCRED, &bp)) != 0) {
495			brelse(bp);
496			free(copy_fs->fs_csp, M_UFSMNT);
497			free(copy_fs, M_UFSMNT);
498			copy_fs = NULL;
499			goto out1;
500		}
501		bcopy(bp->b_data, space, (u_int)len);
502		space = (char *)space + len;
503		bp->b_flags |= B_INVAL | B_NOCACHE;
504		brelse(bp);
505	}
506	if (fs->fs_contigsumsize > 0) {
507		copy_fs->fs_maxcluster = lp = space;
508		for (i = 0; i < fs->fs_ncg; i++)
509			*lp++ = fs->fs_contigsumsize;
510	}
511	/*
512	 * We must check for active files that have been unlinked
513	 * (e.g., with a zero link count). We have to expunge all
514	 * trace of these files from the snapshot so that they are
515	 * not reclaimed prematurely by fsck or unnecessarily dumped.
516	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
517	 * spec_strategy about writing on a suspended filesystem.
518	 * Note that we skip unlinked snapshot files as they will
519	 * be handled separately below.
520	 *
521	 * We also calculate the needed size for the snapshot list.
522	 */
523	snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
524	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
525	MNT_ILOCK(mp);
526	mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
527	MNT_IUNLOCK(mp);
528loop:
529	MNT_VNODE_FOREACH_ALL(xvp, mp, mvp) {
530		if ((xvp->v_usecount == 0 &&
531		     (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) ||
532		    xvp->v_type == VNON ||
533		    IS_SNAPSHOT(VTOI(xvp))) {
534			VI_UNLOCK(xvp);
535			continue;
536		}
537		/*
538		 * We can skip parent directory vnode because it must have
539		 * this snapshot file in it.
540		 */
541		if (xvp == nd.ni_dvp) {
542			VI_UNLOCK(xvp);
543			continue;
544		}
545		vholdl(xvp);
546		if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) {
547			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
548			vdrop(xvp);
549			goto loop;
550		}
551		VI_LOCK(xvp);
552		if (xvp->v_usecount == 0 &&
553		    (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) {
554			VI_UNLOCK(xvp);
555			VOP_UNLOCK(xvp, 0);
556			vdrop(xvp);
557			continue;
558		}
559		VI_UNLOCK(xvp);
560		if (snapdebug)
561			vprint("ffs_snapshot: busy vnode", xvp);
562		if (VOP_GETATTR(xvp, &vat, td->td_ucred) == 0 &&
563		    vat.va_nlink > 0) {
564			VOP_UNLOCK(xvp, 0);
565			vdrop(xvp);
566			continue;
567		}
568		xp = VTOI(xvp);
569		if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
570			VOP_UNLOCK(xvp, 0);
571			vdrop(xvp);
572			continue;
573		}
574		/*
575		 * If there is a fragment, clear it here.
576		 */
577		blkno = 0;
578		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
579		if (loc < NDADDR) {
580			len = fragroundup(fs, blkoff(fs, xp->i_size));
581			if (len != 0 && len < fs->fs_bsize) {
582				ffs_blkfree(ump, copy_fs, vp,
583				    DIP(xp, i_db[loc]), len, xp->i_number,
584				    xvp->v_type, NULL);
585				blkno = DIP(xp, i_db[loc]);
586				DIP_SET(xp, i_db[loc], 0);
587			}
588		}
589		snaplistsize += 1;
590		if (xp->i_ump->um_fstype == UFS1)
591			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
592			    BLK_NOCOPY, 1);
593		else
594			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
595			    BLK_NOCOPY, 1);
596		if (blkno)
597			DIP_SET(xp, i_db[loc], blkno);
598		if (!error)
599			error = ffs_freefile(ump, copy_fs, vp, xp->i_number,
600			    xp->i_mode, NULL);
601		VOP_UNLOCK(xvp, 0);
602		vdrop(xvp);
603		if (error) {
604			free(copy_fs->fs_csp, M_UFSMNT);
605			free(copy_fs, M_UFSMNT);
606			copy_fs = NULL;
607			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
608			goto out1;
609		}
610	}
611	/*
612	 * Erase the journal file from the snapshot.
613	 */
614	if (fs->fs_flags & FS_SUJ) {
615		error = softdep_journal_lookup(mp, &xvp);
616		if (error) {
617			free(copy_fs->fs_csp, M_UFSMNT);
618			free(copy_fs, M_UFSMNT);
619			copy_fs = NULL;
620			goto out1;
621		}
622		xp = VTOI(xvp);
623		if (xp->i_ump->um_fstype == UFS1)
624			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
625			    BLK_NOCOPY, 0);
626		else
627			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
628			    BLK_NOCOPY, 0);
629		vput(xvp);
630	}
631	/*
632	 * Acquire a lock on the snapdata structure, creating it if necessary.
633	 */
634	sn = ffs_snapdata_acquire(devvp);
635	/*
636	 * Change vnode to use shared snapshot lock instead of the original
637	 * private lock.
638	 */
639	vp->v_vnlock = &sn->sn_lock;
640	lockmgr(&vp->v_lock, LK_RELEASE, NULL);
641	xp = TAILQ_FIRST(&sn->sn_head);
642	/*
643	 * If this is the first snapshot on this filesystem, then we need
644	 * to allocate the space for the list of preallocated snapshot blocks.
645	 * This list will be refined below, but this preliminary one will
646	 * keep us out of deadlock until the full one is ready.
647	 */
648	if (xp == NULL) {
649		snapblklist = malloc(snaplistsize * sizeof(daddr_t),
650		    M_UFSMNT, M_WAITOK);
651		blkp = &snapblklist[1];
652		*blkp++ = lblkno(fs, fs->fs_sblockloc);
653		blkno = fragstoblks(fs, fs->fs_csaddr);
654		for (cg = 0; cg < fs->fs_ncg; cg++) {
655			if (fragstoblks(fs, cgtod(fs, cg) > blkno))
656				break;
657			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
658		}
659		len = howmany(fs->fs_cssize, fs->fs_bsize);
660		for (loc = 0; loc < len; loc++)
661			*blkp++ = blkno + loc;
662		for (; cg < fs->fs_ncg; cg++)
663			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
664		snapblklist[0] = blkp - snapblklist;
665		VI_LOCK(devvp);
666		if (sn->sn_blklist != NULL)
667			panic("ffs_snapshot: non-empty list");
668		sn->sn_blklist = snapblklist;
669		sn->sn_listsize = blkp - snapblklist;
670		VI_UNLOCK(devvp);
671	}
672	/*
673	 * Record snapshot inode. Since this is the newest snapshot,
674	 * it must be placed at the end of the list.
675	 */
676	VI_LOCK(devvp);
677	fs->fs_snapinum[snaploc] = ip->i_number;
678	if (ip->i_nextsnap.tqe_prev != 0)
679		panic("ffs_snapshot: %ju already on list",
680		    (uintmax_t)ip->i_number);
681	TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
682	devvp->v_vflag |= VV_COPYONWRITE;
683	VI_UNLOCK(devvp);
684	ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
685out1:
686	KASSERT((sn != NULL && copy_fs != NULL && error == 0) ||
687		(sn == NULL && copy_fs == NULL && error != 0),
688		("email phk@ and mckusick@"));
689	/*
690	 * Resume operation on filesystem.
691	 */
692	vfs_write_resume(vp->v_mount, VR_START_WRITE | VR_NO_SUSPCLR);
693	if (collectsnapstats && starttime.tv_sec > 0) {
694		nanotime(&endtime);
695		timespecsub(&endtime, &starttime);
696		printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
697		    vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec,
698		    endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
699	}
700	if (copy_fs == NULL)
701		goto out;
702	/*
703	 * Copy allocation information from all the snapshots in
704	 * this snapshot and then expunge them from its view.
705	 */
706	TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) {
707		if (xp == ip)
708			break;
709		if (xp->i_ump->um_fstype == UFS1)
710			error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
711			    BLK_SNAP, 0);
712		else
713			error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
714			    BLK_SNAP, 0);
715		if (error == 0 && xp->i_effnlink == 0) {
716			error = ffs_freefile(ump,
717					     copy_fs,
718					     vp,
719					     xp->i_number,
720					     xp->i_mode, NULL);
721		}
722		if (error) {
723			fs->fs_snapinum[snaploc] = 0;
724			goto done;
725		}
726	}
727	/*
728	 * Allocate space for the full list of preallocated snapshot blocks.
729	 */
730	snapblklist = malloc(snaplistsize * sizeof(daddr_t),
731	    M_UFSMNT, M_WAITOK);
732	ip->i_snapblklist = &snapblklist[1];
733	/*
734	 * Expunge the blocks used by the snapshots from the set of
735	 * blocks marked as used in the snapshot bitmaps. Also, collect
736	 * the list of allocated blocks in i_snapblklist.
737	 */
738	if (ip->i_ump->um_fstype == UFS1)
739		error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1,
740		    BLK_SNAP, 0);
741	else
742		error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2,
743		    BLK_SNAP, 0);
744	if (error) {
745		fs->fs_snapinum[snaploc] = 0;
746		free(snapblklist, M_UFSMNT);
747		goto done;
748	}
749	if (snaplistsize < ip->i_snapblklist - snapblklist)
750		panic("ffs_snapshot: list too small");
751	snaplistsize = ip->i_snapblklist - snapblklist;
752	snapblklist[0] = snaplistsize;
753	ip->i_snapblklist = 0;
754	/*
755	 * Write out the list of allocated blocks to the end of the snapshot.
756	 */
757	auio.uio_iov = &aiov;
758	auio.uio_iovcnt = 1;
759	aiov.iov_base = (void *)snapblklist;
760	aiov.iov_len = snaplistsize * sizeof(daddr_t);
761	auio.uio_resid = aiov.iov_len;
762	auio.uio_offset = ip->i_size;
763	auio.uio_segflg = UIO_SYSSPACE;
764	auio.uio_rw = UIO_WRITE;
765	auio.uio_td = td;
766	if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
767		fs->fs_snapinum[snaploc] = 0;
768		free(snapblklist, M_UFSMNT);
769		goto done;
770	}
771	/*
772	 * Write the superblock and its summary information
773	 * to the snapshot.
774	 */
775	blkno = fragstoblks(fs, fs->fs_csaddr);
776	len = howmany(fs->fs_cssize, fs->fs_bsize);
777	space = copy_fs->fs_csp;
778	for (loc = 0; loc < len; loc++) {
779		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
780		if (error) {
781			brelse(nbp);
782			fs->fs_snapinum[snaploc] = 0;
783			free(snapblklist, M_UFSMNT);
784			goto done;
785		}
786		bcopy(space, nbp->b_data, fs->fs_bsize);
787		space = (char *)space + fs->fs_bsize;
788		bawrite(nbp);
789	}
790	error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize,
791	    KERNCRED, &nbp);
792	if (error) {
793		brelse(nbp);
794	} else {
795		loc = blkoff(fs, fs->fs_sblockloc);
796		bcopy((char *)copy_fs, &nbp->b_data[loc], (u_int)fs->fs_sbsize);
797		bawrite(nbp);
798	}
799	/*
800	 * As this is the newest list, it is the most inclusive, so
801	 * should replace the previous list.
802	 */
803	VI_LOCK(devvp);
804	space = sn->sn_blklist;
805	sn->sn_blklist = snapblklist;
806	sn->sn_listsize = snaplistsize;
807	VI_UNLOCK(devvp);
808	if (space != NULL)
809		free(space, M_UFSMNT);
810	/*
811	 * Preallocate all the direct blocks in the snapshot inode so
812	 * that we never have to write the inode itself to commit an
813	 * update to the contents of the snapshot. Note that once
814	 * created, the size of the snapshot will never change, so
815	 * there will never be a need to write the inode except to
816	 * update the non-integrity-critical time fields and
817	 * allocated-block count.
818	 */
819	for (blockno = 0; blockno < NDADDR; blockno++) {
820		if (DIP(ip, i_db[blockno]) != 0)
821			continue;
822		error = UFS_BALLOC(vp, lblktosize(fs, blockno),
823		    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
824		if (error)
825			break;
826		error = readblock(vp, bp, blockno);
827		bawrite(bp);
828		if (error != 0)
829			break;
830	}
831done:
832	free(copy_fs->fs_csp, M_UFSMNT);
833	free(copy_fs, M_UFSMNT);
834	copy_fs = NULL;
835out:
836	NDFREE(&nd, NDF_ONLY_PNBUF);
837	if (saved_nice > 0) {
838		struct proc *p;
839
840		p = td->td_proc;
841		PROC_LOCK(p);
842		sched_nice(td->td_proc, saved_nice);
843		PROC_UNLOCK(td->td_proc);
844	}
845	UFS_LOCK(ump);
846	if (fs->fs_active != 0) {
847		free(fs->fs_active, M_DEVBUF);
848		fs->fs_active = 0;
849	}
850	UFS_UNLOCK(ump);
851	MNT_ILOCK(mp);
852	mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
853	MNT_IUNLOCK(mp);
854	if (error)
855		(void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
856	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
857	if (error)
858		vput(vp);
859	else
860		VOP_UNLOCK(vp, 0);
861	vrele(nd.ni_dvp);
862	vn_finished_write(wrtmp);
863	process_deferred_inactive(mp);
864	return (error);
865}
866
867/*
868 * Copy a cylinder group map. All the unallocated blocks are marked
869 * BLK_NOCOPY so that the snapshot knows that it need not copy them
870 * if they are later written. If passno is one, then this is a first
871 * pass, so only setting needs to be done. If passno is 2, then this
872 * is a revision to a previous pass which must be undone as the
873 * replacement pass is done.
874 */
875static int
876cgaccount(cg, vp, nbp, passno)
877	int cg;
878	struct vnode *vp;
879	struct buf *nbp;
880	int passno;
881{
882	struct buf *bp, *ibp;
883	struct inode *ip;
884	struct cg *cgp;
885	struct fs *fs;
886	ufs2_daddr_t base, numblks;
887	int error, len, loc, indiroff;
888
889	ip = VTOI(vp);
890	fs = ip->i_fs;
891	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
892		(int)fs->fs_cgsize, KERNCRED, &bp);
893	if (error) {
894		brelse(bp);
895		return (error);
896	}
897	cgp = (struct cg *)bp->b_data;
898	if (!cg_chkmagic(cgp)) {
899		brelse(bp);
900		return (EIO);
901	}
902	UFS_LOCK(ip->i_ump);
903	ACTIVESET(fs, cg);
904	/*
905	 * Recomputation of summary information might not have been performed
906	 * at mount time.  Sync up summary information for current cylinder
907	 * group while data is in memory to ensure that result of background
908	 * fsck is slightly more consistent.
909	 */
910	fs->fs_cs(fs, cg) = cgp->cg_cs;
911	UFS_UNLOCK(ip->i_ump);
912	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
913	if (fs->fs_cgsize < fs->fs_bsize)
914		bzero(&nbp->b_data[fs->fs_cgsize],
915		    fs->fs_bsize - fs->fs_cgsize);
916	cgp = (struct cg *)nbp->b_data;
917	bqrelse(bp);
918	if (passno == 2)
919		nbp->b_flags |= B_VALIDSUSPWRT;
920	numblks = howmany(fs->fs_size, fs->fs_frag);
921	len = howmany(fs->fs_fpg, fs->fs_frag);
922	base = cgbase(fs, cg) / fs->fs_frag;
923	if (base + len >= numblks)
924		len = numblks - base - 1;
925	loc = 0;
926	if (base < NDADDR) {
927		for ( ; loc < NDADDR; loc++) {
928			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
929				DIP_SET(ip, i_db[loc], BLK_NOCOPY);
930			else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY)
931				DIP_SET(ip, i_db[loc], 0);
932			else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY)
933				panic("ffs_snapshot: lost direct block");
934		}
935	}
936	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
937	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
938	if (error) {
939		return (error);
940	}
941	indiroff = (base + loc - NDADDR) % NINDIR(fs);
942	for ( ; loc < len; loc++, indiroff++) {
943		if (indiroff >= NINDIR(fs)) {
944			if (passno == 2)
945				ibp->b_flags |= B_VALIDSUSPWRT;
946			bawrite(ibp);
947			error = UFS_BALLOC(vp,
948			    lblktosize(fs, (off_t)(base + loc)),
949			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
950			if (error) {
951				return (error);
952			}
953			indiroff = 0;
954		}
955		if (ip->i_ump->um_fstype == UFS1) {
956			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
957				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
958				    BLK_NOCOPY;
959			else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
960			    [indiroff] == BLK_NOCOPY)
961				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
962			else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
963			    [indiroff] == BLK_NOCOPY)
964				panic("ffs_snapshot: lost indirect block");
965			continue;
966		}
967		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
968			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
969		else if (passno == 2 &&
970		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
971			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
972		else if (passno == 1 &&
973		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
974			panic("ffs_snapshot: lost indirect block");
975	}
976	if (passno == 2)
977		ibp->b_flags |= B_VALIDSUSPWRT;
978	bdwrite(ibp);
979	return (0);
980}
981
982/*
983 * Before expunging a snapshot inode, note all the
984 * blocks that it claims with BLK_SNAP so that fsck will
985 * be able to account for those blocks properly and so
986 * that this snapshot knows that it need not copy them
987 * if the other snapshot holding them is freed. This code
988 * is reproduced once each for UFS1 and UFS2.
989 */
990static int
991expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype, clearmode)
992	struct vnode *snapvp;
993	struct inode *cancelip;
994	struct fs *fs;
995	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
996	    struct fs *, ufs_lbn_t, int);
997	int expungetype;
998	int clearmode;
999{
1000	int i, error, indiroff;
1001	ufs_lbn_t lbn, rlbn;
1002	ufs2_daddr_t len, blkno, numblks, blksperindir;
1003	struct ufs1_dinode *dip;
1004	struct thread *td = curthread;
1005	struct buf *bp;
1006
1007	/*
1008	 * Prepare to expunge the inode. If its inode block has not
1009	 * yet been copied, then allocate and fill the copy.
1010	 */
1011	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
1012	blkno = 0;
1013	if (lbn < NDADDR) {
1014		blkno = VTOI(snapvp)->i_din1->di_db[lbn];
1015	} else {
1016		if (DOINGSOFTDEP(snapvp))
1017			softdep_prealloc(snapvp, MNT_WAIT);
1018		td->td_pflags |= TDP_COWINPROGRESS;
1019		error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
1020		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
1021		td->td_pflags &= ~TDP_COWINPROGRESS;
1022		if (error)
1023			return (error);
1024		indiroff = (lbn - NDADDR) % NINDIR(fs);
1025		blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
1026		bqrelse(bp);
1027	}
1028	if (blkno != 0) {
1029		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
1030			return (error);
1031	} else {
1032		error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
1033		    fs->fs_bsize, KERNCRED, 0, &bp);
1034		if (error)
1035			return (error);
1036		if ((error = readblock(snapvp, bp, lbn)) != 0)
1037			return (error);
1038	}
1039	/*
1040	 * Set a snapshot inode to be a zero length file, regular files
1041	 * or unlinked snapshots to be completely unallocated.
1042	 */
1043	dip = (struct ufs1_dinode *)bp->b_data +
1044	    ino_to_fsbo(fs, cancelip->i_number);
1045	if (clearmode || cancelip->i_effnlink == 0)
1046		dip->di_mode = 0;
1047	dip->di_size = 0;
1048	dip->di_blocks = 0;
1049	dip->di_flags &= ~SF_SNAPSHOT;
1050	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
1051	bdwrite(bp);
1052	/*
1053	 * Now go through and expunge all the blocks in the file
1054	 * using the function requested.
1055	 */
1056	numblks = howmany(cancelip->i_size, fs->fs_bsize);
1057	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
1058	    &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype)))
1059		return (error);
1060	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
1061	    &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype)))
1062		return (error);
1063	blksperindir = 1;
1064	lbn = -NDADDR;
1065	len = numblks - NDADDR;
1066	rlbn = NDADDR;
1067	for (i = 0; len > 0 && i < NIADDR; i++) {
1068		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
1069		    cancelip->i_din1->di_ib[i], lbn, rlbn, len,
1070		    blksperindir, fs, acctfunc, expungetype);
1071		if (error)
1072			return (error);
1073		blksperindir *= NINDIR(fs);
1074		lbn -= blksperindir + 1;
1075		len -= blksperindir;
1076		rlbn += blksperindir;
1077	}
1078	return (0);
1079}
1080
1081/*
1082 * Descend an indirect block chain for vnode cancelvp accounting for all
1083 * its indirect blocks in snapvp.
1084 */
1085static int
1086indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
1087	    blksperindir, fs, acctfunc, expungetype)
1088	struct vnode *snapvp;
1089	struct vnode *cancelvp;
1090	int level;
1091	ufs1_daddr_t blkno;
1092	ufs_lbn_t lbn;
1093	ufs_lbn_t rlbn;
1094	ufs_lbn_t remblks;
1095	ufs_lbn_t blksperindir;
1096	struct fs *fs;
1097	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
1098	    struct fs *, ufs_lbn_t, int);
1099	int expungetype;
1100{
1101	int error, num, i;
1102	ufs_lbn_t subblksperindir;
1103	struct indir indirs[NIADDR + 2];
1104	ufs1_daddr_t last, *bap;
1105	struct buf *bp;
1106
1107	if (blkno == 0) {
1108		if (expungetype == BLK_NOCOPY)
1109			return (0);
1110		panic("indiracct_ufs1: missing indir");
1111	}
1112	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
1113		return (error);
1114	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
1115		panic("indiracct_ufs1: botched params");
1116	/*
1117	 * We have to expand bread here since it will deadlock looking
1118	 * up the block number for any blocks that are not in the cache.
1119	 */
1120	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
1121	bp->b_blkno = fsbtodb(fs, blkno);
1122	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
1123	    (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
1124		brelse(bp);
1125		return (error);
1126	}
1127	/*
1128	 * Account for the block pointers in this indirect block.
1129	 */
1130	last = howmany(remblks, blksperindir);
1131	if (last > NINDIR(fs))
1132		last = NINDIR(fs);
1133	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK);
1134	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
1135	bqrelse(bp);
1136	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
1137	    level == 0 ? rlbn : -1, expungetype);
1138	if (error || level == 0)
1139		goto out;
1140	/*
1141	 * Account for the block pointers in each of the indirect blocks
1142	 * in the levels below us.
1143	 */
1144	subblksperindir = blksperindir / NINDIR(fs);
1145	for (lbn++, level--, i = 0; i < last; i++) {
1146		error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
1147		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
1148		if (error)
1149			goto out;
1150		rlbn += blksperindir;
1151		lbn -= blksperindir;
1152		remblks -= blksperindir;
1153	}
1154out:
1155	free(bap, M_DEVBUF);
1156	return (error);
1157}
1158
1159/*
1160 * Do both snap accounting and map accounting.
1161 */
1162static int
1163fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
1164	struct vnode *vp;
1165	ufs1_daddr_t *oldblkp, *lastblkp;
1166	struct fs *fs;
1167	ufs_lbn_t lblkno;
1168	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
1169{
1170	int error;
1171
1172	if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
1173		return (error);
1174	return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
1175}
1176
1177/*
1178 * Identify a set of blocks allocated in a snapshot inode.
1179 */
1180static int
1181snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
1182	struct vnode *vp;
1183	ufs1_daddr_t *oldblkp, *lastblkp;
1184	struct fs *fs;
1185	ufs_lbn_t lblkno;
1186	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
1187{
1188	struct inode *ip = VTOI(vp);
1189	ufs1_daddr_t blkno, *blkp;
1190	ufs_lbn_t lbn;
1191	struct buf *ibp;
1192	int error;
1193
1194	for ( ; oldblkp < lastblkp; oldblkp++) {
1195		blkno = *oldblkp;
1196		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
1197			continue;
1198		lbn = fragstoblks(fs, blkno);
1199		if (lbn < NDADDR) {
1200			blkp = &ip->i_din1->di_db[lbn];
1201			ip->i_flag |= IN_CHANGE | IN_UPDATE;
1202		} else {
1203			error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn),
1204			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
1205			if (error)
1206				return (error);
1207			blkp = &((ufs1_daddr_t *)(ibp->b_data))
1208			    [(lbn - NDADDR) % NINDIR(fs)];
1209		}
1210		/*
1211		 * If we are expunging a snapshot vnode and we
1212		 * find a block marked BLK_NOCOPY, then it is
1213		 * one that has been allocated to this snapshot after
1214		 * we took our current snapshot and can be ignored.
1215		 */
1216		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
1217			if (lbn >= NDADDR)
1218				brelse(ibp);
1219		} else {
1220			if (*blkp != 0)
1221				panic("snapacct_ufs1: bad block");
1222			*blkp = expungetype;
1223			if (lbn >= NDADDR)
1224				bdwrite(ibp);
1225		}
1226	}
1227	return (0);
1228}
1229
1230/*
1231 * Account for a set of blocks allocated in a snapshot inode.
1232 */
1233static int
1234mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
1235	struct vnode *vp;
1236	ufs1_daddr_t *oldblkp, *lastblkp;
1237	struct fs *fs;
1238	ufs_lbn_t lblkno;
1239	int expungetype;
1240{
1241	ufs1_daddr_t blkno;
1242	struct inode *ip;
1243	ino_t inum;
1244	int acctit;
1245
1246	ip = VTOI(vp);
1247	inum = ip->i_number;
1248	if (lblkno == -1)
1249		acctit = 0;
1250	else
1251		acctit = 1;
1252	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
1253		blkno = *oldblkp;
1254		if (blkno == 0 || blkno == BLK_NOCOPY)
1255			continue;
1256		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
1257			*ip->i_snapblklist++ = lblkno;
1258		if (blkno == BLK_SNAP)
1259			blkno = blkstofrags(fs, lblkno);
1260		ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum,
1261		    vp->v_type, NULL);
1262	}
1263	return (0);
1264}
1265
1266/*
1267 * Before expunging a snapshot inode, note all the
1268 * blocks that it claims with BLK_SNAP so that fsck will
1269 * be able to account for those blocks properly and so
1270 * that this snapshot knows that it need not copy them
1271 * if the other snapshot holding them is freed. This code
1272 * is reproduced once each for UFS1 and UFS2.
1273 */
1274static int
1275expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype, clearmode)
1276	struct vnode *snapvp;
1277	struct inode *cancelip;
1278	struct fs *fs;
1279	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
1280	    struct fs *, ufs_lbn_t, int);
1281	int expungetype;
1282	int clearmode;
1283{
1284	int i, error, indiroff;
1285	ufs_lbn_t lbn, rlbn;
1286	ufs2_daddr_t len, blkno, numblks, blksperindir;
1287	struct ufs2_dinode *dip;
1288	struct thread *td = curthread;
1289	struct buf *bp;
1290
1291	/*
1292	 * Prepare to expunge the inode. If its inode block has not
1293	 * yet been copied, then allocate and fill the copy.
1294	 */
1295	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
1296	blkno = 0;
1297	if (lbn < NDADDR) {
1298		blkno = VTOI(snapvp)->i_din2->di_db[lbn];
1299	} else {
1300		if (DOINGSOFTDEP(snapvp))
1301			softdep_prealloc(snapvp, MNT_WAIT);
1302		td->td_pflags |= TDP_COWINPROGRESS;
1303		error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
1304		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
1305		td->td_pflags &= ~TDP_COWINPROGRESS;
1306		if (error)
1307			return (error);
1308		indiroff = (lbn - NDADDR) % NINDIR(fs);
1309		blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
1310		bqrelse(bp);
1311	}
1312	if (blkno != 0) {
1313		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
1314			return (error);
1315	} else {
1316		error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
1317		    fs->fs_bsize, KERNCRED, 0, &bp);
1318		if (error)
1319			return (error);
1320		if ((error = readblock(snapvp, bp, lbn)) != 0)
1321			return (error);
1322	}
1323	/*
1324	 * Set a snapshot inode to be a zero length file, regular files
1325	 * to be completely unallocated.
1326	 */
1327	dip = (struct ufs2_dinode *)bp->b_data +
1328	    ino_to_fsbo(fs, cancelip->i_number);
1329	if (clearmode || cancelip->i_effnlink == 0)
1330		dip->di_mode = 0;
1331	dip->di_size = 0;
1332	dip->di_blocks = 0;
1333	dip->di_flags &= ~SF_SNAPSHOT;
1334	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
1335	bdwrite(bp);
1336	/*
1337	 * Now go through and expunge all the blocks in the file
1338	 * using the function requested.
1339	 */
1340	numblks = howmany(cancelip->i_size, fs->fs_bsize);
1341	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
1342	    &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype)))
1343		return (error);
1344	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0],
1345	    &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype)))
1346		return (error);
1347	blksperindir = 1;
1348	lbn = -NDADDR;
1349	len = numblks - NDADDR;
1350	rlbn = NDADDR;
1351	for (i = 0; len > 0 && i < NIADDR; i++) {
1352		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
1353		    cancelip->i_din2->di_ib[i], lbn, rlbn, len,
1354		    blksperindir, fs, acctfunc, expungetype);
1355		if (error)
1356			return (error);
1357		blksperindir *= NINDIR(fs);
1358		lbn -= blksperindir + 1;
1359		len -= blksperindir;
1360		rlbn += blksperindir;
1361	}
1362	return (0);
1363}
1364
1365/*
1366 * Descend an indirect block chain for vnode cancelvp accounting for all
1367 * its indirect blocks in snapvp.
1368 */
1369static int
1370indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
1371	    blksperindir, fs, acctfunc, expungetype)
1372	struct vnode *snapvp;
1373	struct vnode *cancelvp;
1374	int level;
1375	ufs2_daddr_t blkno;
1376	ufs_lbn_t lbn;
1377	ufs_lbn_t rlbn;
1378	ufs_lbn_t remblks;
1379	ufs_lbn_t blksperindir;
1380	struct fs *fs;
1381	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
1382	    struct fs *, ufs_lbn_t, int);
1383	int expungetype;
1384{
1385	int error, num, i;
1386	ufs_lbn_t subblksperindir;
1387	struct indir indirs[NIADDR + 2];
1388	ufs2_daddr_t last, *bap;
1389	struct buf *bp;
1390
1391	if (blkno == 0) {
1392		if (expungetype == BLK_NOCOPY)
1393			return (0);
1394		panic("indiracct_ufs2: missing indir");
1395	}
1396	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
1397		return (error);
1398	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
1399		panic("indiracct_ufs2: botched params");
1400	/*
1401	 * We have to expand bread here since it will deadlock looking
1402	 * up the block number for any blocks that are not in the cache.
1403	 */
1404	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
1405	bp->b_blkno = fsbtodb(fs, blkno);
1406	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
1407	    (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
1408		brelse(bp);
1409		return (error);
1410	}
1411	/*
1412	 * Account for the block pointers in this indirect block.
1413	 */
1414	last = howmany(remblks, blksperindir);
1415	if (last > NINDIR(fs))
1416		last = NINDIR(fs);
1417	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK);
1418	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
1419	bqrelse(bp);
1420	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
1421	    level == 0 ? rlbn : -1, expungetype);
1422	if (error || level == 0)
1423		goto out;
1424	/*
1425	 * Account for the block pointers in each of the indirect blocks
1426	 * in the levels below us.
1427	 */
1428	subblksperindir = blksperindir / NINDIR(fs);
1429	for (lbn++, level--, i = 0; i < last; i++) {
1430		error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
1431		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
1432		if (error)
1433			goto out;
1434		rlbn += blksperindir;
1435		lbn -= blksperindir;
1436		remblks -= blksperindir;
1437	}
1438out:
1439	free(bap, M_DEVBUF);
1440	return (error);
1441}
1442
1443/*
1444 * Do both snap accounting and map accounting.
1445 */
1446static int
1447fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
1448	struct vnode *vp;
1449	ufs2_daddr_t *oldblkp, *lastblkp;
1450	struct fs *fs;
1451	ufs_lbn_t lblkno;
1452	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
1453{
1454	int error;
1455
1456	if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
1457		return (error);
1458	return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
1459}
1460
1461/*
1462 * Identify a set of blocks allocated in a snapshot inode.
1463 */
1464static int
1465snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
1466	struct vnode *vp;
1467	ufs2_daddr_t *oldblkp, *lastblkp;
1468	struct fs *fs;
1469	ufs_lbn_t lblkno;
1470	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
1471{
1472	struct inode *ip = VTOI(vp);
1473	ufs2_daddr_t blkno, *blkp;
1474	ufs_lbn_t lbn;
1475	struct buf *ibp;
1476	int error;
1477
1478	for ( ; oldblkp < lastblkp; oldblkp++) {
1479		blkno = *oldblkp;
1480		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
1481			continue;
1482		lbn = fragstoblks(fs, blkno);
1483		if (lbn < NDADDR) {
1484			blkp = &ip->i_din2->di_db[lbn];
1485			ip->i_flag |= IN_CHANGE | IN_UPDATE;
1486		} else {
1487			error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn),
1488			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
1489			if (error)
1490				return (error);
1491			blkp = &((ufs2_daddr_t *)(ibp->b_data))
1492			    [(lbn - NDADDR) % NINDIR(fs)];
1493		}
1494		/*
1495		 * If we are expunging a snapshot vnode and we
1496		 * find a block marked BLK_NOCOPY, then it is
1497		 * one that has been allocated to this snapshot after
1498		 * we took our current snapshot and can be ignored.
1499		 */
1500		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
1501			if (lbn >= NDADDR)
1502				brelse(ibp);
1503		} else {
1504			if (*blkp != 0)
1505				panic("snapacct_ufs2: bad block");
1506			*blkp = expungetype;
1507			if (lbn >= NDADDR)
1508				bdwrite(ibp);
1509		}
1510	}
1511	return (0);
1512}
1513
1514/*
1515 * Account for a set of blocks allocated in a snapshot inode.
1516 */
1517static int
1518mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
1519	struct vnode *vp;
1520	ufs2_daddr_t *oldblkp, *lastblkp;
1521	struct fs *fs;
1522	ufs_lbn_t lblkno;
1523	int expungetype;
1524{
1525	ufs2_daddr_t blkno;
1526	struct inode *ip;
1527	ino_t inum;
1528	int acctit;
1529
1530	ip = VTOI(vp);
1531	inum = ip->i_number;
1532	if (lblkno == -1)
1533		acctit = 0;
1534	else
1535		acctit = 1;
1536	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
1537		blkno = *oldblkp;
1538		if (blkno == 0 || blkno == BLK_NOCOPY)
1539			continue;
1540		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
1541			*ip->i_snapblklist++ = lblkno;
1542		if (blkno == BLK_SNAP)
1543			blkno = blkstofrags(fs, lblkno);
1544		ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum,
1545		    vp->v_type, NULL);
1546	}
1547	return (0);
1548}
1549
1550/*
1551 * Decrement extra reference on snapshot when last name is removed.
1552 * It will not be freed until the last open reference goes away.
1553 */
1554void
1555ffs_snapgone(ip)
1556	struct inode *ip;
1557{
1558	struct inode *xp;
1559	struct fs *fs;
1560	int snaploc;
1561	struct snapdata *sn;
1562	struct ufsmount *ump;
1563
1564	/*
1565	 * Find snapshot in incore list.
1566	 */
1567	xp = NULL;
1568	sn = ip->i_devvp->v_rdev->si_snapdata;
1569	if (sn != NULL)
1570		TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap)
1571			if (xp == ip)
1572				break;
1573	if (xp != NULL)
1574		vrele(ITOV(ip));
1575	else if (snapdebug)
1576		printf("ffs_snapgone: lost snapshot vnode %ju\n",
1577		    (uintmax_t)ip->i_number);
1578	/*
1579	 * Delete snapshot inode from superblock. Keep list dense.
1580	 */
1581	fs = ip->i_fs;
1582	ump = ip->i_ump;
1583	UFS_LOCK(ump);
1584	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
1585		if (fs->fs_snapinum[snaploc] == ip->i_number)
1586			break;
1587	if (snaploc < FSMAXSNAP) {
1588		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
1589			if (fs->fs_snapinum[snaploc] == 0)
1590				break;
1591			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
1592		}
1593		fs->fs_snapinum[snaploc - 1] = 0;
1594	}
1595	UFS_UNLOCK(ump);
1596}
1597
1598/*
1599 * Prepare a snapshot file for being removed.
1600 */
1601void
1602ffs_snapremove(vp)
1603	struct vnode *vp;
1604{
1605	struct inode *ip;
1606	struct vnode *devvp;
1607	struct buf *ibp;
1608	struct fs *fs;
1609	ufs2_daddr_t numblks, blkno, dblk;
1610	int error, loc, last;
1611	struct snapdata *sn;
1612
1613	ip = VTOI(vp);
1614	fs = ip->i_fs;
1615	devvp = ip->i_devvp;
1616	/*
1617	 * If active, delete from incore list (this snapshot may
1618	 * already have been in the process of being deleted, so
1619	 * would not have been active).
1620	 *
1621	 * Clear copy-on-write flag if last snapshot.
1622	 */
1623	VI_LOCK(devvp);
1624	if (ip->i_nextsnap.tqe_prev != 0) {
1625		sn = devvp->v_rdev->si_snapdata;
1626		TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap);
1627		ip->i_nextsnap.tqe_prev = 0;
1628		VI_UNLOCK(devvp);
1629		lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);
1630		KASSERT(vp->v_vnlock == &sn->sn_lock,
1631			("ffs_snapremove: lost lock mutation"));
1632		vp->v_vnlock = &vp->v_lock;
1633		VI_LOCK(devvp);
1634		lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
1635		try_free_snapdata(devvp);
1636	} else
1637		VI_UNLOCK(devvp);
1638	/*
1639	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
1640	 * snapshots that want them (see ffs_snapblkfree below).
1641	 */
1642	for (blkno = 1; blkno < NDADDR; blkno++) {
1643		dblk = DIP(ip, i_db[blkno]);
1644		if (dblk == 0)
1645			continue;
1646		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
1647			DIP_SET(ip, i_db[blkno], 0);
1648		else if ((dblk == blkstofrags(fs, blkno) &&
1649		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
1650		     ip->i_number, vp->v_type, NULL))) {
1651			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) -
1652			    btodb(fs->fs_bsize));
1653			DIP_SET(ip, i_db[blkno], 0);
1654		}
1655	}
1656	numblks = howmany(ip->i_size, fs->fs_bsize);
1657	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
1658		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
1659		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
1660		if (error)
1661			continue;
1662		if (fs->fs_size - blkno > NINDIR(fs))
1663			last = NINDIR(fs);
1664		else
1665			last = fs->fs_size - blkno;
1666		for (loc = 0; loc < last; loc++) {
1667			if (ip->i_ump->um_fstype == UFS1) {
1668				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
1669				if (dblk == 0)
1670					continue;
1671				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
1672					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
1673				else if ((dblk == blkstofrags(fs, blkno) &&
1674				     ffs_snapblkfree(fs, ip->i_devvp, dblk,
1675				     fs->fs_bsize, ip->i_number, vp->v_type,
1676				     NULL))) {
1677					ip->i_din1->di_blocks -=
1678					    btodb(fs->fs_bsize);
1679					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
1680				}
1681				continue;
1682			}
1683			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
1684			if (dblk == 0)
1685				continue;
1686			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
1687				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
1688			else if ((dblk == blkstofrags(fs, blkno) &&
1689			     ffs_snapblkfree(fs, ip->i_devvp, dblk,
1690			     fs->fs_bsize, ip->i_number, vp->v_type, NULL))) {
1691				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
1692				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
1693			}
1694		}
1695		bawrite(ibp);
1696	}
1697	/*
1698	 * Clear snapshot flag and drop reference.
1699	 */
1700	ip->i_flags &= ~SF_SNAPSHOT;
1701	DIP_SET(ip, i_flags, ip->i_flags);
1702	ip->i_flag |= IN_CHANGE | IN_UPDATE;
1703	/*
1704	 * The dirtied indirects must be written out before
1705	 * softdep_setup_freeblocks() is called.  Otherwise indir_trunc()
1706	 * may find indirect pointers using the magic BLK_* values.
1707	 */
1708	if (DOINGSOFTDEP(vp))
1709		ffs_syncvnode(vp, MNT_WAIT, 0);
1710#ifdef QUOTA
1711	/*
1712	 * Reenable disk quotas for ex-snapshot file.
1713	 */
1714	if (!getinoquota(ip))
1715		(void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE);
1716#endif
1717}
1718
1719/*
1720 * Notification that a block is being freed. Return zero if the free
1721 * should be allowed to proceed. Return non-zero if the snapshot file
1722 * wants to claim the block. The block will be claimed if it is an
1723 * uncopied part of one of the snapshots. It will be freed if it is
1724 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
1725 * If a fragment is being freed, then all snapshots that care about
1726 * it must make a copy since a snapshot file can only claim full sized
1727 * blocks. Note that if more than one snapshot file maps the block,
1728 * we can pick one at random to claim it. Since none of the snapshots
1729 * can change, we are assurred that they will all see the same unmodified
1730 * image. When deleting a snapshot file (see ffs_snapremove above), we
1731 * must push any of these claimed blocks to one of the other snapshots
1732 * that maps it. These claimed blocks are easily identified as they will
1733 * have a block number equal to their logical block number within the
1734 * snapshot. A copied block can never have this property because they
1735 * must always have been allocated from a BLK_NOCOPY location.
1736 */
1737int
1738ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd)
1739	struct fs *fs;
1740	struct vnode *devvp;
1741	ufs2_daddr_t bno;
1742	long size;
1743	ino_t inum;
1744	enum vtype vtype;
1745	struct workhead *wkhd;
1746{
1747	struct buf *ibp, *cbp, *savedcbp = NULL;
1748	struct thread *td = curthread;
1749	struct inode *ip;
1750	struct vnode *vp = NULL;
1751	ufs_lbn_t lbn;
1752	ufs2_daddr_t blkno;
1753	int indiroff = 0, error = 0, claimedblk = 0;
1754	struct snapdata *sn;
1755
1756	lbn = fragstoblks(fs, bno);
1757retry:
1758	VI_LOCK(devvp);
1759	sn = devvp->v_rdev->si_snapdata;
1760	if (sn == NULL) {
1761		VI_UNLOCK(devvp);
1762		return (0);
1763	}
1764	if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
1765	    VI_MTX(devvp)) != 0)
1766		goto retry;
1767	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
1768		vp = ITOV(ip);
1769		if (DOINGSOFTDEP(vp))
1770			softdep_prealloc(vp, MNT_WAIT);
1771		/*
1772		 * Lookup block being written.
1773		 */
1774		if (lbn < NDADDR) {
1775			blkno = DIP(ip, i_db[lbn]);
1776		} else {
1777			td->td_pflags |= TDP_COWINPROGRESS;
1778			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
1779			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
1780			td->td_pflags &= ~TDP_COWINPROGRESS;
1781			if (error)
1782				break;
1783			indiroff = (lbn - NDADDR) % NINDIR(fs);
1784			if (ip->i_ump->um_fstype == UFS1)
1785				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
1786			else
1787				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
1788		}
1789		/*
1790		 * Check to see if block needs to be copied.
1791		 */
1792		if (blkno == 0) {
1793			/*
1794			 * A block that we map is being freed. If it has not
1795			 * been claimed yet, we will claim or copy it (below).
1796			 */
1797			claimedblk = 1;
1798		} else if (blkno == BLK_SNAP) {
1799			/*
1800			 * No previous snapshot claimed the block,
1801			 * so it will be freed and become a BLK_NOCOPY
1802			 * (don't care) for us.
1803			 */
1804			if (claimedblk)
1805				panic("snapblkfree: inconsistent block type");
1806			if (lbn < NDADDR) {
1807				DIP_SET(ip, i_db[lbn], BLK_NOCOPY);
1808				ip->i_flag |= IN_CHANGE | IN_UPDATE;
1809			} else if (ip->i_ump->um_fstype == UFS1) {
1810				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
1811				    BLK_NOCOPY;
1812				bdwrite(ibp);
1813			} else {
1814				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
1815				    BLK_NOCOPY;
1816				bdwrite(ibp);
1817			}
1818			continue;
1819		} else /* BLK_NOCOPY or default */ {
1820			/*
1821			 * If the snapshot has already copied the block
1822			 * (default), or does not care about the block,
1823			 * it is not needed.
1824			 */
1825			if (lbn >= NDADDR)
1826				bqrelse(ibp);
1827			continue;
1828		}
1829		/*
1830		 * If this is a full size block, we will just grab it
1831		 * and assign it to the snapshot inode. Otherwise we
1832		 * will proceed to copy it. See explanation for this
1833		 * routine as to why only a single snapshot needs to
1834		 * claim this block.
1835		 */
1836		if (size == fs->fs_bsize) {
1837#ifdef DEBUG
1838			if (snapdebug)
1839				printf("%s %ju lbn %jd from inum %ju\n",
1840				    "Grabonremove: snapino",
1841				    (uintmax_t)ip->i_number,
1842				    (intmax_t)lbn, (uintmax_t)inum);
1843#endif
1844			/*
1845			 * If journaling is tracking this write we must add
1846			 * the work to the inode or indirect being written.
1847			 */
1848			if (wkhd != NULL) {
1849				if (lbn < NDADDR)
1850					softdep_inode_append(ip,
1851					    curthread->td_ucred, wkhd);
1852				else
1853					softdep_buf_append(ibp, wkhd);
1854			}
1855			if (lbn < NDADDR) {
1856				DIP_SET(ip, i_db[lbn], bno);
1857			} else if (ip->i_ump->um_fstype == UFS1) {
1858				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
1859				bdwrite(ibp);
1860			} else {
1861				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
1862				bdwrite(ibp);
1863			}
1864			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size));
1865			ip->i_flag |= IN_CHANGE | IN_UPDATE;
1866			lockmgr(vp->v_vnlock, LK_RELEASE, NULL);
1867			return (1);
1868		}
1869		if (lbn >= NDADDR)
1870			bqrelse(ibp);
1871		/*
1872		 * Allocate the block into which to do the copy. Note that this
1873		 * allocation will never require any additional allocations for
1874		 * the snapshot inode.
1875		 */
1876		td->td_pflags |= TDP_COWINPROGRESS;
1877		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
1878		    fs->fs_bsize, KERNCRED, 0, &cbp);
1879		td->td_pflags &= ~TDP_COWINPROGRESS;
1880		if (error)
1881			break;
1882#ifdef DEBUG
1883		if (snapdebug)
1884			printf("%s%ju lbn %jd %s %ju size %ld to blkno %jd\n",
1885			    "Copyonremove: snapino ", (uintmax_t)ip->i_number,
1886			    (intmax_t)lbn, "for inum", (uintmax_t)inum, size,
1887			    (intmax_t)cbp->b_blkno);
1888#endif
1889		/*
1890		 * If we have already read the old block contents, then
1891		 * simply copy them to the new block. Note that we need
1892		 * to synchronously write snapshots that have not been
1893		 * unlinked, and hence will be visible after a crash,
1894		 * to ensure their integrity. At a minimum we ensure the
1895		 * integrity of the filesystem metadata, but use the
1896		 * dopersistence sysctl-setable flag to decide on the
1897		 * persistence needed for file content data.
1898		 */
1899		if (savedcbp != NULL) {
1900			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
1901			bawrite(cbp);
1902			if ((vtype == VDIR || dopersistence) &&
1903			    ip->i_effnlink > 0)
1904				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
1905			continue;
1906		}
1907		/*
1908		 * Otherwise, read the old block contents into the buffer.
1909		 */
1910		if ((error = readblock(vp, cbp, lbn)) != 0) {
1911			bzero(cbp->b_data, fs->fs_bsize);
1912			bawrite(cbp);
1913			if ((vtype == VDIR || dopersistence) &&
1914			    ip->i_effnlink > 0)
1915				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
1916			break;
1917		}
1918		savedcbp = cbp;
1919	}
1920	/*
1921	 * Note that we need to synchronously write snapshots that
1922	 * have not been unlinked, and hence will be visible after
1923	 * a crash, to ensure their integrity. At a minimum we
1924	 * ensure the integrity of the filesystem metadata, but
1925	 * use the dopersistence sysctl-setable flag to decide on
1926	 * the persistence needed for file content data.
1927	 */
1928	if (savedcbp) {
1929		vp = savedcbp->b_vp;
1930		bawrite(savedcbp);
1931		if ((vtype == VDIR || dopersistence) &&
1932		    VTOI(vp)->i_effnlink > 0)
1933			(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
1934	}
1935	/*
1936	 * If we have been unable to allocate a block in which to do
1937	 * the copy, then return non-zero so that the fragment will
1938	 * not be freed. Although space will be lost, the snapshot
1939	 * will stay consistent.
1940	 */
1941	if (error != 0 && wkhd != NULL)
1942		softdep_freework(wkhd);
1943	lockmgr(vp->v_vnlock, LK_RELEASE, NULL);
1944	return (error);
1945}
1946
1947/*
1948 * Associate snapshot files when mounting.
1949 */
1950void
1951ffs_snapshot_mount(mp)
1952	struct mount *mp;
1953{
1954	struct ufsmount *ump = VFSTOUFS(mp);
1955	struct vnode *devvp = ump->um_devvp;
1956	struct fs *fs = ump->um_fs;
1957	struct thread *td = curthread;
1958	struct snapdata *sn;
1959	struct vnode *vp;
1960	struct vnode *lastvp;
1961	struct inode *ip;
1962	struct uio auio;
1963	struct iovec aiov;
1964	void *snapblklist;
1965	char *reason;
1966	daddr_t snaplistsize;
1967	int error, snaploc, loc;
1968
1969	/*
1970	 * XXX The following needs to be set before ffs_truncate or
1971	 * VOP_READ can be called.
1972	 */
1973	mp->mnt_stat.f_iosize = fs->fs_bsize;
1974	/*
1975	 * Process each snapshot listed in the superblock.
1976	 */
1977	vp = NULL;
1978	lastvp = NULL;
1979	sn = NULL;
1980	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
1981		if (fs->fs_snapinum[snaploc] == 0)
1982			break;
1983		if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc],
1984		    LK_EXCLUSIVE, &vp)) != 0){
1985			printf("ffs_snapshot_mount: vget failed %d\n", error);
1986			continue;
1987		}
1988		ip = VTOI(vp);
1989		if (!IS_SNAPSHOT(ip) || ip->i_size ==
1990		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
1991			if (!IS_SNAPSHOT(ip)) {
1992				reason = "non-snapshot";
1993			} else {
1994				reason = "old format snapshot";
1995				(void)ffs_truncate(vp, (off_t)0, 0, NOCRED);
1996				(void)ffs_syncvnode(vp, MNT_WAIT, 0);
1997			}
1998			printf("ffs_snapshot_mount: %s inode %d\n",
1999			    reason, fs->fs_snapinum[snaploc]);
2000			vput(vp);
2001			vp = NULL;
2002			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
2003				if (fs->fs_snapinum[loc] == 0)
2004					break;
2005				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
2006			}
2007			fs->fs_snapinum[loc - 1] = 0;
2008			snaploc--;
2009			continue;
2010		}
2011		/*
2012		 * Acquire a lock on the snapdata structure, creating it if
2013		 * necessary.
2014		 */
2015		sn = ffs_snapdata_acquire(devvp);
2016		/*
2017		 * Change vnode to use shared snapshot lock instead of the
2018		 * original private lock.
2019		 */
2020		vp->v_vnlock = &sn->sn_lock;
2021		lockmgr(&vp->v_lock, LK_RELEASE, NULL);
2022		/*
2023		 * Link it onto the active snapshot list.
2024		 */
2025		VI_LOCK(devvp);
2026		if (ip->i_nextsnap.tqe_prev != 0)
2027			panic("ffs_snapshot_mount: %ju already on list",
2028			    (uintmax_t)ip->i_number);
2029		else
2030			TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
2031		vp->v_vflag |= VV_SYSTEM;
2032		VI_UNLOCK(devvp);
2033		VOP_UNLOCK(vp, 0);
2034		lastvp = vp;
2035	}
2036	vp = lastvp;
2037	/*
2038	 * No usable snapshots found.
2039	 */
2040	if (sn == NULL || vp == NULL)
2041		return;
2042	/*
2043	 * Allocate the space for the block hints list. We always want to
2044	 * use the list from the newest snapshot.
2045	 */
2046	auio.uio_iov = &aiov;
2047	auio.uio_iovcnt = 1;
2048	aiov.iov_base = (void *)&snaplistsize;
2049	aiov.iov_len = sizeof(snaplistsize);
2050	auio.uio_resid = aiov.iov_len;
2051	auio.uio_offset =
2052	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
2053	auio.uio_segflg = UIO_SYSSPACE;
2054	auio.uio_rw = UIO_READ;
2055	auio.uio_td = td;
2056	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2057	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
2058		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
2059		VOP_UNLOCK(vp, 0);
2060		return;
2061	}
2062	snapblklist = malloc(snaplistsize * sizeof(daddr_t),
2063	    M_UFSMNT, M_WAITOK);
2064	auio.uio_iovcnt = 1;
2065	aiov.iov_base = snapblklist;
2066	aiov.iov_len = snaplistsize * sizeof (daddr_t);
2067	auio.uio_resid = aiov.iov_len;
2068	auio.uio_offset -= sizeof(snaplistsize);
2069	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
2070		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
2071		VOP_UNLOCK(vp, 0);
2072		free(snapblklist, M_UFSMNT);
2073		return;
2074	}
2075	VOP_UNLOCK(vp, 0);
2076	VI_LOCK(devvp);
2077	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
2078	sn->sn_listsize = snaplistsize;
2079	sn->sn_blklist = (daddr_t *)snapblklist;
2080	devvp->v_vflag |= VV_COPYONWRITE;
2081	VI_UNLOCK(devvp);
2082}
2083
2084/*
2085 * Disassociate snapshot files when unmounting.
2086 */
2087void
2088ffs_snapshot_unmount(mp)
2089	struct mount *mp;
2090{
2091	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
2092	struct snapdata *sn;
2093	struct inode *xp;
2094	struct vnode *vp;
2095
2096	VI_LOCK(devvp);
2097	sn = devvp->v_rdev->si_snapdata;
2098	while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) {
2099		vp = ITOV(xp);
2100		TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap);
2101		xp->i_nextsnap.tqe_prev = 0;
2102		lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE,
2103		    VI_MTX(devvp));
2104		lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);
2105		KASSERT(vp->v_vnlock == &sn->sn_lock,
2106		("ffs_snapshot_unmount: lost lock mutation"));
2107		vp->v_vnlock = &vp->v_lock;
2108		lockmgr(&vp->v_lock, LK_RELEASE, NULL);
2109		lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
2110		if (xp->i_effnlink > 0)
2111			vrele(vp);
2112		VI_LOCK(devvp);
2113		sn = devvp->v_rdev->si_snapdata;
2114	}
2115	try_free_snapdata(devvp);
2116	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
2117}
2118
2119/*
2120 * Check the buffer block to be belong to device buffer that shall be
2121 * locked after snaplk. devvp shall be locked on entry, and will be
2122 * leaved locked upon exit.
2123 */
2124static int
2125ffs_bp_snapblk(devvp, bp)
2126	struct vnode *devvp;
2127	struct buf *bp;
2128{
2129	struct snapdata *sn;
2130	struct fs *fs;
2131	ufs2_daddr_t lbn, *snapblklist;
2132	int lower, upper, mid;
2133
2134	ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk");
2135	KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp));
2136	sn = devvp->v_rdev->si_snapdata;
2137	if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL)
2138		return (0);
2139	fs = TAILQ_FIRST(&sn->sn_head)->i_fs;
2140	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
2141	snapblklist = sn->sn_blklist;
2142	upper = sn->sn_listsize - 1;
2143	lower = 1;
2144	while (lower <= upper) {
2145		mid = (lower + upper) / 2;
2146		if (snapblklist[mid] == lbn)
2147			break;
2148		if (snapblklist[mid] < lbn)
2149			lower = mid + 1;
2150		else
2151			upper = mid - 1;
2152	}
2153	if (lower <= upper)
2154		return (1);
2155	return (0);
2156}
2157
2158void
2159ffs_bdflush(bo, bp)
2160	struct bufobj *bo;
2161	struct buf *bp;
2162{
2163	struct thread *td;
2164	struct vnode *vp, *devvp;
2165	struct buf *nbp;
2166	int bp_bdskip;
2167
2168	if (bo->bo_dirty.bv_cnt <= dirtybufthresh)
2169		return;
2170
2171	td = curthread;
2172	vp = bp->b_vp;
2173	devvp = bo->__bo_vnode;
2174	KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp));
2175
2176	VI_LOCK(devvp);
2177	bp_bdskip = ffs_bp_snapblk(devvp, bp);
2178	if (bp_bdskip)
2179		bdwriteskip++;
2180	VI_UNLOCK(devvp);
2181	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) {
2182		(void) VOP_FSYNC(vp, MNT_NOWAIT, td);
2183		altbufferflushes++;
2184	} else {
2185		BO_LOCK(bo);
2186		/*
2187		 * Try to find a buffer to flush.
2188		 */
2189		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
2190			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
2191			    BUF_LOCK(nbp,
2192				     LK_EXCLUSIVE | LK_NOWAIT, NULL))
2193				continue;
2194			if (bp == nbp)
2195				panic("bdwrite: found ourselves");
2196			BO_UNLOCK(bo);
2197			/*
2198			 * Don't countdeps with the bo lock
2199			 * held.
2200			 */
2201			if (buf_countdeps(nbp, 0)) {
2202				BO_LOCK(bo);
2203				BUF_UNLOCK(nbp);
2204				continue;
2205			}
2206			if (bp_bdskip) {
2207				VI_LOCK(devvp);
2208				if (!ffs_bp_snapblk(vp, nbp)) {
2209					VI_UNLOCK(devvp);
2210					BO_LOCK(bo);
2211					BUF_UNLOCK(nbp);
2212					continue;
2213				}
2214				VI_UNLOCK(devvp);
2215			}
2216			if (nbp->b_flags & B_CLUSTEROK) {
2217				vfs_bio_awrite(nbp);
2218			} else {
2219				bremfree(nbp);
2220				bawrite(nbp);
2221			}
2222			dirtybufferflushes++;
2223			break;
2224		}
2225		if (nbp == NULL)
2226			BO_UNLOCK(bo);
2227	}
2228}
2229
2230/*
2231 * Check for need to copy block that is about to be written,
2232 * copying the block if necessary.
2233 */
2234int
2235ffs_copyonwrite(devvp, bp)
2236	struct vnode *devvp;
2237	struct buf *bp;
2238{
2239	struct snapdata *sn;
2240	struct buf *ibp, *cbp, *savedcbp = NULL;
2241	struct thread *td = curthread;
2242	struct fs *fs;
2243	struct inode *ip;
2244	struct vnode *vp = NULL;
2245	ufs2_daddr_t lbn, blkno, *snapblklist;
2246	int lower, upper, mid, indiroff, error = 0;
2247	int launched_async_io, prev_norunningbuf;
2248	long saved_runningbufspace;
2249
2250	if (devvp != bp->b_vp && IS_SNAPSHOT(VTOI(bp->b_vp)))
2251		return (0);		/* Update on a snapshot file */
2252	if (td->td_pflags & TDP_COWINPROGRESS)
2253		panic("ffs_copyonwrite: recursive call");
2254	/*
2255	 * First check to see if it is in the preallocated list.
2256	 * By doing this check we avoid several potential deadlocks.
2257	 */
2258	VI_LOCK(devvp);
2259	sn = devvp->v_rdev->si_snapdata;
2260	if (sn == NULL ||
2261	    TAILQ_EMPTY(&sn->sn_head)) {
2262		VI_UNLOCK(devvp);
2263		return (0);		/* No snapshot */
2264	}
2265	ip = TAILQ_FIRST(&sn->sn_head);
2266	fs = ip->i_fs;
2267	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
2268	snapblklist = sn->sn_blklist;
2269	upper = sn->sn_listsize - 1;
2270	lower = 1;
2271	while (lower <= upper) {
2272		mid = (lower + upper) / 2;
2273		if (snapblklist[mid] == lbn)
2274			break;
2275		if (snapblklist[mid] < lbn)
2276			lower = mid + 1;
2277		else
2278			upper = mid - 1;
2279	}
2280	if (lower <= upper) {
2281		VI_UNLOCK(devvp);
2282		return (0);
2283	}
2284	launched_async_io = 0;
2285	prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF;
2286	/*
2287	 * Since I/O on bp isn't yet in progress and it may be blocked
2288	 * for a long time waiting on snaplk, back it out of
2289	 * runningbufspace, possibly waking other threads waiting for space.
2290	 */
2291	saved_runningbufspace = bp->b_runningbufspace;
2292	if (saved_runningbufspace != 0)
2293		runningbufwakeup(bp);
2294	/*
2295	 * Not in the precomputed list, so check the snapshots.
2296	 */
2297	while (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
2298	    VI_MTX(devvp)) != 0) {
2299		VI_LOCK(devvp);
2300		sn = devvp->v_rdev->si_snapdata;
2301		if (sn == NULL ||
2302		    TAILQ_EMPTY(&sn->sn_head)) {
2303			VI_UNLOCK(devvp);
2304			if (saved_runningbufspace != 0) {
2305				bp->b_runningbufspace = saved_runningbufspace;
2306				atomic_add_long(&runningbufspace,
2307					       bp->b_runningbufspace);
2308			}
2309			return (0);		/* Snapshot gone */
2310		}
2311	}
2312	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
2313		vp = ITOV(ip);
2314		if (DOINGSOFTDEP(vp))
2315			softdep_prealloc(vp, MNT_WAIT);
2316		/*
2317		 * We ensure that everything of our own that needs to be
2318		 * copied will be done at the time that ffs_snapshot is
2319		 * called. Thus we can skip the check here which can
2320		 * deadlock in doing the lookup in UFS_BALLOC.
2321		 */
2322		if (bp->b_vp == vp)
2323			continue;
2324		/*
2325		 * Check to see if block needs to be copied. We do not have
2326		 * to hold the snapshot lock while doing this lookup as it
2327		 * will never require any additional allocations for the
2328		 * snapshot inode.
2329		 */
2330		if (lbn < NDADDR) {
2331			blkno = DIP(ip, i_db[lbn]);
2332		} else {
2333			td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
2334			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
2335			   fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
2336			td->td_pflags &= ~TDP_COWINPROGRESS;
2337			if (error)
2338				break;
2339			indiroff = (lbn - NDADDR) % NINDIR(fs);
2340			if (ip->i_ump->um_fstype == UFS1)
2341				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
2342			else
2343				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
2344			bqrelse(ibp);
2345		}
2346#ifdef INVARIANTS
2347		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
2348			panic("ffs_copyonwrite: bad copy block");
2349#endif
2350		if (blkno != 0)
2351			continue;
2352		/*
2353		 * Allocate the block into which to do the copy. Since
2354		 * multiple processes may all try to copy the same block,
2355		 * we have to recheck our need to do a copy if we sleep
2356		 * waiting for the lock.
2357		 *
2358		 * Because all snapshots on a filesystem share a single
2359		 * lock, we ensure that we will never be in competition
2360		 * with another process to allocate a block.
2361		 */
2362		td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
2363		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
2364		    fs->fs_bsize, KERNCRED, 0, &cbp);
2365		td->td_pflags &= ~TDP_COWINPROGRESS;
2366		if (error)
2367			break;
2368#ifdef DEBUG
2369		if (snapdebug) {
2370			printf("Copyonwrite: snapino %ju lbn %jd for ",
2371			    (uintmax_t)ip->i_number, (intmax_t)lbn);
2372			if (bp->b_vp == devvp)
2373				printf("fs metadata");
2374			else
2375				printf("inum %ju",
2376				    (uintmax_t)VTOI(bp->b_vp)->i_number);
2377			printf(" lblkno %jd to blkno %jd\n",
2378			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
2379		}
2380#endif
2381		/*
2382		 * If we have already read the old block contents, then
2383		 * simply copy them to the new block. Note that we need
2384		 * to synchronously write snapshots that have not been
2385		 * unlinked, and hence will be visible after a crash,
2386		 * to ensure their integrity. At a minimum we ensure the
2387		 * integrity of the filesystem metadata, but use the
2388		 * dopersistence sysctl-setable flag to decide on the
2389		 * persistence needed for file content data.
2390		 */
2391		if (savedcbp != NULL) {
2392			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
2393			bawrite(cbp);
2394			if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR ||
2395			    dopersistence) && ip->i_effnlink > 0)
2396				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
2397			else
2398				launched_async_io = 1;
2399			continue;
2400		}
2401		/*
2402		 * Otherwise, read the old block contents into the buffer.
2403		 */
2404		if ((error = readblock(vp, cbp, lbn)) != 0) {
2405			bzero(cbp->b_data, fs->fs_bsize);
2406			bawrite(cbp);
2407			if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR ||
2408			    dopersistence) && ip->i_effnlink > 0)
2409				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
2410			else
2411				launched_async_io = 1;
2412			break;
2413		}
2414		savedcbp = cbp;
2415	}
2416	/*
2417	 * Note that we need to synchronously write snapshots that
2418	 * have not been unlinked, and hence will be visible after
2419	 * a crash, to ensure their integrity. At a minimum we
2420	 * ensure the integrity of the filesystem metadata, but
2421	 * use the dopersistence sysctl-setable flag to decide on
2422	 * the persistence needed for file content data.
2423	 */
2424	if (savedcbp) {
2425		vp = savedcbp->b_vp;
2426		bawrite(savedcbp);
2427		if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR ||
2428		    dopersistence) && VTOI(vp)->i_effnlink > 0)
2429			(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
2430		else
2431			launched_async_io = 1;
2432	}
2433	lockmgr(vp->v_vnlock, LK_RELEASE, NULL);
2434	td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) |
2435		prev_norunningbuf;
2436	if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0)
2437		waitrunningbufspace();
2438	/*
2439	 * I/O on bp will now be started, so count it in runningbufspace.
2440	 */
2441	if (saved_runningbufspace != 0) {
2442		bp->b_runningbufspace = saved_runningbufspace;
2443		atomic_add_long(&runningbufspace, bp->b_runningbufspace);
2444	}
2445	return (error);
2446}
2447
2448/*
2449 * sync snapshots to force freework records waiting on snapshots to claim
2450 * blocks to free.
2451 */
2452void
2453ffs_sync_snap(mp, waitfor)
2454	struct mount *mp;
2455	int waitfor;
2456{
2457	struct snapdata *sn;
2458	struct vnode *devvp;
2459	struct vnode *vp;
2460	struct inode *ip;
2461
2462	devvp = VFSTOUFS(mp)->um_devvp;
2463	if ((devvp->v_vflag & VV_COPYONWRITE) == 0)
2464		return;
2465	for (;;) {
2466		VI_LOCK(devvp);
2467		sn = devvp->v_rdev->si_snapdata;
2468		if (sn == NULL) {
2469			VI_UNLOCK(devvp);
2470			return;
2471		}
2472		if (lockmgr(&sn->sn_lock,
2473		    LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
2474		    VI_MTX(devvp)) == 0)
2475			break;
2476	}
2477	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
2478		vp = ITOV(ip);
2479		ffs_syncvnode(vp, waitfor, NO_INO_UPDT);
2480	}
2481	lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
2482}
2483
2484/*
2485 * Read the specified block into the given buffer.
2486 * Much of this boiler-plate comes from bwrite().
2487 */
2488static int
2489readblock(vp, bp, lbn)
2490	struct vnode *vp;
2491	struct buf *bp;
2492	ufs2_daddr_t lbn;
2493{
2494	struct inode *ip = VTOI(vp);
2495	struct bio *bip;
2496
2497	bip = g_alloc_bio();
2498	bip->bio_cmd = BIO_READ;
2499	bip->bio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
2500	bip->bio_data = bp->b_data;
2501	bip->bio_length = bp->b_bcount;
2502	bip->bio_done = NULL;
2503
2504	g_io_request(bip, ip->i_devvp->v_bufobj.bo_private);
2505	bp->b_error = biowait(bip, "snaprdb");
2506	g_destroy_bio(bip);
2507	return (bp->b_error);
2508}
2509
2510#endif
2511
2512/*
2513 * Process file deletes that were deferred by ufs_inactive() due to
2514 * the file system being suspended. Transfer IN_LAZYACCESS into
2515 * IN_MODIFIED for vnodes that were accessed during suspension.
2516 */
2517void
2518process_deferred_inactive(struct mount *mp)
2519{
2520	struct vnode *vp, *mvp;
2521	struct inode *ip;
2522	struct thread *td;
2523	int error;
2524
2525	td = curthread;
2526	(void) vn_start_secondary_write(NULL, &mp, V_WAIT);
2527 loop:
2528	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2529		/*
2530		 * IN_LAZYACCESS is checked here without holding any
2531		 * vnode lock, but this flag is set only while holding
2532		 * vnode interlock.
2533		 */
2534		if (vp->v_type == VNON ||
2535		    ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 &&
2536		    ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0))) {
2537			VI_UNLOCK(vp);
2538			continue;
2539		}
2540		vholdl(vp);
2541		error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
2542		if (error != 0) {
2543			vdrop(vp);
2544			if (error == ENOENT)
2545				continue;	/* vnode recycled */
2546			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2547			goto loop;
2548		}
2549		ip = VTOI(vp);
2550		if ((ip->i_flag & IN_LAZYACCESS) != 0) {
2551			ip->i_flag &= ~IN_LAZYACCESS;
2552			ip->i_flag |= IN_MODIFIED;
2553		}
2554		VI_LOCK(vp);
2555		if ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0) {
2556			VI_UNLOCK(vp);
2557			VOP_UNLOCK(vp, 0);
2558			vdrop(vp);
2559			continue;
2560		}
2561		vinactive(vp, td);
2562		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
2563			 ("process_deferred_inactive: got VI_OWEINACT"));
2564		VI_UNLOCK(vp);
2565		VOP_UNLOCK(vp, 0);
2566		vdrop(vp);
2567	}
2568	vn_finished_secondary_write(mp);
2569}
2570
2571#ifndef NO_FFS_SNAPSHOT
2572
2573static struct snapdata *
2574ffs_snapdata_alloc(void)
2575{
2576	struct snapdata *sn;
2577
2578	/*
2579	 * Fetch a snapdata from the free list if there is one available.
2580	 */
2581	mtx_lock(&snapfree_lock);
2582	sn = LIST_FIRST(&snapfree);
2583	if (sn != NULL)
2584		LIST_REMOVE(sn, sn_link);
2585	mtx_unlock(&snapfree_lock);
2586	if (sn != NULL)
2587		return (sn);
2588	/*
2589 	 * If there were no free snapdatas allocate one.
2590	 */
2591	sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO);
2592	TAILQ_INIT(&sn->sn_head);
2593	lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT,
2594	    LK_CANRECURSE | LK_NOSHARE);
2595	return (sn);
2596}
2597
2598/*
2599 * The snapdata is never freed because we can not be certain that
2600 * there are no threads sleeping on the snap lock.  Persisting
2601 * them permanently avoids costly synchronization in ffs_lock().
2602 */
2603static void
2604ffs_snapdata_free(struct snapdata *sn)
2605{
2606	mtx_lock(&snapfree_lock);
2607	LIST_INSERT_HEAD(&snapfree, sn, sn_link);
2608	mtx_unlock(&snapfree_lock);
2609}
2610
2611/* Try to free snapdata associated with devvp */
2612static void
2613try_free_snapdata(struct vnode *devvp)
2614{
2615	struct snapdata *sn;
2616	ufs2_daddr_t *snapblklist;
2617
2618	ASSERT_VI_LOCKED(devvp, "try_free_snapdata");
2619	sn = devvp->v_rdev->si_snapdata;
2620
2621	if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL ||
2622	    (devvp->v_vflag & VV_COPYONWRITE) == 0) {
2623		VI_UNLOCK(devvp);
2624		return;
2625	}
2626
2627	devvp->v_rdev->si_snapdata = NULL;
2628	devvp->v_vflag &= ~VV_COPYONWRITE;
2629	lockmgr(&sn->sn_lock, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp));
2630	snapblklist = sn->sn_blklist;
2631	sn->sn_blklist = NULL;
2632	sn->sn_listsize = 0;
2633	lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
2634	if (snapblklist != NULL)
2635		free(snapblklist, M_UFSMNT);
2636	ffs_snapdata_free(sn);
2637}
2638
2639static struct snapdata *
2640ffs_snapdata_acquire(struct vnode *devvp)
2641{
2642	struct snapdata *nsn;
2643	struct snapdata *sn;
2644
2645	/*
2646	 * Allocate a free snapdata.  This is done before acquiring the
2647	 * devvp lock to avoid allocation while the devvp interlock is
2648	 * held.
2649	 */
2650	nsn = ffs_snapdata_alloc();
2651	/*
2652	 * If there snapshots already exist on this filesystem grab a
2653	 * reference to the shared lock.  Otherwise this is the first
2654	 * snapshot on this filesystem and we need to use our
2655	 * pre-allocated snapdata.
2656	 */
2657	VI_LOCK(devvp);
2658	if (devvp->v_rdev->si_snapdata == NULL) {
2659		devvp->v_rdev->si_snapdata = nsn;
2660		nsn = NULL;
2661	}
2662	sn = devvp->v_rdev->si_snapdata;
2663	/*
2664	 * Acquire the snapshot lock.
2665	 */
2666	lockmgr(&sn->sn_lock,
2667	    LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, VI_MTX(devvp));
2668	/*
2669	 * Free any unused snapdata.
2670	 */
2671	if (nsn != NULL)
2672		ffs_snapdata_free(nsn);
2673
2674	return (sn);
2675}
2676
2677#endif
2678