1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
5 *
6 * Further information about snapshots can be obtained from:
7 *
8 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
9 *	1614 Oxford Street		mckusick@mckusick.com
10 *	Berkeley, CA 94709-1608		+1-510-843-9542
11 *	USA
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 *
17 * 1. Redistributions of source code must retain the above copyright
18 *    notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 *    notice, this list of conditions and the following disclaimer in the
21 *    documentation and/or other materials provided with the distribution.
22 *
23 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
24 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
25 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
26 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
27 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 *
35 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
36 */
37
38#include <sys/cdefs.h>
39__FBSDID("$FreeBSD$");
40
41#include "opt_quota.h"
42
43#include <sys/param.h>
44#include <sys/kernel.h>
45#include <sys/systm.h>
46#include <sys/conf.h>
47#include <sys/bio.h>
48#include <sys/buf.h>
49#include <sys/fcntl.h>
50#include <sys/proc.h>
51#include <sys/namei.h>
52#include <sys/sched.h>
53#include <sys/stat.h>
54#include <sys/malloc.h>
55#include <sys/mount.h>
56#include <sys/resource.h>
57#include <sys/resourcevar.h>
58#include <sys/rwlock.h>
59#include <sys/vnode.h>
60
61#include <vm/vm.h>
62#include <vm/vm_extern.h>
63
64#include <geom/geom.h>
65
66#include <ufs/ufs/extattr.h>
67#include <ufs/ufs/quota.h>
68#include <ufs/ufs/ufsmount.h>
69#include <ufs/ufs/inode.h>
70#include <ufs/ufs/ufs_extern.h>
71
72#include <ufs/ffs/fs.h>
73#include <ufs/ffs/ffs_extern.h>
74
75#define KERNCRED thread0.td_ucred
76#define DEBUG 1
77
78#include "opt_ffs.h"
79
80#ifdef NO_FFS_SNAPSHOT
81int
82ffs_snapshot(mp, snapfile)
83	struct mount *mp;
84	char *snapfile;
85{
86	return (EINVAL);
87}
88
89int
90ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd)
91	struct fs *fs;
92	struct vnode *devvp;
93	ufs2_daddr_t bno;
94	long size;
95	ino_t inum;
96	enum vtype vtype;
97	struct workhead *wkhd;
98{
99	return (EINVAL);
100}
101
102void
103ffs_snapremove(vp)
104	struct vnode *vp;
105{
106}
107
108void
109ffs_snapshot_mount(mp)
110	struct mount *mp;
111{
112}
113
114void
115ffs_snapshot_unmount(mp)
116	struct mount *mp;
117{
118}
119
120void
121ffs_snapgone(ip)
122	struct inode *ip;
123{
124}
125
126int
127ffs_copyonwrite(devvp, bp)
128	struct vnode *devvp;
129	struct buf *bp;
130{
131	return (EINVAL);
132}
133
134void
135ffs_sync_snap(mp, waitfor)
136	struct mount *mp;
137	int waitfor;
138{
139}
140
141#else
142FEATURE(ffs_snapshot, "FFS snapshot support");
143
144LIST_HEAD(, snapdata) snapfree;
145static struct mtx snapfree_lock;
146MTX_SYSINIT(ffs_snapfree, &snapfree_lock, "snapdata free list", MTX_DEF);
147
148static int cgaccount(int, struct vnode *, struct buf *, int);
149static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
150    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
151    ufs_lbn_t, int), int, int);
152static int indiracct_ufs1(struct vnode *, struct vnode *, int,
153    ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
154    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
155    ufs_lbn_t, int), int);
156static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
157    struct fs *, ufs_lbn_t, int);
158static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
159    struct fs *, ufs_lbn_t, int);
160static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
161    struct fs *, ufs_lbn_t, int);
162static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
163    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
164    ufs_lbn_t, int), int, int);
165static int indiracct_ufs2(struct vnode *, struct vnode *, int,
166    ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
167    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
168    ufs_lbn_t, int), int);
169static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
170    struct fs *, ufs_lbn_t, int);
171static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
172    struct fs *, ufs_lbn_t, int);
173static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
174    struct fs *, ufs_lbn_t, int);
175static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t);
176static void try_free_snapdata(struct vnode *devvp);
177static struct snapdata *ffs_snapdata_acquire(struct vnode *devvp);
178static int ffs_bp_snapblk(struct vnode *, struct buf *);
179
180/*
181 * To ensure the consistency of snapshots across crashes, we must
182 * synchronously write out copied blocks before allowing the
183 * originals to be modified. Because of the rather severe speed
184 * penalty that this imposes, the code normally only ensures
185 * persistence for the filesystem metadata contained within a
186 * snapshot. Setting the following flag allows this crash
187 * persistence to be enabled for file contents.
188 */
189int dopersistence = 0;
190
191#ifdef DEBUG
192#include <sys/sysctl.h>
193SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
194static int snapdebug = 0;
195SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
196int collectsnapstats = 0;
197SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
198	0, "");
199#endif /* DEBUG */
200
201/*
202 * Create a snapshot file and initialize it for the filesystem.
203 */
204int
205ffs_snapshot(mp, snapfile)
206	struct mount *mp;
207	char *snapfile;
208{
209	ufs2_daddr_t numblks, blkno, *blkp, *snapblklist;
210	int error, cg, snaploc;
211	int i, size, len, loc;
212	ufs2_daddr_t blockno;
213	uint64_t flag;
214	struct timespec starttime = {0, 0}, endtime;
215	char saved_nice = 0;
216	long redo = 0, snaplistsize = 0;
217	int32_t *lp;
218	void *space;
219	struct fs *copy_fs = NULL, *fs;
220	struct thread *td = curthread;
221	struct inode *ip, *xp;
222	struct buf *bp, *nbp, *ibp;
223	struct nameidata nd;
224	struct mount *wrtmp;
225	struct vattr vat;
226	struct vnode *vp, *xvp, *mvp, *devvp;
227	struct uio auio;
228	struct iovec aiov;
229	struct snapdata *sn;
230	struct ufsmount *ump;
231
232	ump = VFSTOUFS(mp);
233	fs = ump->um_fs;
234	sn = NULL;
235	/*
236	 * At the moment, journaled soft updates cannot support
237	 * taking snapshots.
238	 */
239	if (MOUNTEDSUJ(mp)) {
240		vfs_mount_error(mp, "%s: Snapshots are not yet supported when "
241		    "running with journaled soft updates", fs->fs_fsmnt);
242		return (EOPNOTSUPP);
243	}
244	MNT_ILOCK(mp);
245	flag = mp->mnt_flag;
246	MNT_IUNLOCK(mp);
247	/*
248	 * Need to serialize access to snapshot code per filesystem.
249	 */
250	/*
251	 * Assign a snapshot slot in the superblock.
252	 */
253	UFS_LOCK(ump);
254	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
255		if (fs->fs_snapinum[snaploc] == 0)
256			break;
257	UFS_UNLOCK(ump);
258	if (snaploc == FSMAXSNAP)
259		return (ENOSPC);
260	/*
261	 * Create the snapshot file.
262	 */
263restart:
264	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF | NOCACHE, UIO_SYSSPACE,
265	    snapfile, td);
266	if ((error = namei(&nd)) != 0)
267		return (error);
268	if (nd.ni_vp != NULL) {
269		vput(nd.ni_vp);
270		error = EEXIST;
271	}
272	if (nd.ni_dvp->v_mount != mp)
273		error = EXDEV;
274	if (error) {
275		NDFREE(&nd, NDF_ONLY_PNBUF);
276		if (nd.ni_dvp == nd.ni_vp)
277			vrele(nd.ni_dvp);
278		else
279			vput(nd.ni_dvp);
280		return (error);
281	}
282	VATTR_NULL(&vat);
283	vat.va_type = VREG;
284	vat.va_mode = S_IRUSR;
285	vat.va_vaflags |= VA_EXCLUSIVE;
286	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
287		wrtmp = NULL;
288	if (wrtmp != mp)
289		panic("ffs_snapshot: mount mismatch");
290	vfs_rel(wrtmp);
291	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
292		NDFREE(&nd, NDF_ONLY_PNBUF);
293		vput(nd.ni_dvp);
294		if ((error = vn_start_write(NULL, &wrtmp,
295		    V_XSLEEP | PCATCH)) != 0)
296			return (error);
297		goto restart;
298	}
299	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
300	VOP_UNLOCK(nd.ni_dvp, 0);
301	if (error) {
302		NDFREE(&nd, NDF_ONLY_PNBUF);
303		vn_finished_write(wrtmp);
304		vrele(nd.ni_dvp);
305		return (error);
306	}
307	vp = nd.ni_vp;
308	vnode_create_vobject(nd.ni_vp, fs->fs_size, td);
309	vp->v_vflag |= VV_SYSTEM;
310	ip = VTOI(vp);
311	devvp = ITODEVVP(ip);
312	/*
313	 * Calculate the size of the filesystem then allocate the block
314	 * immediately following the last block of the filesystem that
315	 * will contain the snapshot list. This operation allows us to
316	 * set the size of the snapshot.
317	 */
318	numblks = howmany(fs->fs_size, fs->fs_frag);
319	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)numblks),
320	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
321	if (error)
322		goto out;
323	bawrite(bp);
324	ip->i_size = lblktosize(fs, (off_t)(numblks + 1));
325	vnode_pager_setsize(vp, ip->i_size);
326	DIP_SET(ip, i_size, ip->i_size);
327	ip->i_flag |= IN_SIZEMOD | IN_CHANGE | IN_UPDATE;
328	/*
329	 * Preallocate critical data structures so that we can copy
330	 * them in without further allocation after we suspend all
331	 * operations on the filesystem. We would like to just release
332	 * the allocated buffers without writing them since they will
333	 * be filled in below once we are ready to go, but this upsets
334	 * the soft update code, so we go ahead and write the new buffers.
335	 *
336	 * Allocate all indirect blocks and mark all of them as not
337	 * needing to be copied.
338	 */
339	for (blkno = UFS_NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
340		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
341		    fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp);
342		if (error)
343			goto out;
344		bawrite(ibp);
345	}
346	/*
347	 * Allocate copies for the superblock and its summary information.
348	 */
349	error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
350	    0, &nbp);
351	if (error)
352		goto out;
353	bawrite(nbp);
354	blkno = fragstoblks(fs, fs->fs_csaddr);
355	len = howmany(fs->fs_cssize, fs->fs_bsize);
356	for (loc = 0; loc < len; loc++) {
357		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
358		    fs->fs_bsize, KERNCRED, 0, &nbp);
359		if (error)
360			goto out;
361		bawrite(nbp);
362	}
363	/*
364	 * Allocate all cylinder group blocks.
365	 */
366	for (cg = 0; cg < fs->fs_ncg; cg++) {
367		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
368		    fs->fs_bsize, KERNCRED, 0, &nbp);
369		if (error)
370			goto out;
371		bawrite(nbp);
372		if (cg % 10 == 0) {
373			error = ffs_syncvnode(vp, MNT_WAIT, 0);
374			if (error != 0)
375				goto out;
376		}
377	}
378	/*
379	 * Copy all the cylinder group maps. Although the
380	 * filesystem is still active, we hope that only a few
381	 * cylinder groups will change between now and when we
382	 * suspend operations. Thus, we will be able to quickly
383	 * touch up the few cylinder groups that changed during
384	 * the suspension period.
385	 */
386	len = roundup2(howmany(fs->fs_ncg, NBBY), sizeof(int));
387	space = malloc(len, M_DEVBUF, M_WAITOK | M_ZERO);
388	UFS_LOCK(ump);
389	fs->fs_active = space;
390	UFS_UNLOCK(ump);
391	for (cg = 0; cg < fs->fs_ncg; cg++) {
392		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
393		    fs->fs_bsize, KERNCRED, 0, &nbp);
394		if (error)
395			goto out;
396		error = cgaccount(cg, vp, nbp, 1);
397		bawrite(nbp);
398		if (cg % 10 == 0)
399			ffs_syncvnode(vp, MNT_WAIT, 0);
400		if (error)
401			goto out;
402	}
403	/*
404	 * Change inode to snapshot type file.
405	 */
406	ip->i_flags |= SF_SNAPSHOT;
407	DIP_SET(ip, i_flags, ip->i_flags);
408	ip->i_flag |= IN_CHANGE | IN_UPDATE;
409	/*
410	 * Ensure that the snapshot is completely on disk.
411	 * Since we have marked it as a snapshot it is safe to
412	 * unlock it as no process will be allowed to write to it.
413	 */
414	if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0)
415		goto out;
416	VOP_UNLOCK(vp, 0);
417	/*
418	 * All allocations are done, so we can now snapshot the system.
419	 *
420	 * Recind nice scheduling while running with the filesystem suspended.
421	 */
422	if (td->td_proc->p_nice > 0) {
423		struct proc *p;
424
425		p = td->td_proc;
426		PROC_LOCK(p);
427		saved_nice = p->p_nice;
428		sched_nice(p, 0);
429		PROC_UNLOCK(p);
430	}
431	/*
432	 * Suspend operation on filesystem.
433	 */
434	for (;;) {
435		vn_finished_write(wrtmp);
436		if ((error = vfs_write_suspend(vp->v_mount, 0)) != 0) {
437			vn_start_write(NULL, &wrtmp, V_WAIT);
438			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
439			goto out;
440		}
441		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
442			break;
443		vn_start_write(NULL, &wrtmp, V_WAIT);
444	}
445	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
446	if (ip->i_effnlink == 0) {
447		error = ENOENT;		/* Snapshot file unlinked */
448		goto resumefs;
449	}
450	if (collectsnapstats)
451		nanotime(&starttime);
452
453	/*
454	 * First, copy all the cylinder group maps that have changed.
455	 */
456	for (cg = 0; cg < fs->fs_ncg; cg++) {
457		if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
458			continue;
459		redo++;
460		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
461		    fs->fs_bsize, KERNCRED, 0, &nbp);
462		if (error)
463			goto resumefs;
464		error = cgaccount(cg, vp, nbp, 2);
465		bawrite(nbp);
466		if (error)
467			goto resumefs;
468	}
469	/*
470	 * Grab a copy of the superblock and its summary information.
471	 * We delay writing it until the suspension is released below.
472	 */
473	copy_fs = malloc((u_long)fs->fs_bsize, M_UFSMNT, M_WAITOK);
474	bcopy(fs, copy_fs, fs->fs_sbsize);
475	if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
476		copy_fs->fs_clean = 1;
477	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
478	if (fs->fs_sbsize < size)
479		bzero(&((char *)copy_fs)[fs->fs_sbsize],
480		    size - fs->fs_sbsize);
481	size = blkroundup(fs, fs->fs_cssize);
482	if (fs->fs_contigsumsize > 0)
483		size += fs->fs_ncg * sizeof(int32_t);
484	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
485	copy_fs->fs_csp = space;
486	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
487	space = (char *)space + fs->fs_cssize;
488	loc = howmany(fs->fs_cssize, fs->fs_fsize);
489	i = fs->fs_frag - loc % fs->fs_frag;
490	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
491	if (len > 0) {
492		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
493		    len, KERNCRED, &bp)) != 0) {
494			brelse(bp);
495			goto resumefs;
496		}
497		bcopy(bp->b_data, space, (u_int)len);
498		space = (char *)space + len;
499		bp->b_flags |= B_INVAL | B_NOCACHE;
500		brelse(bp);
501	}
502	if (fs->fs_contigsumsize > 0) {
503		copy_fs->fs_maxcluster = lp = space;
504		for (i = 0; i < fs->fs_ncg; i++)
505			*lp++ = fs->fs_contigsumsize;
506	}
507	/*
508	 * We must check for active files that have been unlinked
509	 * (e.g., with a zero link count). We have to expunge all
510	 * trace of these files from the snapshot so that they are
511	 * not reclaimed prematurely by fsck or unnecessarily dumped.
512	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
513	 * spec_strategy about writing on a suspended filesystem.
514	 * Note that we skip unlinked snapshot files as they will
515	 * be handled separately below.
516	 *
517	 * We also calculate the size needed for the snapshot list.
518	 * Initial number of entries is composed of:
519	 * - one for each cylinder group map
520	 * - one for each block used by superblock summary table
521	 * - one for each snapshot inode block
522	 * - one for the superblock
523	 * - one for the snapshot list
524	 * The direct block entries in the snapshot are always
525	 * copied (see reason below). Note that the superblock and
526	 * the first cylinder group will almost always be allocated
527	 * in the direct blocks, but we add the slop for them in case
528	 * they do not end up there. The snapshot list size may get
529	 * expanded by one because of an update of an inode block for
530	 * an unlinked but still open file when it is expunged.
531	 *
532	 * Because the direct block pointers are always copied, they
533	 * are not added to the list. Instead ffs_copyonwrite()
534	 * explicitly checks for them before checking the snapshot list.
535	 */
536	snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
537	    FSMAXSNAP + /* superblock */ 1 + /* snaplist */ 1;
538	MNT_ILOCK(mp);
539	mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
540	MNT_IUNLOCK(mp);
541loop:
542	MNT_VNODE_FOREACH_ALL(xvp, mp, mvp) {
543		if ((xvp->v_usecount == 0 &&
544		     (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) ||
545		    xvp->v_type == VNON ||
546		    IS_SNAPSHOT(VTOI(xvp))) {
547			VI_UNLOCK(xvp);
548			continue;
549		}
550		/*
551		 * We can skip parent directory vnode because it must have
552		 * this snapshot file in it.
553		 */
554		if (xvp == nd.ni_dvp) {
555			VI_UNLOCK(xvp);
556			continue;
557		}
558		vholdl(xvp);
559		if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) {
560			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
561			vdrop(xvp);
562			goto loop;
563		}
564		VI_LOCK(xvp);
565		if (xvp->v_usecount == 0 &&
566		    (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) {
567			VI_UNLOCK(xvp);
568			VOP_UNLOCK(xvp, 0);
569			vdrop(xvp);
570			continue;
571		}
572		VI_UNLOCK(xvp);
573		if (snapdebug)
574			vn_printf(xvp, "ffs_snapshot: busy vnode ");
575		if (VOP_GETATTR(xvp, &vat, td->td_ucred) == 0 &&
576		    vat.va_nlink > 0) {
577			VOP_UNLOCK(xvp, 0);
578			vdrop(xvp);
579			continue;
580		}
581		xp = VTOI(xvp);
582		if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
583			VOP_UNLOCK(xvp, 0);
584			vdrop(xvp);
585			continue;
586		}
587		/*
588		 * If there is a fragment, clear it here.
589		 */
590		blkno = 0;
591		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
592		if (loc < UFS_NDADDR) {
593			len = fragroundup(fs, blkoff(fs, xp->i_size));
594			if (len != 0 && len < fs->fs_bsize) {
595				ffs_blkfree(ump, copy_fs, vp,
596				    DIP(xp, i_db[loc]), len, xp->i_number,
597				    xvp->v_type, NULL, SINGLETON_KEY);
598				blkno = DIP(xp, i_db[loc]);
599				DIP_SET(xp, i_db[loc], 0);
600			}
601		}
602		snaplistsize += 1;
603		if (I_IS_UFS1(xp))
604			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
605			    BLK_NOCOPY, 1);
606		else
607			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
608			    BLK_NOCOPY, 1);
609		if (blkno)
610			DIP_SET(xp, i_db[loc], blkno);
611		if (!error)
612			error = ffs_freefile(ump, copy_fs, vp, xp->i_number,
613			    xp->i_mode, NULL);
614		VOP_UNLOCK(xvp, 0);
615		vdrop(xvp);
616		if (error) {
617			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
618			goto resumefs;
619		}
620	}
621	/*
622	 * Erase the journal file from the snapshot.
623	 */
624	if (fs->fs_flags & FS_SUJ) {
625		error = softdep_journal_lookup(mp, &xvp);
626		if (error)
627			goto resumefs;
628		xp = VTOI(xvp);
629		if (I_IS_UFS1(xp))
630			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
631			    BLK_NOCOPY, 0);
632		else
633			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
634			    BLK_NOCOPY, 0);
635		vput(xvp);
636	}
637	/*
638	 * Acquire a lock on the snapdata structure, creating it if necessary.
639	 */
640	sn = ffs_snapdata_acquire(devvp);
641	/*
642	 * Change vnode to use shared snapshot lock instead of the original
643	 * private lock.
644	 */
645	vp->v_vnlock = &sn->sn_lock;
646	lockmgr(&vp->v_lock, LK_RELEASE, NULL);
647	xp = TAILQ_FIRST(&sn->sn_head);
648	/*
649	 * If this is the first snapshot on this filesystem, then we need
650	 * to allocate the space for the list of preallocated snapshot blocks.
651	 * This list will be refined below, but this preliminary one will
652	 * keep us out of deadlock until the full one is ready.
653	 */
654	if (xp == NULL) {
655		snapblklist = malloc(snaplistsize * sizeof(daddr_t),
656		    M_UFSMNT, M_WAITOK);
657		blkp = &snapblklist[1];
658		*blkp++ = lblkno(fs, fs->fs_sblockloc);
659		blkno = fragstoblks(fs, fs->fs_csaddr);
660		for (cg = 0; cg < fs->fs_ncg; cg++) {
661			if (fragstoblks(fs, cgtod(fs, cg) > blkno))
662				break;
663			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
664		}
665		len = howmany(fs->fs_cssize, fs->fs_bsize);
666		for (loc = 0; loc < len; loc++)
667			*blkp++ = blkno + loc;
668		for (; cg < fs->fs_ncg; cg++)
669			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
670		snapblklist[0] = blkp - snapblklist;
671		VI_LOCK(devvp);
672		if (sn->sn_blklist != NULL)
673			panic("ffs_snapshot: non-empty list");
674		sn->sn_blklist = snapblklist;
675		sn->sn_listsize = blkp - snapblklist;
676		VI_UNLOCK(devvp);
677	}
678	/*
679	 * Preallocate all the direct blocks in the snapshot inode so
680	 * that we never have to write the inode itself to commit an
681	 * update to the contents of the snapshot. Note that once
682	 * created, the size of the snapshot will never change, so
683	 * there will never be a need to write the inode except to
684	 * update the non-integrity-critical time fields and
685	 * allocated-block count.
686	 */
687	for (blockno = 0; blockno < UFS_NDADDR; blockno++) {
688		if (DIP(ip, i_db[blockno]) != 0)
689			continue;
690		error = UFS_BALLOC(vp, lblktosize(fs, blockno),
691		    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
692		if (error)
693			goto resumefs;
694		error = readblock(vp, bp, blockno);
695		bawrite(bp);
696		if (error != 0)
697			goto resumefs;
698	}
699	/*
700	 * Record snapshot inode. Since this is the newest snapshot,
701	 * it must be placed at the end of the list.
702	 */
703	VI_LOCK(devvp);
704	fs->fs_snapinum[snaploc] = ip->i_number;
705	if (ip->i_nextsnap.tqe_prev != 0)
706		panic("ffs_snapshot: %ju already on list",
707		    (uintmax_t)ip->i_number);
708	TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
709	devvp->v_vflag |= VV_COPYONWRITE;
710	VI_UNLOCK(devvp);
711resumefs:
712	ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
713	if (error != 0 && copy_fs != NULL) {
714		free(copy_fs->fs_csp, M_UFSMNT);
715		free(copy_fs, M_UFSMNT);
716		copy_fs = NULL;
717	}
718	KASSERT(error != 0 || (sn != NULL && copy_fs != NULL),
719		("missing snapshot setup parameters"));
720	/*
721	 * Resume operation on filesystem.
722	 */
723	vfs_write_resume(vp->v_mount, VR_START_WRITE | VR_NO_SUSPCLR);
724	if (collectsnapstats && starttime.tv_sec > 0) {
725		nanotime(&endtime);
726		timespecsub(&endtime, &starttime, &endtime);
727		printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
728		    vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec,
729		    endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
730	}
731	if (copy_fs == NULL)
732		goto out;
733	/*
734	 * Copy allocation information from all the snapshots in
735	 * this snapshot and then expunge them from its view.
736	 */
737	TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) {
738		if (xp == ip)
739			break;
740		if (I_IS_UFS1(xp))
741			error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
742			    BLK_SNAP, 0);
743		else
744			error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
745			    BLK_SNAP, 0);
746		if (error == 0 && xp->i_effnlink == 0) {
747			error = ffs_freefile(ump,
748					     copy_fs,
749					     vp,
750					     xp->i_number,
751					     xp->i_mode, NULL);
752		}
753		if (error) {
754			fs->fs_snapinum[snaploc] = 0;
755			goto done;
756		}
757	}
758	/*
759	 * Allocate space for the full list of preallocated snapshot blocks.
760	 */
761	snapblklist = malloc(snaplistsize * sizeof(daddr_t),
762	    M_UFSMNT, M_WAITOK);
763	ip->i_snapblklist = &snapblklist[1];
764	/*
765	 * Expunge the blocks used by the snapshots from the set of
766	 * blocks marked as used in the snapshot bitmaps. Also, collect
767	 * the list of allocated blocks in i_snapblklist.
768	 */
769	if (I_IS_UFS1(ip))
770		error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1,
771		    BLK_SNAP, 0);
772	else
773		error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2,
774		    BLK_SNAP, 0);
775	if (error) {
776		fs->fs_snapinum[snaploc] = 0;
777		free(snapblklist, M_UFSMNT);
778		goto done;
779	}
780	if (snaplistsize < ip->i_snapblklist - snapblklist)
781		panic("ffs_snapshot: list too small");
782	snaplistsize = ip->i_snapblklist - snapblklist;
783	snapblklist[0] = snaplistsize;
784	ip->i_snapblklist = 0;
785	/*
786	 * Write out the list of allocated blocks to the end of the snapshot.
787	 */
788	auio.uio_iov = &aiov;
789	auio.uio_iovcnt = 1;
790	aiov.iov_base = (void *)snapblklist;
791	aiov.iov_len = snaplistsize * sizeof(daddr_t);
792	auio.uio_resid = aiov.iov_len;
793	auio.uio_offset = lblktosize(fs, (off_t)numblks);
794	auio.uio_segflg = UIO_SYSSPACE;
795	auio.uio_rw = UIO_WRITE;
796	auio.uio_td = td;
797	if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
798		fs->fs_snapinum[snaploc] = 0;
799		free(snapblklist, M_UFSMNT);
800		goto done;
801	}
802	/*
803	 * Write the superblock and its summary information
804	 * to the snapshot.
805	 */
806	blkno = fragstoblks(fs, fs->fs_csaddr);
807	len = howmany(fs->fs_cssize, fs->fs_bsize);
808	space = copy_fs->fs_csp;
809	for (loc = 0; loc < len; loc++) {
810		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
811		if (error) {
812			fs->fs_snapinum[snaploc] = 0;
813			free(snapblklist, M_UFSMNT);
814			goto done;
815		}
816		bcopy(space, nbp->b_data, fs->fs_bsize);
817		space = (char *)space + fs->fs_bsize;
818		bawrite(nbp);
819	}
820	error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize,
821	    KERNCRED, &nbp);
822	if (error) {
823		brelse(nbp);
824	} else {
825		loc = blkoff(fs, fs->fs_sblockloc);
826		bcopy((char *)copy_fs, &nbp->b_data[loc], (u_int)fs->fs_sbsize);
827		bawrite(nbp);
828	}
829	/*
830	 * As this is the newest list, it is the most inclusive, so
831	 * should replace the previous list.
832	 */
833	VI_LOCK(devvp);
834	space = sn->sn_blklist;
835	sn->sn_blklist = snapblklist;
836	sn->sn_listsize = snaplistsize;
837	VI_UNLOCK(devvp);
838	if (space != NULL)
839		free(space, M_UFSMNT);
840done:
841	free(copy_fs->fs_csp, M_UFSMNT);
842	free(copy_fs, M_UFSMNT);
843	copy_fs = NULL;
844out:
845	NDFREE(&nd, NDF_ONLY_PNBUF);
846	if (saved_nice > 0) {
847		struct proc *p;
848
849		p = td->td_proc;
850		PROC_LOCK(p);
851		sched_nice(td->td_proc, saved_nice);
852		PROC_UNLOCK(td->td_proc);
853	}
854	UFS_LOCK(ump);
855	if (fs->fs_active != 0) {
856		free(fs->fs_active, M_DEVBUF);
857		fs->fs_active = 0;
858	}
859	UFS_UNLOCK(ump);
860	MNT_ILOCK(mp);
861	mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
862	MNT_IUNLOCK(mp);
863	if (error)
864		(void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
865	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
866	if (error)
867		vput(vp);
868	else
869		VOP_UNLOCK(vp, 0);
870	vrele(nd.ni_dvp);
871	vn_finished_write(wrtmp);
872	process_deferred_inactive(mp);
873	return (error);
874}
875
876/*
877 * Copy a cylinder group map. All the unallocated blocks are marked
878 * BLK_NOCOPY so that the snapshot knows that it need not copy them
879 * if they are later written. If passno is one, then this is a first
880 * pass, so only setting needs to be done. If passno is 2, then this
881 * is a revision to a previous pass which must be undone as the
882 * replacement pass is done.
883 */
884static int
885cgaccount(cg, vp, nbp, passno)
886	int cg;
887	struct vnode *vp;
888	struct buf *nbp;
889	int passno;
890{
891	struct buf *bp, *ibp;
892	struct inode *ip;
893	struct cg *cgp;
894	struct fs *fs;
895	ufs2_daddr_t base, numblks;
896	int error, len, loc, indiroff;
897
898	ip = VTOI(vp);
899	fs = ITOFS(ip);
900	if ((error = ffs_getcg(fs, ITODEVVP(ip), cg, &bp, &cgp)) != 0)
901		return (error);
902	UFS_LOCK(ITOUMP(ip));
903	ACTIVESET(fs, cg);
904	/*
905	 * Recomputation of summary information might not have been performed
906	 * at mount time.  Sync up summary information for current cylinder
907	 * group while data is in memory to ensure that result of background
908	 * fsck is slightly more consistent.
909	 */
910	fs->fs_cs(fs, cg) = cgp->cg_cs;
911	UFS_UNLOCK(ITOUMP(ip));
912	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
913	if (fs->fs_cgsize < fs->fs_bsize)
914		bzero(&nbp->b_data[fs->fs_cgsize],
915		    fs->fs_bsize - fs->fs_cgsize);
916	cgp = (struct cg *)nbp->b_data;
917	bqrelse(bp);
918	if (passno == 2)
919		nbp->b_flags |= B_VALIDSUSPWRT;
920	numblks = howmany(fs->fs_size, fs->fs_frag);
921	len = howmany(fs->fs_fpg, fs->fs_frag);
922	base = cgbase(fs, cg) / fs->fs_frag;
923	if (base + len >= numblks)
924		len = numblks - base - 1;
925	loc = 0;
926	if (base < UFS_NDADDR) {
927		for ( ; loc < UFS_NDADDR; loc++) {
928			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
929				DIP_SET(ip, i_db[loc], BLK_NOCOPY);
930			else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY)
931				DIP_SET(ip, i_db[loc], 0);
932			else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY)
933				panic("ffs_snapshot: lost direct block");
934		}
935	}
936	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
937	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
938	if (error) {
939		goto out;
940	}
941	indiroff = (base + loc - UFS_NDADDR) % NINDIR(fs);
942	for ( ; loc < len; loc++, indiroff++) {
943		if (indiroff >= NINDIR(fs)) {
944			if (passno == 2)
945				ibp->b_flags |= B_VALIDSUSPWRT;
946			bawrite(ibp);
947			error = UFS_BALLOC(vp,
948			    lblktosize(fs, (off_t)(base + loc)),
949			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
950			if (error) {
951				goto out;
952			}
953			indiroff = 0;
954		}
955		if (I_IS_UFS1(ip)) {
956			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
957				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
958				    BLK_NOCOPY;
959			else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
960			    [indiroff] == BLK_NOCOPY)
961				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
962			else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
963			    [indiroff] == BLK_NOCOPY)
964				panic("ffs_snapshot: lost indirect block");
965			continue;
966		}
967		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
968			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
969		else if (passno == 2 &&
970		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
971			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
972		else if (passno == 1 &&
973		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
974			panic("ffs_snapshot: lost indirect block");
975	}
976	if (passno == 2)
977		ibp->b_flags |= B_VALIDSUSPWRT;
978	bdwrite(ibp);
979out:
980	/*
981	 * We have to calculate the crc32c here rather than just setting the
982	 * BX_CYLGRP b_xflags because the allocation of the block for the
983	 * the cylinder group map will always be a full size block (fs_bsize)
984	 * even though the cylinder group may be smaller (fs_cgsize). The
985	 * crc32c must be computed only over fs_cgsize whereas the BX_CYLGRP
986	 * flag causes it to be computed over the size of the buffer.
987	 */
988	if ((fs->fs_metackhash & CK_CYLGRP) != 0) {
989		((struct cg *)nbp->b_data)->cg_ckhash = 0;
990		((struct cg *)nbp->b_data)->cg_ckhash =
991		    calculate_crc32c(~0L, nbp->b_data, fs->fs_cgsize);
992	}
993	return (error);
994}
995
996/*
997 * Before expunging a snapshot inode, note all the
998 * blocks that it claims with BLK_SNAP so that fsck will
999 * be able to account for those blocks properly and so
1000 * that this snapshot knows that it need not copy them
1001 * if the other snapshot holding them is freed. This code
1002 * is reproduced once each for UFS1 and UFS2.
1003 */
1004static int
1005expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype, clearmode)
1006	struct vnode *snapvp;
1007	struct inode *cancelip;
1008	struct fs *fs;
1009	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
1010	    struct fs *, ufs_lbn_t, int);
1011	int expungetype;
1012	int clearmode;
1013{
1014	int i, error, indiroff;
1015	ufs_lbn_t lbn, rlbn;
1016	ufs2_daddr_t len, blkno, numblks, blksperindir;
1017	struct ufs1_dinode *dip;
1018	struct thread *td = curthread;
1019	struct buf *bp;
1020
1021	/*
1022	 * Prepare to expunge the inode. If its inode block has not
1023	 * yet been copied, then allocate and fill the copy.
1024	 */
1025	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
1026	blkno = 0;
1027	if (lbn < UFS_NDADDR) {
1028		blkno = VTOI(snapvp)->i_din1->di_db[lbn];
1029	} else {
1030		if (DOINGSOFTDEP(snapvp))
1031			softdep_prealloc(snapvp, MNT_WAIT);
1032		td->td_pflags |= TDP_COWINPROGRESS;
1033		error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
1034		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
1035		td->td_pflags &= ~TDP_COWINPROGRESS;
1036		if (error)
1037			return (error);
1038		indiroff = (lbn - UFS_NDADDR) % NINDIR(fs);
1039		blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
1040		bqrelse(bp);
1041	}
1042	if (blkno != 0) {
1043		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
1044			return (error);
1045	} else {
1046		error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
1047		    fs->fs_bsize, KERNCRED, 0, &bp);
1048		if (error)
1049			return (error);
1050		if ((error = readblock(snapvp, bp, lbn)) != 0)
1051			return (error);
1052	}
1053	/*
1054	 * Set a snapshot inode to be a zero length file, regular files
1055	 * or unlinked snapshots to be completely unallocated.
1056	 */
1057	dip = (struct ufs1_dinode *)bp->b_data +
1058	    ino_to_fsbo(fs, cancelip->i_number);
1059	if (clearmode || cancelip->i_effnlink == 0)
1060		dip->di_mode = 0;
1061	dip->di_size = 0;
1062	dip->di_blocks = 0;
1063	dip->di_flags &= ~SF_SNAPSHOT;
1064	bzero(&dip->di_db[0], (UFS_NDADDR + UFS_NIADDR) * sizeof(ufs1_daddr_t));
1065	bdwrite(bp);
1066	/*
1067	 * Now go through and expunge all the blocks in the file
1068	 * using the function requested.
1069	 */
1070	numblks = howmany(cancelip->i_size, fs->fs_bsize);
1071	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
1072	    &cancelip->i_din1->di_db[UFS_NDADDR], fs, 0, expungetype)))
1073		return (error);
1074	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
1075	    &cancelip->i_din1->di_ib[UFS_NIADDR], fs, -1, expungetype)))
1076		return (error);
1077	blksperindir = 1;
1078	lbn = -UFS_NDADDR;
1079	len = numblks - UFS_NDADDR;
1080	rlbn = UFS_NDADDR;
1081	for (i = 0; len > 0 && i < UFS_NIADDR; i++) {
1082		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
1083		    cancelip->i_din1->di_ib[i], lbn, rlbn, len,
1084		    blksperindir, fs, acctfunc, expungetype);
1085		if (error)
1086			return (error);
1087		blksperindir *= NINDIR(fs);
1088		lbn -= blksperindir + 1;
1089		len -= blksperindir;
1090		rlbn += blksperindir;
1091	}
1092	return (0);
1093}
1094
1095/*
1096 * Descend an indirect block chain for vnode cancelvp accounting for all
1097 * its indirect blocks in snapvp.
1098 */
1099static int
1100indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
1101	    blksperindir, fs, acctfunc, expungetype)
1102	struct vnode *snapvp;
1103	struct vnode *cancelvp;
1104	int level;
1105	ufs1_daddr_t blkno;
1106	ufs_lbn_t lbn;
1107	ufs_lbn_t rlbn;
1108	ufs_lbn_t remblks;
1109	ufs_lbn_t blksperindir;
1110	struct fs *fs;
1111	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
1112	    struct fs *, ufs_lbn_t, int);
1113	int expungetype;
1114{
1115	int error, num, i;
1116	ufs_lbn_t subblksperindir;
1117	struct indir indirs[UFS_NIADDR + 2];
1118	ufs1_daddr_t last, *bap;
1119	struct buf *bp;
1120
1121	if (blkno == 0) {
1122		if (expungetype == BLK_NOCOPY)
1123			return (0);
1124		panic("indiracct_ufs1: missing indir");
1125	}
1126	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
1127		return (error);
1128	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
1129		panic("indiracct_ufs1: botched params");
1130	/*
1131	 * We have to expand bread here since it will deadlock looking
1132	 * up the block number for any blocks that are not in the cache.
1133	 */
1134	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
1135	bp->b_blkno = fsbtodb(fs, blkno);
1136	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
1137	    (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
1138		brelse(bp);
1139		return (error);
1140	}
1141	/*
1142	 * Account for the block pointers in this indirect block.
1143	 */
1144	last = howmany(remblks, blksperindir);
1145	if (last > NINDIR(fs))
1146		last = NINDIR(fs);
1147	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK);
1148	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
1149	bqrelse(bp);
1150	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
1151	    level == 0 ? rlbn : -1, expungetype);
1152	if (error || level == 0)
1153		goto out;
1154	/*
1155	 * Account for the block pointers in each of the indirect blocks
1156	 * in the levels below us.
1157	 */
1158	subblksperindir = blksperindir / NINDIR(fs);
1159	for (lbn++, level--, i = 0; i < last; i++) {
1160		error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
1161		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
1162		if (error)
1163			goto out;
1164		rlbn += blksperindir;
1165		lbn -= blksperindir;
1166		remblks -= blksperindir;
1167	}
1168out:
1169	free(bap, M_DEVBUF);
1170	return (error);
1171}
1172
1173/*
1174 * Do both snap accounting and map accounting.
1175 */
1176static int
1177fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
1178	struct vnode *vp;
1179	ufs1_daddr_t *oldblkp, *lastblkp;
1180	struct fs *fs;
1181	ufs_lbn_t lblkno;
1182	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
1183{
1184	int error;
1185
1186	if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
1187		return (error);
1188	return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
1189}
1190
1191/*
1192 * Identify a set of blocks allocated in a snapshot inode.
1193 */
1194static int
1195snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
1196	struct vnode *vp;
1197	ufs1_daddr_t *oldblkp, *lastblkp;
1198	struct fs *fs;
1199	ufs_lbn_t lblkno;
1200	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
1201{
1202	struct inode *ip = VTOI(vp);
1203	ufs1_daddr_t blkno, *blkp;
1204	ufs_lbn_t lbn;
1205	struct buf *ibp;
1206	int error;
1207
1208	for ( ; oldblkp < lastblkp; oldblkp++) {
1209		blkno = *oldblkp;
1210		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
1211			continue;
1212		lbn = fragstoblks(fs, blkno);
1213		if (lbn < UFS_NDADDR) {
1214			blkp = &ip->i_din1->di_db[lbn];
1215			ip->i_flag |= IN_CHANGE | IN_UPDATE;
1216		} else {
1217			error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn),
1218			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
1219			if (error)
1220				return (error);
1221			blkp = &((ufs1_daddr_t *)(ibp->b_data))
1222			    [(lbn - UFS_NDADDR) % NINDIR(fs)];
1223		}
1224		/*
1225		 * If we are expunging a snapshot vnode and we
1226		 * find a block marked BLK_NOCOPY, then it is
1227		 * one that has been allocated to this snapshot after
1228		 * we took our current snapshot and can be ignored.
1229		 */
1230		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
1231			if (lbn >= UFS_NDADDR)
1232				brelse(ibp);
1233		} else {
1234			if (*blkp != 0)
1235				panic("snapacct_ufs1: bad block");
1236			*blkp = expungetype;
1237			if (lbn >= UFS_NDADDR)
1238				bdwrite(ibp);
1239		}
1240	}
1241	return (0);
1242}
1243
1244/*
1245 * Account for a set of blocks allocated in a snapshot inode.
1246 */
1247static int
1248mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
1249	struct vnode *vp;
1250	ufs1_daddr_t *oldblkp, *lastblkp;
1251	struct fs *fs;
1252	ufs_lbn_t lblkno;
1253	int expungetype;
1254{
1255	ufs1_daddr_t blkno;
1256	struct inode *ip;
1257	ino_t inum;
1258	int acctit;
1259
1260	ip = VTOI(vp);
1261	inum = ip->i_number;
1262	if (lblkno == -1)
1263		acctit = 0;
1264	else
1265		acctit = 1;
1266	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
1267		blkno = *oldblkp;
1268		if (blkno == 0 || blkno == BLK_NOCOPY)
1269			continue;
1270		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
1271			*ip->i_snapblklist++ = lblkno;
1272		if (blkno == BLK_SNAP)
1273			blkno = blkstofrags(fs, lblkno);
1274		ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum,
1275		    vp->v_type, NULL, SINGLETON_KEY);
1276	}
1277	return (0);
1278}
1279
1280/*
1281 * Before expunging a snapshot inode, note all the
1282 * blocks that it claims with BLK_SNAP so that fsck will
1283 * be able to account for those blocks properly and so
1284 * that this snapshot knows that it need not copy them
1285 * if the other snapshot holding them is freed. This code
1286 * is reproduced once each for UFS1 and UFS2.
1287 */
1288static int
1289expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype, clearmode)
1290	struct vnode *snapvp;
1291	struct inode *cancelip;
1292	struct fs *fs;
1293	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
1294	    struct fs *, ufs_lbn_t, int);
1295	int expungetype;
1296	int clearmode;
1297{
1298	int i, error, indiroff;
1299	ufs_lbn_t lbn, rlbn;
1300	ufs2_daddr_t len, blkno, numblks, blksperindir;
1301	struct ufs2_dinode *dip;
1302	struct thread *td = curthread;
1303	struct buf *bp;
1304
1305	/*
1306	 * Prepare to expunge the inode. If its inode block has not
1307	 * yet been copied, then allocate and fill the copy.
1308	 */
1309	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
1310	blkno = 0;
1311	if (lbn < UFS_NDADDR) {
1312		blkno = VTOI(snapvp)->i_din2->di_db[lbn];
1313	} else {
1314		if (DOINGSOFTDEP(snapvp))
1315			softdep_prealloc(snapvp, MNT_WAIT);
1316		td->td_pflags |= TDP_COWINPROGRESS;
1317		error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
1318		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
1319		td->td_pflags &= ~TDP_COWINPROGRESS;
1320		if (error)
1321			return (error);
1322		indiroff = (lbn - UFS_NDADDR) % NINDIR(fs);
1323		blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
1324		bqrelse(bp);
1325	}
1326	if (blkno != 0) {
1327		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
1328			return (error);
1329	} else {
1330		error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
1331		    fs->fs_bsize, KERNCRED, 0, &bp);
1332		if (error)
1333			return (error);
1334		if ((error = readblock(snapvp, bp, lbn)) != 0)
1335			return (error);
1336	}
1337	/*
1338	 * Set a snapshot inode to be a zero length file, regular files
1339	 * to be completely unallocated.
1340	 */
1341	dip = (struct ufs2_dinode *)bp->b_data +
1342	    ino_to_fsbo(fs, cancelip->i_number);
1343	if (clearmode || cancelip->i_effnlink == 0)
1344		dip->di_mode = 0;
1345	dip->di_size = 0;
1346	dip->di_blocks = 0;
1347	dip->di_flags &= ~SF_SNAPSHOT;
1348	bzero(&dip->di_db[0], (UFS_NDADDR + UFS_NIADDR) * sizeof(ufs2_daddr_t));
1349	bdwrite(bp);
1350	/*
1351	 * Now go through and expunge all the blocks in the file
1352	 * using the function requested.
1353	 */
1354	numblks = howmany(cancelip->i_size, fs->fs_bsize);
1355	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
1356	    &cancelip->i_din2->di_db[UFS_NDADDR], fs, 0, expungetype)))
1357		return (error);
1358	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0],
1359	    &cancelip->i_din2->di_ib[UFS_NIADDR], fs, -1, expungetype)))
1360		return (error);
1361	blksperindir = 1;
1362	lbn = -UFS_NDADDR;
1363	len = numblks - UFS_NDADDR;
1364	rlbn = UFS_NDADDR;
1365	for (i = 0; len > 0 && i < UFS_NIADDR; i++) {
1366		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
1367		    cancelip->i_din2->di_ib[i], lbn, rlbn, len,
1368		    blksperindir, fs, acctfunc, expungetype);
1369		if (error)
1370			return (error);
1371		blksperindir *= NINDIR(fs);
1372		lbn -= blksperindir + 1;
1373		len -= blksperindir;
1374		rlbn += blksperindir;
1375	}
1376	return (0);
1377}
1378
1379/*
1380 * Descend an indirect block chain for vnode cancelvp accounting for all
1381 * its indirect blocks in snapvp.
1382 */
1383static int
1384indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
1385	    blksperindir, fs, acctfunc, expungetype)
1386	struct vnode *snapvp;
1387	struct vnode *cancelvp;
1388	int level;
1389	ufs2_daddr_t blkno;
1390	ufs_lbn_t lbn;
1391	ufs_lbn_t rlbn;
1392	ufs_lbn_t remblks;
1393	ufs_lbn_t blksperindir;
1394	struct fs *fs;
1395	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
1396	    struct fs *, ufs_lbn_t, int);
1397	int expungetype;
1398{
1399	int error, num, i;
1400	ufs_lbn_t subblksperindir;
1401	struct indir indirs[UFS_NIADDR + 2];
1402	ufs2_daddr_t last, *bap;
1403	struct buf *bp;
1404
1405	if (blkno == 0) {
1406		if (expungetype == BLK_NOCOPY)
1407			return (0);
1408		panic("indiracct_ufs2: missing indir");
1409	}
1410	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
1411		return (error);
1412	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
1413		panic("indiracct_ufs2: botched params");
1414	/*
1415	 * We have to expand bread here since it will deadlock looking
1416	 * up the block number for any blocks that are not in the cache.
1417	 */
1418	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
1419	bp->b_blkno = fsbtodb(fs, blkno);
1420	if ((bp->b_flags & B_CACHE) == 0 &&
1421	    (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
1422		brelse(bp);
1423		return (error);
1424	}
1425	/*
1426	 * Account for the block pointers in this indirect block.
1427	 */
1428	last = howmany(remblks, blksperindir);
1429	if (last > NINDIR(fs))
1430		last = NINDIR(fs);
1431	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK);
1432	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
1433	bqrelse(bp);
1434	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
1435	    level == 0 ? rlbn : -1, expungetype);
1436	if (error || level == 0)
1437		goto out;
1438	/*
1439	 * Account for the block pointers in each of the indirect blocks
1440	 * in the levels below us.
1441	 */
1442	subblksperindir = blksperindir / NINDIR(fs);
1443	for (lbn++, level--, i = 0; i < last; i++) {
1444		error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
1445		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
1446		if (error)
1447			goto out;
1448		rlbn += blksperindir;
1449		lbn -= blksperindir;
1450		remblks -= blksperindir;
1451	}
1452out:
1453	free(bap, M_DEVBUF);
1454	return (error);
1455}
1456
1457/*
1458 * Do both snap accounting and map accounting.
1459 */
1460static int
1461fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
1462	struct vnode *vp;
1463	ufs2_daddr_t *oldblkp, *lastblkp;
1464	struct fs *fs;
1465	ufs_lbn_t lblkno;
1466	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
1467{
1468	int error;
1469
1470	if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
1471		return (error);
1472	return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
1473}
1474
1475/*
1476 * Identify a set of blocks allocated in a snapshot inode.
1477 */
1478static int
1479snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
1480	struct vnode *vp;
1481	ufs2_daddr_t *oldblkp, *lastblkp;
1482	struct fs *fs;
1483	ufs_lbn_t lblkno;
1484	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
1485{
1486	struct inode *ip = VTOI(vp);
1487	ufs2_daddr_t blkno, *blkp;
1488	ufs_lbn_t lbn;
1489	struct buf *ibp;
1490	int error;
1491
1492	for ( ; oldblkp < lastblkp; oldblkp++) {
1493		blkno = *oldblkp;
1494		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
1495			continue;
1496		lbn = fragstoblks(fs, blkno);
1497		if (lbn < UFS_NDADDR) {
1498			blkp = &ip->i_din2->di_db[lbn];
1499			ip->i_flag |= IN_CHANGE | IN_UPDATE;
1500		} else {
1501			error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn),
1502			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
1503			if (error)
1504				return (error);
1505			blkp = &((ufs2_daddr_t *)(ibp->b_data))
1506			    [(lbn - UFS_NDADDR) % NINDIR(fs)];
1507		}
1508		/*
1509		 * If we are expunging a snapshot vnode and we
1510		 * find a block marked BLK_NOCOPY, then it is
1511		 * one that has been allocated to this snapshot after
1512		 * we took our current snapshot and can be ignored.
1513		 */
1514		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
1515			if (lbn >= UFS_NDADDR)
1516				brelse(ibp);
1517		} else {
1518			if (*blkp != 0)
1519				panic("snapacct_ufs2: bad block");
1520			*blkp = expungetype;
1521			if (lbn >= UFS_NDADDR)
1522				bdwrite(ibp);
1523		}
1524	}
1525	return (0);
1526}
1527
1528/*
1529 * Account for a set of blocks allocated in a snapshot inode.
1530 */
1531static int
1532mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
1533	struct vnode *vp;
1534	ufs2_daddr_t *oldblkp, *lastblkp;
1535	struct fs *fs;
1536	ufs_lbn_t lblkno;
1537	int expungetype;
1538{
1539	ufs2_daddr_t blkno;
1540	struct inode *ip;
1541	ino_t inum;
1542	int acctit;
1543
1544	ip = VTOI(vp);
1545	inum = ip->i_number;
1546	if (lblkno == -1)
1547		acctit = 0;
1548	else
1549		acctit = 1;
1550	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
1551		blkno = *oldblkp;
1552		if (blkno == 0 || blkno == BLK_NOCOPY)
1553			continue;
1554		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP &&
1555		    lblkno >= UFS_NDADDR)
1556			*ip->i_snapblklist++ = lblkno;
1557		if (blkno == BLK_SNAP)
1558			blkno = blkstofrags(fs, lblkno);
1559		ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum,
1560		    vp->v_type, NULL, SINGLETON_KEY);
1561	}
1562	return (0);
1563}
1564
1565/*
1566 * Decrement extra reference on snapshot when last name is removed.
1567 * It will not be freed until the last open reference goes away.
1568 */
1569void
1570ffs_snapgone(ip)
1571	struct inode *ip;
1572{
1573	struct inode *xp;
1574	struct fs *fs;
1575	int snaploc;
1576	struct snapdata *sn;
1577	struct ufsmount *ump;
1578
1579	/*
1580	 * Find snapshot in incore list.
1581	 */
1582	xp = NULL;
1583	sn = ITODEVVP(ip)->v_rdev->si_snapdata;
1584	if (sn != NULL)
1585		TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap)
1586			if (xp == ip)
1587				break;
1588	if (xp != NULL)
1589		vrele(ITOV(ip));
1590	else if (snapdebug)
1591		printf("ffs_snapgone: lost snapshot vnode %ju\n",
1592		    (uintmax_t)ip->i_number);
1593	/*
1594	 * Delete snapshot inode from superblock. Keep list dense.
1595	 */
1596	ump = ITOUMP(ip);
1597	fs = ump->um_fs;
1598	UFS_LOCK(ump);
1599	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
1600		if (fs->fs_snapinum[snaploc] == ip->i_number)
1601			break;
1602	if (snaploc < FSMAXSNAP) {
1603		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
1604			if (fs->fs_snapinum[snaploc] == 0)
1605				break;
1606			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
1607		}
1608		fs->fs_snapinum[snaploc - 1] = 0;
1609	}
1610	UFS_UNLOCK(ump);
1611}
1612
1613/*
1614 * Prepare a snapshot file for being removed.
1615 */
1616void
1617ffs_snapremove(vp)
1618	struct vnode *vp;
1619{
1620	struct inode *ip;
1621	struct vnode *devvp;
1622	struct buf *ibp;
1623	struct fs *fs;
1624	ufs2_daddr_t numblks, blkno, dblk;
1625	int error, i, last, loc;
1626	struct snapdata *sn;
1627
1628	ip = VTOI(vp);
1629	fs = ITOFS(ip);
1630	devvp = ITODEVVP(ip);
1631	/*
1632	 * If active, delete from incore list (this snapshot may
1633	 * already have been in the process of being deleted, so
1634	 * would not have been active).
1635	 *
1636	 * Clear copy-on-write flag if last snapshot.
1637	 */
1638	VI_LOCK(devvp);
1639	if (ip->i_nextsnap.tqe_prev != 0) {
1640		sn = devvp->v_rdev->si_snapdata;
1641		TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap);
1642		ip->i_nextsnap.tqe_prev = 0;
1643		VI_UNLOCK(devvp);
1644		lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);
1645		for (i = 0; i < sn->sn_lock.lk_recurse; i++)
1646			lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);
1647		KASSERT(vp->v_vnlock == &sn->sn_lock,
1648			("ffs_snapremove: lost lock mutation"));
1649		vp->v_vnlock = &vp->v_lock;
1650		VI_LOCK(devvp);
1651		while (sn->sn_lock.lk_recurse > 0)
1652			lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
1653		lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
1654		try_free_snapdata(devvp);
1655	} else
1656		VI_UNLOCK(devvp);
1657	/*
1658	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
1659	 * snapshots that want them (see ffs_snapblkfree below).
1660	 */
1661	for (blkno = 1; blkno < UFS_NDADDR; blkno++) {
1662		dblk = DIP(ip, i_db[blkno]);
1663		if (dblk == 0)
1664			continue;
1665		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
1666			DIP_SET(ip, i_db[blkno], 0);
1667		else if ((dblk == blkstofrags(fs, blkno) &&
1668		     ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize,
1669		     ip->i_number, vp->v_type, NULL))) {
1670			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) -
1671			    btodb(fs->fs_bsize));
1672			DIP_SET(ip, i_db[blkno], 0);
1673		}
1674	}
1675	numblks = howmany(ip->i_size, fs->fs_bsize);
1676	for (blkno = UFS_NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
1677		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
1678		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
1679		if (error)
1680			continue;
1681		if (fs->fs_size - blkno > NINDIR(fs))
1682			last = NINDIR(fs);
1683		else
1684			last = fs->fs_size - blkno;
1685		for (loc = 0; loc < last; loc++) {
1686			if (I_IS_UFS1(ip)) {
1687				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
1688				if (dblk == 0)
1689					continue;
1690				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
1691					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
1692				else if ((dblk == blkstofrags(fs, blkno) &&
1693				     ffs_snapblkfree(fs, ITODEVVP(ip), dblk,
1694				     fs->fs_bsize, ip->i_number, vp->v_type,
1695				     NULL))) {
1696					ip->i_din1->di_blocks -=
1697					    btodb(fs->fs_bsize);
1698					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
1699				}
1700				continue;
1701			}
1702			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
1703			if (dblk == 0)
1704				continue;
1705			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
1706				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
1707			else if ((dblk == blkstofrags(fs, blkno) &&
1708			     ffs_snapblkfree(fs, ITODEVVP(ip), dblk,
1709			     fs->fs_bsize, ip->i_number, vp->v_type, NULL))) {
1710				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
1711				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
1712			}
1713		}
1714		bawrite(ibp);
1715	}
1716	/*
1717	 * Clear snapshot flag and drop reference.
1718	 */
1719	ip->i_flags &= ~SF_SNAPSHOT;
1720	DIP_SET(ip, i_flags, ip->i_flags);
1721	ip->i_flag |= IN_CHANGE | IN_UPDATE;
1722	/*
1723	 * The dirtied indirects must be written out before
1724	 * softdep_setup_freeblocks() is called.  Otherwise indir_trunc()
1725	 * may find indirect pointers using the magic BLK_* values.
1726	 */
1727	if (DOINGSOFTDEP(vp))
1728		ffs_syncvnode(vp, MNT_WAIT, 0);
1729#ifdef QUOTA
1730	/*
1731	 * Reenable disk quotas for ex-snapshot file.
1732	 */
1733	if (!getinoquota(ip))
1734		(void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE);
1735#endif
1736}
1737
1738/*
1739 * Notification that a block is being freed. Return zero if the free
1740 * should be allowed to proceed. Return non-zero if the snapshot file
1741 * wants to claim the block. The block will be claimed if it is an
1742 * uncopied part of one of the snapshots. It will be freed if it is
1743 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
1744 * If a fragment is being freed, then all snapshots that care about
1745 * it must make a copy since a snapshot file can only claim full sized
1746 * blocks. Note that if more than one snapshot file maps the block,
1747 * we can pick one at random to claim it. Since none of the snapshots
1748 * can change, we are assurred that they will all see the same unmodified
1749 * image. When deleting a snapshot file (see ffs_snapremove above), we
1750 * must push any of these claimed blocks to one of the other snapshots
1751 * that maps it. These claimed blocks are easily identified as they will
1752 * have a block number equal to their logical block number within the
1753 * snapshot. A copied block can never have this property because they
1754 * must always have been allocated from a BLK_NOCOPY location.
1755 */
1756int
1757ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd)
1758	struct fs *fs;
1759	struct vnode *devvp;
1760	ufs2_daddr_t bno;
1761	long size;
1762	ino_t inum;
1763	enum vtype vtype;
1764	struct workhead *wkhd;
1765{
1766	struct buf *ibp, *cbp, *savedcbp = NULL;
1767	struct thread *td = curthread;
1768	struct inode *ip;
1769	struct vnode *vp = NULL;
1770	ufs_lbn_t lbn;
1771	ufs2_daddr_t blkno;
1772	int indiroff = 0, error = 0, claimedblk = 0;
1773	struct snapdata *sn;
1774
1775	lbn = fragstoblks(fs, bno);
1776retry:
1777	VI_LOCK(devvp);
1778	sn = devvp->v_rdev->si_snapdata;
1779	if (sn == NULL) {
1780		VI_UNLOCK(devvp);
1781		return (0);
1782	}
1783	if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
1784	    VI_MTX(devvp)) != 0)
1785		goto retry;
1786	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
1787		vp = ITOV(ip);
1788		if (DOINGSOFTDEP(vp))
1789			softdep_prealloc(vp, MNT_WAIT);
1790		/*
1791		 * Lookup block being written.
1792		 */
1793		if (lbn < UFS_NDADDR) {
1794			blkno = DIP(ip, i_db[lbn]);
1795		} else {
1796			td->td_pflags |= TDP_COWINPROGRESS;
1797			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
1798			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
1799			td->td_pflags &= ~TDP_COWINPROGRESS;
1800			if (error)
1801				break;
1802			indiroff = (lbn - UFS_NDADDR) % NINDIR(fs);
1803			if (I_IS_UFS1(ip))
1804				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
1805			else
1806				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
1807		}
1808		/*
1809		 * Check to see if block needs to be copied.
1810		 */
1811		if (blkno == 0) {
1812			/*
1813			 * A block that we map is being freed. If it has not
1814			 * been claimed yet, we will claim or copy it (below).
1815			 */
1816			claimedblk = 1;
1817		} else if (blkno == BLK_SNAP) {
1818			/*
1819			 * No previous snapshot claimed the block,
1820			 * so it will be freed and become a BLK_NOCOPY
1821			 * (don't care) for us.
1822			 */
1823			if (claimedblk)
1824				panic("snapblkfree: inconsistent block type");
1825			if (lbn < UFS_NDADDR) {
1826				DIP_SET(ip, i_db[lbn], BLK_NOCOPY);
1827				ip->i_flag |= IN_CHANGE | IN_UPDATE;
1828			} else if (I_IS_UFS1(ip)) {
1829				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
1830				    BLK_NOCOPY;
1831				bdwrite(ibp);
1832			} else {
1833				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
1834				    BLK_NOCOPY;
1835				bdwrite(ibp);
1836			}
1837			continue;
1838		} else /* BLK_NOCOPY or default */ {
1839			/*
1840			 * If the snapshot has already copied the block
1841			 * (default), or does not care about the block,
1842			 * it is not needed.
1843			 */
1844			if (lbn >= UFS_NDADDR)
1845				bqrelse(ibp);
1846			continue;
1847		}
1848		/*
1849		 * If this is a full size block, we will just grab it
1850		 * and assign it to the snapshot inode. Otherwise we
1851		 * will proceed to copy it. See explanation for this
1852		 * routine as to why only a single snapshot needs to
1853		 * claim this block.
1854		 */
1855		if (size == fs->fs_bsize) {
1856#ifdef DEBUG
1857			if (snapdebug)
1858				printf("%s %ju lbn %jd from inum %ju\n",
1859				    "Grabonremove: snapino",
1860				    (uintmax_t)ip->i_number,
1861				    (intmax_t)lbn, (uintmax_t)inum);
1862#endif
1863			/*
1864			 * If journaling is tracking this write we must add
1865			 * the work to the inode or indirect being written.
1866			 */
1867			if (wkhd != NULL) {
1868				if (lbn < UFS_NDADDR)
1869					softdep_inode_append(ip,
1870					    curthread->td_ucred, wkhd);
1871				else
1872					softdep_buf_append(ibp, wkhd);
1873			}
1874			if (lbn < UFS_NDADDR) {
1875				DIP_SET(ip, i_db[lbn], bno);
1876			} else if (I_IS_UFS1(ip)) {
1877				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
1878				bdwrite(ibp);
1879			} else {
1880				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
1881				bdwrite(ibp);
1882			}
1883			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size));
1884			ip->i_flag |= IN_CHANGE | IN_UPDATE;
1885			lockmgr(vp->v_vnlock, LK_RELEASE, NULL);
1886			return (1);
1887		}
1888		if (lbn >= UFS_NDADDR)
1889			bqrelse(ibp);
1890		/*
1891		 * Allocate the block into which to do the copy. Note that this
1892		 * allocation will never require any additional allocations for
1893		 * the snapshot inode.
1894		 */
1895		td->td_pflags |= TDP_COWINPROGRESS;
1896		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
1897		    fs->fs_bsize, KERNCRED, 0, &cbp);
1898		td->td_pflags &= ~TDP_COWINPROGRESS;
1899		if (error)
1900			break;
1901#ifdef DEBUG
1902		if (snapdebug)
1903			printf("%s%ju lbn %jd %s %ju size %ld to blkno %jd\n",
1904			    "Copyonremove: snapino ", (uintmax_t)ip->i_number,
1905			    (intmax_t)lbn, "for inum", (uintmax_t)inum, size,
1906			    (intmax_t)cbp->b_blkno);
1907#endif
1908		/*
1909		 * If we have already read the old block contents, then
1910		 * simply copy them to the new block. Note that we need
1911		 * to synchronously write snapshots that have not been
1912		 * unlinked, and hence will be visible after a crash,
1913		 * to ensure their integrity. At a minimum we ensure the
1914		 * integrity of the filesystem metadata, but use the
1915		 * dopersistence sysctl-setable flag to decide on the
1916		 * persistence needed for file content data.
1917		 */
1918		if (savedcbp != NULL) {
1919			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
1920			bawrite(cbp);
1921			if ((vtype == VDIR || dopersistence) &&
1922			    ip->i_effnlink > 0)
1923				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
1924			continue;
1925		}
1926		/*
1927		 * Otherwise, read the old block contents into the buffer.
1928		 */
1929		if ((error = readblock(vp, cbp, lbn)) != 0) {
1930			bzero(cbp->b_data, fs->fs_bsize);
1931			bawrite(cbp);
1932			if ((vtype == VDIR || dopersistence) &&
1933			    ip->i_effnlink > 0)
1934				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
1935			break;
1936		}
1937		savedcbp = cbp;
1938	}
1939	/*
1940	 * Note that we need to synchronously write snapshots that
1941	 * have not been unlinked, and hence will be visible after
1942	 * a crash, to ensure their integrity. At a minimum we
1943	 * ensure the integrity of the filesystem metadata, but
1944	 * use the dopersistence sysctl-setable flag to decide on
1945	 * the persistence needed for file content data.
1946	 */
1947	if (savedcbp) {
1948		vp = savedcbp->b_vp;
1949		bawrite(savedcbp);
1950		if ((vtype == VDIR || dopersistence) &&
1951		    VTOI(vp)->i_effnlink > 0)
1952			(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
1953	}
1954	/*
1955	 * If we have been unable to allocate a block in which to do
1956	 * the copy, then return non-zero so that the fragment will
1957	 * not be freed. Although space will be lost, the snapshot
1958	 * will stay consistent.
1959	 */
1960	if (error != 0 && wkhd != NULL)
1961		softdep_freework(wkhd);
1962	lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
1963	return (error);
1964}
1965
1966/*
1967 * Associate snapshot files when mounting.
1968 */
1969void
1970ffs_snapshot_mount(mp)
1971	struct mount *mp;
1972{
1973	struct ufsmount *ump = VFSTOUFS(mp);
1974	struct vnode *devvp = ump->um_devvp;
1975	struct fs *fs = ump->um_fs;
1976	struct thread *td = curthread;
1977	struct snapdata *sn;
1978	struct vnode *vp;
1979	struct vnode *lastvp;
1980	struct inode *ip;
1981	struct uio auio;
1982	struct iovec aiov;
1983	void *snapblklist;
1984	char *reason;
1985	daddr_t snaplistsize;
1986	int error, snaploc, loc;
1987
1988	/*
1989	 * XXX The following needs to be set before ffs_truncate or
1990	 * VOP_READ can be called.
1991	 */
1992	mp->mnt_stat.f_iosize = fs->fs_bsize;
1993	/*
1994	 * Process each snapshot listed in the superblock.
1995	 */
1996	vp = NULL;
1997	lastvp = NULL;
1998	sn = NULL;
1999	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
2000		if (fs->fs_snapinum[snaploc] == 0)
2001			break;
2002		if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc],
2003		    LK_EXCLUSIVE, &vp)) != 0){
2004			printf("ffs_snapshot_mount: vget failed %d\n", error);
2005			continue;
2006		}
2007		ip = VTOI(vp);
2008		if (vp->v_type != VREG) {
2009			reason = "non-file snapshot";
2010		} else if (!IS_SNAPSHOT(ip)) {
2011			reason = "non-snapshot";
2012		} else if (ip->i_size ==
2013		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
2014			reason = "old format snapshot";
2015			(void)ffs_truncate(vp, (off_t)0, 0, NOCRED);
2016			(void)ffs_syncvnode(vp, MNT_WAIT, 0);
2017		} else {
2018			reason = NULL;
2019		}
2020		if (reason != NULL) {
2021			printf("ffs_snapshot_mount: %s inode %d\n",
2022			    reason, fs->fs_snapinum[snaploc]);
2023			vput(vp);
2024			vp = NULL;
2025			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
2026				if (fs->fs_snapinum[loc] == 0)
2027					break;
2028				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
2029			}
2030			fs->fs_snapinum[loc - 1] = 0;
2031			snaploc--;
2032			continue;
2033		}
2034		/*
2035		 * Acquire a lock on the snapdata structure, creating it if
2036		 * necessary.
2037		 */
2038		sn = ffs_snapdata_acquire(devvp);
2039		/*
2040		 * Change vnode to use shared snapshot lock instead of the
2041		 * original private lock.
2042		 */
2043		vp->v_vnlock = &sn->sn_lock;
2044		lockmgr(&vp->v_lock, LK_RELEASE, NULL);
2045		/*
2046		 * Link it onto the active snapshot list.
2047		 */
2048		VI_LOCK(devvp);
2049		if (ip->i_nextsnap.tqe_prev != 0)
2050			panic("ffs_snapshot_mount: %ju already on list",
2051			    (uintmax_t)ip->i_number);
2052		else
2053			TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
2054		vp->v_vflag |= VV_SYSTEM;
2055		VI_UNLOCK(devvp);
2056		VOP_UNLOCK(vp, 0);
2057		lastvp = vp;
2058	}
2059	vp = lastvp;
2060	/*
2061	 * No usable snapshots found.
2062	 */
2063	if (sn == NULL || vp == NULL)
2064		return;
2065	/*
2066	 * Allocate the space for the block hints list. We always want to
2067	 * use the list from the newest snapshot.
2068	 */
2069	auio.uio_iov = &aiov;
2070	auio.uio_iovcnt = 1;
2071	aiov.iov_base = (void *)&snaplistsize;
2072	aiov.iov_len = sizeof(snaplistsize);
2073	auio.uio_resid = aiov.iov_len;
2074	auio.uio_offset =
2075	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
2076	auio.uio_segflg = UIO_SYSSPACE;
2077	auio.uio_rw = UIO_READ;
2078	auio.uio_td = td;
2079	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2080	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
2081		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
2082		VOP_UNLOCK(vp, 0);
2083		return;
2084	}
2085	snapblklist = malloc(snaplistsize * sizeof(daddr_t),
2086	    M_UFSMNT, M_WAITOK);
2087	auio.uio_iovcnt = 1;
2088	aiov.iov_base = snapblklist;
2089	aiov.iov_len = snaplistsize * sizeof (daddr_t);
2090	auio.uio_resid = aiov.iov_len;
2091	auio.uio_offset -= sizeof(snaplistsize);
2092	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
2093		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
2094		VOP_UNLOCK(vp, 0);
2095		free(snapblklist, M_UFSMNT);
2096		return;
2097	}
2098	VOP_UNLOCK(vp, 0);
2099	VI_LOCK(devvp);
2100	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
2101	sn->sn_listsize = snaplistsize;
2102	sn->sn_blklist = (daddr_t *)snapblklist;
2103	devvp->v_vflag |= VV_COPYONWRITE;
2104	VI_UNLOCK(devvp);
2105}
2106
2107/*
2108 * Disassociate snapshot files when unmounting.
2109 */
2110void
2111ffs_snapshot_unmount(mp)
2112	struct mount *mp;
2113{
2114	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
2115	struct snapdata *sn;
2116	struct inode *xp;
2117	struct vnode *vp;
2118
2119	VI_LOCK(devvp);
2120	sn = devvp->v_rdev->si_snapdata;
2121	while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) {
2122		vp = ITOV(xp);
2123		TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap);
2124		xp->i_nextsnap.tqe_prev = 0;
2125		lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE,
2126		    VI_MTX(devvp));
2127		lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);
2128		KASSERT(vp->v_vnlock == &sn->sn_lock,
2129		("ffs_snapshot_unmount: lost lock mutation"));
2130		vp->v_vnlock = &vp->v_lock;
2131		lockmgr(&vp->v_lock, LK_RELEASE, NULL);
2132		lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
2133		if (xp->i_effnlink > 0)
2134			vrele(vp);
2135		VI_LOCK(devvp);
2136		sn = devvp->v_rdev->si_snapdata;
2137	}
2138	try_free_snapdata(devvp);
2139	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
2140}
2141
2142/*
2143 * Check the buffer block to be belong to device buffer that shall be
2144 * locked after snaplk. devvp shall be locked on entry, and will be
2145 * leaved locked upon exit.
2146 */
2147static int
2148ffs_bp_snapblk(devvp, bp)
2149	struct vnode *devvp;
2150	struct buf *bp;
2151{
2152	struct snapdata *sn;
2153	struct fs *fs;
2154	ufs2_daddr_t lbn, *snapblklist;
2155	int lower, upper, mid;
2156
2157	ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk");
2158	KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp));
2159	sn = devvp->v_rdev->si_snapdata;
2160	if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL)
2161		return (0);
2162	fs = ITOFS(TAILQ_FIRST(&sn->sn_head));
2163	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
2164	snapblklist = sn->sn_blklist;
2165	upper = sn->sn_listsize - 1;
2166	lower = 1;
2167	while (lower <= upper) {
2168		mid = (lower + upper) / 2;
2169		if (snapblklist[mid] == lbn)
2170			break;
2171		if (snapblklist[mid] < lbn)
2172			lower = mid + 1;
2173		else
2174			upper = mid - 1;
2175	}
2176	if (lower <= upper)
2177		return (1);
2178	return (0);
2179}
2180
2181void
2182ffs_bdflush(bo, bp)
2183	struct bufobj *bo;
2184	struct buf *bp;
2185{
2186	struct thread *td;
2187	struct vnode *vp, *devvp;
2188	struct buf *nbp;
2189	int bp_bdskip;
2190
2191	if (bo->bo_dirty.bv_cnt <= dirtybufthresh)
2192		return;
2193
2194	td = curthread;
2195	vp = bp->b_vp;
2196	devvp = bo2vnode(bo);
2197	KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp));
2198
2199	VI_LOCK(devvp);
2200	bp_bdskip = ffs_bp_snapblk(devvp, bp);
2201	if (bp_bdskip)
2202		bdwriteskip++;
2203	VI_UNLOCK(devvp);
2204	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) {
2205		(void) VOP_FSYNC(vp, MNT_NOWAIT, td);
2206		altbufferflushes++;
2207	} else {
2208		BO_LOCK(bo);
2209		/*
2210		 * Try to find a buffer to flush.
2211		 */
2212		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
2213			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
2214			    BUF_LOCK(nbp,
2215				     LK_EXCLUSIVE | LK_NOWAIT, NULL))
2216				continue;
2217			if (bp == nbp)
2218				panic("bdwrite: found ourselves");
2219			BO_UNLOCK(bo);
2220			/*
2221			 * Don't countdeps with the bo lock
2222			 * held.
2223			 */
2224			if (buf_countdeps(nbp, 0)) {
2225				BO_LOCK(bo);
2226				BUF_UNLOCK(nbp);
2227				continue;
2228			}
2229			if (bp_bdskip) {
2230				VI_LOCK(devvp);
2231				if (!ffs_bp_snapblk(vp, nbp)) {
2232					VI_UNLOCK(devvp);
2233					BO_LOCK(bo);
2234					BUF_UNLOCK(nbp);
2235					continue;
2236				}
2237				VI_UNLOCK(devvp);
2238			}
2239			if (nbp->b_flags & B_CLUSTEROK) {
2240				vfs_bio_awrite(nbp);
2241			} else {
2242				bremfree(nbp);
2243				bawrite(nbp);
2244			}
2245			dirtybufferflushes++;
2246			break;
2247		}
2248		if (nbp == NULL)
2249			BO_UNLOCK(bo);
2250	}
2251}
2252
2253/*
2254 * Check for need to copy block that is about to be written,
2255 * copying the block if necessary.
2256 */
2257int
2258ffs_copyonwrite(devvp, bp)
2259	struct vnode *devvp;
2260	struct buf *bp;
2261{
2262	struct snapdata *sn;
2263	struct buf *ibp, *cbp, *savedcbp = NULL;
2264	struct thread *td = curthread;
2265	struct fs *fs;
2266	struct inode *ip;
2267	struct vnode *vp = NULL;
2268	ufs2_daddr_t lbn, blkno, *snapblklist;
2269	int lower, upper, mid, indiroff, error = 0;
2270	int launched_async_io, prev_norunningbuf;
2271	long saved_runningbufspace;
2272
2273	if (devvp != bp->b_vp && IS_SNAPSHOT(VTOI(bp->b_vp)))
2274		return (0);		/* Update on a snapshot file */
2275	if (td->td_pflags & TDP_COWINPROGRESS)
2276		panic("ffs_copyonwrite: recursive call");
2277	/*
2278	 * First check to see if it is in the preallocated list.
2279	 * By doing this check we avoid several potential deadlocks.
2280	 */
2281	VI_LOCK(devvp);
2282	sn = devvp->v_rdev->si_snapdata;
2283	if (sn == NULL ||
2284	    TAILQ_EMPTY(&sn->sn_head)) {
2285		VI_UNLOCK(devvp);
2286		return (0);		/* No snapshot */
2287	}
2288	ip = TAILQ_FIRST(&sn->sn_head);
2289	fs = ITOFS(ip);
2290	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
2291	if (lbn < UFS_NDADDR) {
2292		VI_UNLOCK(devvp);
2293		return (0);		/* Direct blocks are always copied */
2294	}
2295	snapblklist = sn->sn_blklist;
2296	upper = sn->sn_listsize - 1;
2297	lower = 1;
2298	while (lower <= upper) {
2299		mid = (lower + upper) / 2;
2300		if (snapblklist[mid] == lbn)
2301			break;
2302		if (snapblklist[mid] < lbn)
2303			lower = mid + 1;
2304		else
2305			upper = mid - 1;
2306	}
2307	if (lower <= upper) {
2308		VI_UNLOCK(devvp);
2309		return (0);
2310	}
2311	launched_async_io = 0;
2312	prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF;
2313	/*
2314	 * Since I/O on bp isn't yet in progress and it may be blocked
2315	 * for a long time waiting on snaplk, back it out of
2316	 * runningbufspace, possibly waking other threads waiting for space.
2317	 */
2318	saved_runningbufspace = bp->b_runningbufspace;
2319	if (saved_runningbufspace != 0)
2320		runningbufwakeup(bp);
2321	/*
2322	 * Not in the precomputed list, so check the snapshots.
2323	 */
2324	while (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
2325	    VI_MTX(devvp)) != 0) {
2326		VI_LOCK(devvp);
2327		sn = devvp->v_rdev->si_snapdata;
2328		if (sn == NULL ||
2329		    TAILQ_EMPTY(&sn->sn_head)) {
2330			VI_UNLOCK(devvp);
2331			if (saved_runningbufspace != 0) {
2332				bp->b_runningbufspace = saved_runningbufspace;
2333				atomic_add_long(&runningbufspace,
2334					       bp->b_runningbufspace);
2335			}
2336			return (0);		/* Snapshot gone */
2337		}
2338	}
2339	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
2340		vp = ITOV(ip);
2341		if (DOINGSOFTDEP(vp))
2342			softdep_prealloc(vp, MNT_WAIT);
2343		/*
2344		 * We ensure that everything of our own that needs to be
2345		 * copied will be done at the time that ffs_snapshot is
2346		 * called. Thus we can skip the check here which can
2347		 * deadlock in doing the lookup in UFS_BALLOC.
2348		 */
2349		if (bp->b_vp == vp)
2350			continue;
2351		/*
2352		 * Check to see if block needs to be copied. We do not have
2353		 * to hold the snapshot lock while doing this lookup as it
2354		 * will never require any additional allocations for the
2355		 * snapshot inode.
2356		 */
2357		if (lbn < UFS_NDADDR) {
2358			blkno = DIP(ip, i_db[lbn]);
2359		} else {
2360			td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
2361			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
2362			   fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
2363			td->td_pflags &= ~TDP_COWINPROGRESS;
2364			if (error)
2365				break;
2366			indiroff = (lbn - UFS_NDADDR) % NINDIR(fs);
2367			if (I_IS_UFS1(ip))
2368				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
2369			else
2370				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
2371			bqrelse(ibp);
2372		}
2373#ifdef INVARIANTS
2374		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
2375			panic("ffs_copyonwrite: bad copy block");
2376#endif
2377		if (blkno != 0)
2378			continue;
2379		/*
2380		 * Allocate the block into which to do the copy. Since
2381		 * multiple processes may all try to copy the same block,
2382		 * we have to recheck our need to do a copy if we sleep
2383		 * waiting for the lock.
2384		 *
2385		 * Because all snapshots on a filesystem share a single
2386		 * lock, we ensure that we will never be in competition
2387		 * with another process to allocate a block.
2388		 */
2389		td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
2390		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
2391		    fs->fs_bsize, KERNCRED, 0, &cbp);
2392		td->td_pflags &= ~TDP_COWINPROGRESS;
2393		if (error)
2394			break;
2395#ifdef DEBUG
2396		if (snapdebug) {
2397			printf("Copyonwrite: snapino %ju lbn %jd for ",
2398			    (uintmax_t)ip->i_number, (intmax_t)lbn);
2399			if (bp->b_vp == devvp)
2400				printf("fs metadata");
2401			else
2402				printf("inum %ju",
2403				    (uintmax_t)VTOI(bp->b_vp)->i_number);
2404			printf(" lblkno %jd to blkno %jd\n",
2405			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
2406		}
2407#endif
2408		/*
2409		 * If we have already read the old block contents, then
2410		 * simply copy them to the new block. Note that we need
2411		 * to synchronously write snapshots that have not been
2412		 * unlinked, and hence will be visible after a crash,
2413		 * to ensure their integrity. At a minimum we ensure the
2414		 * integrity of the filesystem metadata, but use the
2415		 * dopersistence sysctl-setable flag to decide on the
2416		 * persistence needed for file content data.
2417		 */
2418		if (savedcbp != NULL) {
2419			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
2420			bawrite(cbp);
2421			if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR ||
2422			    dopersistence) && ip->i_effnlink > 0)
2423				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
2424			else
2425				launched_async_io = 1;
2426			continue;
2427		}
2428		/*
2429		 * Otherwise, read the old block contents into the buffer.
2430		 */
2431		if ((error = readblock(vp, cbp, lbn)) != 0) {
2432			bzero(cbp->b_data, fs->fs_bsize);
2433			bawrite(cbp);
2434			if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR ||
2435			    dopersistence) && ip->i_effnlink > 0)
2436				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
2437			else
2438				launched_async_io = 1;
2439			break;
2440		}
2441		savedcbp = cbp;
2442	}
2443	/*
2444	 * Note that we need to synchronously write snapshots that
2445	 * have not been unlinked, and hence will be visible after
2446	 * a crash, to ensure their integrity. At a minimum we
2447	 * ensure the integrity of the filesystem metadata, but
2448	 * use the dopersistence sysctl-setable flag to decide on
2449	 * the persistence needed for file content data.
2450	 */
2451	if (savedcbp) {
2452		vp = savedcbp->b_vp;
2453		bawrite(savedcbp);
2454		if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR ||
2455		    dopersistence) && VTOI(vp)->i_effnlink > 0)
2456			(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
2457		else
2458			launched_async_io = 1;
2459	}
2460	lockmgr(vp->v_vnlock, LK_RELEASE, NULL);
2461	td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) |
2462		prev_norunningbuf;
2463	if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0)
2464		waitrunningbufspace();
2465	/*
2466	 * I/O on bp will now be started, so count it in runningbufspace.
2467	 */
2468	if (saved_runningbufspace != 0) {
2469		bp->b_runningbufspace = saved_runningbufspace;
2470		atomic_add_long(&runningbufspace, bp->b_runningbufspace);
2471	}
2472	return (error);
2473}
2474
2475/*
2476 * sync snapshots to force freework records waiting on snapshots to claim
2477 * blocks to free.
2478 */
2479void
2480ffs_sync_snap(mp, waitfor)
2481	struct mount *mp;
2482	int waitfor;
2483{
2484	struct snapdata *sn;
2485	struct vnode *devvp;
2486	struct vnode *vp;
2487	struct inode *ip;
2488
2489	devvp = VFSTOUFS(mp)->um_devvp;
2490	if ((devvp->v_vflag & VV_COPYONWRITE) == 0)
2491		return;
2492	for (;;) {
2493		VI_LOCK(devvp);
2494		sn = devvp->v_rdev->si_snapdata;
2495		if (sn == NULL) {
2496			VI_UNLOCK(devvp);
2497			return;
2498		}
2499		if (lockmgr(&sn->sn_lock,
2500		    LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
2501		    VI_MTX(devvp)) == 0)
2502			break;
2503	}
2504	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
2505		vp = ITOV(ip);
2506		ffs_syncvnode(vp, waitfor, NO_INO_UPDT);
2507	}
2508	lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
2509}
2510
2511/*
2512 * Read the specified block into the given buffer.
2513 * Much of this boiler-plate comes from bwrite().
2514 */
2515static int
2516readblock(vp, bp, lbn)
2517	struct vnode *vp;
2518	struct buf *bp;
2519	ufs2_daddr_t lbn;
2520{
2521	struct inode *ip;
2522	struct bio *bip;
2523	struct fs *fs;
2524
2525	ip = VTOI(vp);
2526	fs = ITOFS(ip);
2527
2528	bip = g_alloc_bio();
2529	bip->bio_cmd = BIO_READ;
2530	bip->bio_offset = dbtob(fsbtodb(fs, blkstofrags(fs, lbn)));
2531	bip->bio_data = bp->b_data;
2532	bip->bio_length = bp->b_bcount;
2533	bip->bio_done = NULL;
2534
2535	g_io_request(bip, ITODEVVP(ip)->v_bufobj.bo_private);
2536	bp->b_error = biowait(bip, "snaprdb");
2537	g_destroy_bio(bip);
2538	return (bp->b_error);
2539}
2540
2541#endif
2542
2543/*
2544 * Process file deletes that were deferred by ufs_inactive() due to
2545 * the file system being suspended. Transfer IN_LAZYACCESS into
2546 * IN_MODIFIED for vnodes that were accessed during suspension.
2547 */
2548void
2549process_deferred_inactive(struct mount *mp)
2550{
2551	struct vnode *vp, *mvp;
2552	struct inode *ip;
2553	struct thread *td;
2554	int error;
2555
2556	td = curthread;
2557	(void) vn_start_secondary_write(NULL, &mp, V_WAIT);
2558 loop:
2559	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2560		/*
2561		 * IN_LAZYACCESS is checked here without holding any
2562		 * vnode lock, but this flag is set only while holding
2563		 * vnode interlock.
2564		 */
2565		if (vp->v_type == VNON ||
2566		    ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 &&
2567		    ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0))) {
2568			VI_UNLOCK(vp);
2569			continue;
2570		}
2571		vholdl(vp);
2572		error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
2573		if (error != 0) {
2574			vdrop(vp);
2575			if (error == ENOENT)
2576				continue;	/* vnode recycled */
2577			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2578			goto loop;
2579		}
2580		ip = VTOI(vp);
2581		if ((ip->i_flag & IN_LAZYACCESS) != 0) {
2582			ip->i_flag &= ~IN_LAZYACCESS;
2583			ip->i_flag |= IN_MODIFIED;
2584		}
2585		VI_LOCK(vp);
2586		if ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0) {
2587			VI_UNLOCK(vp);
2588			VOP_UNLOCK(vp, 0);
2589			vdrop(vp);
2590			continue;
2591		}
2592		vinactive(vp, td);
2593		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
2594			 ("process_deferred_inactive: got VI_OWEINACT"));
2595		VI_UNLOCK(vp);
2596		VOP_UNLOCK(vp, 0);
2597		vdrop(vp);
2598	}
2599	vn_finished_secondary_write(mp);
2600}
2601
2602#ifndef NO_FFS_SNAPSHOT
2603
2604static struct snapdata *
2605ffs_snapdata_alloc(void)
2606{
2607	struct snapdata *sn;
2608
2609	/*
2610	 * Fetch a snapdata from the free list if there is one available.
2611	 */
2612	mtx_lock(&snapfree_lock);
2613	sn = LIST_FIRST(&snapfree);
2614	if (sn != NULL)
2615		LIST_REMOVE(sn, sn_link);
2616	mtx_unlock(&snapfree_lock);
2617	if (sn != NULL)
2618		return (sn);
2619	/*
2620 	 * If there were no free snapdatas allocate one.
2621	 */
2622	sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO);
2623	TAILQ_INIT(&sn->sn_head);
2624	lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT,
2625	    LK_CANRECURSE | LK_NOSHARE);
2626	return (sn);
2627}
2628
2629/*
2630 * The snapdata is never freed because we can not be certain that
2631 * there are no threads sleeping on the snap lock.  Persisting
2632 * them permanently avoids costly synchronization in ffs_lock().
2633 */
2634static void
2635ffs_snapdata_free(struct snapdata *sn)
2636{
2637	mtx_lock(&snapfree_lock);
2638	LIST_INSERT_HEAD(&snapfree, sn, sn_link);
2639	mtx_unlock(&snapfree_lock);
2640}
2641
2642/* Try to free snapdata associated with devvp */
2643static void
2644try_free_snapdata(struct vnode *devvp)
2645{
2646	struct snapdata *sn;
2647	ufs2_daddr_t *snapblklist;
2648
2649	ASSERT_VI_LOCKED(devvp, "try_free_snapdata");
2650	sn = devvp->v_rdev->si_snapdata;
2651
2652	if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL ||
2653	    (devvp->v_vflag & VV_COPYONWRITE) == 0) {
2654		VI_UNLOCK(devvp);
2655		return;
2656	}
2657
2658	devvp->v_rdev->si_snapdata = NULL;
2659	devvp->v_vflag &= ~VV_COPYONWRITE;
2660	lockmgr(&sn->sn_lock, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp));
2661	snapblklist = sn->sn_blklist;
2662	sn->sn_blklist = NULL;
2663	sn->sn_listsize = 0;
2664	lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
2665	if (snapblklist != NULL)
2666		free(snapblklist, M_UFSMNT);
2667	ffs_snapdata_free(sn);
2668}
2669
2670static struct snapdata *
2671ffs_snapdata_acquire(struct vnode *devvp)
2672{
2673	struct snapdata *nsn, *sn;
2674	int error;
2675
2676	/*
2677	 * Allocate a free snapdata.  This is done before acquiring the
2678	 * devvp lock to avoid allocation while the devvp interlock is
2679	 * held.
2680	 */
2681	nsn = ffs_snapdata_alloc();
2682
2683	for (;;) {
2684		VI_LOCK(devvp);
2685		sn = devvp->v_rdev->si_snapdata;
2686		if (sn == NULL) {
2687			/*
2688			 * This is the first snapshot on this
2689			 * filesystem and we use our pre-allocated
2690			 * snapdata.  Publish sn with the sn_lock
2691			 * owned by us, to avoid the race.
2692			 */
2693			error = lockmgr(&nsn->sn_lock, LK_EXCLUSIVE |
2694			    LK_NOWAIT, NULL);
2695			if (error != 0)
2696				panic("leaked sn, lockmgr error %d", error);
2697			sn = devvp->v_rdev->si_snapdata = nsn;
2698			VI_UNLOCK(devvp);
2699			nsn = NULL;
2700			break;
2701		}
2702
2703		/*
2704		 * There is a snapshots which already exists on this
2705		 * filesystem, grab a reference to the common lock.
2706		 */
2707		error = lockmgr(&sn->sn_lock, LK_INTERLOCK |
2708		    LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp));
2709		if (error == 0)
2710			break;
2711	}
2712
2713	/*
2714	 * Free any unused snapdata.
2715	 */
2716	if (nsn != NULL)
2717		ffs_snapdata_free(nsn);
2718
2719	return (sn);
2720}
2721
2722#endif
2723