ffs_snapshot.c revision 107915
162976Smckusick/* 262976Smckusick * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 362976Smckusick * 462976Smckusick * Further information about snapshots can be obtained from: 562976Smckusick * 662976Smckusick * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 762976Smckusick * 1614 Oxford Street mckusick@mckusick.com 862976Smckusick * Berkeley, CA 94709-1608 +1-510-843-9542 962976Smckusick * USA 1062976Smckusick * 1162976Smckusick * Redistribution and use in source and binary forms, with or without 1262976Smckusick * modification, are permitted provided that the following conditions 1362976Smckusick * are met: 1462976Smckusick * 1562976Smckusick * 1. Redistributions of source code must retain the above copyright 1662976Smckusick * notice, this list of conditions and the following disclaimer. 1762976Smckusick * 2. Redistributions in binary form must reproduce the above copyright 1862976Smckusick * notice, this list of conditions and the following disclaimer in the 1962976Smckusick * documentation and/or other materials provided with the distribution. 2062976Smckusick * 2162976Smckusick * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 2262976Smckusick * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 2362976Smckusick * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 2462976Smckusick * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 2562976Smckusick * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 2662976Smckusick * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2762976Smckusick * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2862976Smckusick * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2962976Smckusick * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 3062976Smckusick * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 3162976Smckusick * SUCH DAMAGE. 3262976Smckusick * 3363788Smckusick * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 3462976Smckusick * $FreeBSD: head/sys/ufs/ffs/ffs_snapshot.c 107915 2002-12-15 19:25:59Z mckusick $ 3562976Smckusick */ 3662976Smckusick 3762976Smckusick#include <sys/param.h> 3898542Smckusick#include <sys/stdint.h> 39105191Smckusick#include <sys/kernel.h> 4062976Smckusick#include <sys/systm.h> 4173942Smckusick#include <sys/conf.h> 4262976Smckusick#include <sys/bio.h> 4362976Smckusick#include <sys/buf.h> 4462976Smckusick#include <sys/proc.h> 4562976Smckusick#include <sys/namei.h> 4662976Smckusick#include <sys/stat.h> 4762976Smckusick#include <sys/malloc.h> 4862976Smckusick#include <sys/mount.h> 4962976Smckusick#include <sys/resource.h> 5062976Smckusick#include <sys/resourcevar.h> 5162976Smckusick#include <sys/vnode.h> 5262976Smckusick 5362976Smckusick#include <ufs/ufs/extattr.h> 5462976Smckusick#include <ufs/ufs/quota.h> 5562976Smckusick#include <ufs/ufs/ufsmount.h> 5662976Smckusick#include <ufs/ufs/inode.h> 5762976Smckusick#include <ufs/ufs/ufs_extern.h> 5862976Smckusick 5962976Smckusick#include <ufs/ffs/fs.h> 6062976Smckusick#include <ufs/ffs/ffs_extern.h> 6162976Smckusick 6291420Sjhb#define KERNCRED thread0.td_ucred 6365998Sdes#define DEBUG 1 6462976Smckusick 6592728Salfredstatic int cgaccount(int, struct vnode *, struct buf *, int); 6698542Smckusickstatic int expunge_ufs1(struct vnode *, struct inode *, struct fs *, 6798542Smckusick int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 6898542Smckusick ufs_lbn_t, int), int); 6998542Smckusickstatic int indiracct_ufs1(struct vnode *, struct vnode *, int, 7098542Smckusick ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 7198542Smckusick int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 7298542Smckusick ufs_lbn_t, int), int); 7398542Smckusickstatic int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 7498542Smckusick struct fs *, ufs_lbn_t, int); 7598542Smckusickstatic int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 7698542Smckusick struct fs *, ufs_lbn_t, int); 7798542Smckusickstatic int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 7898542Smckusick struct fs *, ufs_lbn_t, int); 7998542Smckusickstatic int expunge_ufs2(struct vnode *, struct inode *, struct fs *, 8098542Smckusick int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 8198542Smckusick ufs_lbn_t, int), int); 8298542Smckusickstatic int indiracct_ufs2(struct vnode *, struct vnode *, int, 8398542Smckusick ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 8498542Smckusick int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 8598542Smckusick ufs_lbn_t, int), int); 8698542Smckusickstatic int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 8798542Smckusick struct fs *, ufs_lbn_t, int); 8898542Smckusickstatic int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 8998542Smckusick struct fs *, ufs_lbn_t, int); 9098542Smckusickstatic int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 9198542Smckusick struct fs *, ufs_lbn_t, int); 9292728Salfredstatic int ffs_copyonwrite(struct vnode *, struct buf *); 9398542Smckusickstatic int readblock(struct buf *, ufs2_daddr_t); 9462976Smckusick 9576580Smckusick/* 9676580Smckusick * To ensure the consistency of snapshots across crashes, we must 9776580Smckusick * synchronously write out copied blocks before allowing the 9876580Smckusick * originals to be modified. Because of the rather severe speed 9976580Smckusick * penalty that this imposes, the following flag allows this 10076580Smckusick * crash persistence to be disabled. 10176580Smckusick */ 10276580Smckusickint dopersistence = 0; 10376580Smckusick 10462976Smckusick#ifdef DEBUG 10562976Smckusick#include <sys/sysctl.h> 10676580SmckusickSYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, ""); 10762976Smckusickint snapdebug = 0; 10862976SmckusickSYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); 10987827Smckusickint collectsnapstats = 0; 11087827SmckusickSYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats, 11187827Smckusick 0, ""); 11262976Smckusick#endif /* DEBUG */ 11362976Smckusick 11462976Smckusick/* 11562976Smckusick * Create a snapshot file and initialize it for the filesystem. 11662976Smckusick */ 11762976Smckusickint 11862976Smckusickffs_snapshot(mp, snapfile) 11962976Smckusick struct mount *mp; 12062976Smckusick char *snapfile; 12162976Smckusick{ 12298542Smckusick ufs2_daddr_t numblks, blkno; 12398542Smckusick int error, cg, snaploc; 12490098Smckusick int i, size, len, loc; 12576269Smckusick int flag = mp->mnt_flag; 12687827Smckusick struct timespec starttime = {0, 0}, endtime; 12787827Smckusick char saved_nice = 0; 128107848Smckusick long redo = 0, snaplistsize; 12976269Smckusick int32_t *lp; 13071073Siedowse void *space; 131107848Smckusick daddr_t *snapblklist; 13276269Smckusick struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs; 13373942Smckusick struct snaphead *snaphead; 13483366Sjulian struct thread *td = curthread; 13573942Smckusick struct inode *ip, *xp; 13676269Smckusick struct buf *bp, *nbp, *ibp, *sbp = NULL; 13762976Smckusick struct nameidata nd; 13862976Smckusick struct mount *wrtmp; 13962976Smckusick struct vattr vat; 140107414Smckusick struct vnode *vp, *xvp, *nvp, *devvp; 141104698Smckusick struct uio auio; 142104698Smckusick struct iovec aiov; 14362976Smckusick 14462976Smckusick /* 14562976Smckusick * Need to serialize access to snapshot code per filesystem. 14662976Smckusick */ 14762976Smckusick /* 14862976Smckusick * Assign a snapshot slot in the superblock. 14962976Smckusick */ 15062976Smckusick for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 15162976Smckusick if (fs->fs_snapinum[snaploc] == 0) 15262976Smckusick break; 15362976Smckusick if (snaploc == FSMAXSNAP) 15462976Smckusick return (ENOSPC); 15562976Smckusick /* 15662976Smckusick * Create the snapshot file. 15762976Smckusick */ 15862976Smckusickrestart: 15983366Sjulian NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, td); 16062976Smckusick if ((error = namei(&nd)) != 0) 16162976Smckusick return (error); 16262976Smckusick if (nd.ni_vp != NULL) { 16362976Smckusick vput(nd.ni_vp); 16462976Smckusick error = EEXIST; 16562976Smckusick } 16662976Smckusick if (nd.ni_dvp->v_mount != mp) 16762976Smckusick error = EXDEV; 16862976Smckusick if (error) { 16962976Smckusick NDFREE(&nd, NDF_ONLY_PNBUF); 17062976Smckusick if (nd.ni_dvp == nd.ni_vp) 17162976Smckusick vrele(nd.ni_dvp); 17262976Smckusick else 17362976Smckusick vput(nd.ni_dvp); 17462976Smckusick return (error); 17562976Smckusick } 17662976Smckusick VATTR_NULL(&vat); 17762976Smckusick vat.va_type = VREG; 17862976Smckusick vat.va_mode = S_IRUSR; 17962976Smckusick vat.va_vaflags |= VA_EXCLUSIVE; 18062976Smckusick if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) 18162976Smckusick wrtmp = NULL; 18262976Smckusick if (wrtmp != mp) 18362976Smckusick panic("ffs_snapshot: mount mismatch"); 18462985Smckusick if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { 18562976Smckusick NDFREE(&nd, NDF_ONLY_PNBUF); 18662976Smckusick vput(nd.ni_dvp); 18762985Smckusick if ((error = vn_start_write(NULL, &wrtmp, 18862985Smckusick V_XSLEEP | PCATCH)) != 0) 18962976Smckusick return (error); 19062976Smckusick goto restart; 19162976Smckusick } 19283366Sjulian VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE); 19362976Smckusick error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); 19462976Smckusick vput(nd.ni_dvp); 19562976Smckusick if (error) { 19662976Smckusick NDFREE(&nd, NDF_ONLY_PNBUF); 19762976Smckusick vn_finished_write(wrtmp); 19862976Smckusick return (error); 19962976Smckusick } 20062976Smckusick vp = nd.ni_vp; 20162976Smckusick ip = VTOI(vp); 202107414Smckusick devvp = ip->i_devvp; 20362976Smckusick /* 20462976Smckusick * Allocate and copy the last block contents so as to be able 20562976Smckusick * to set size to that of the filesystem. 20662976Smckusick */ 20762976Smckusick numblks = howmany(fs->fs_size, fs->fs_frag); 20876132Sphk error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), 20998658Sdillon fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); 21062976Smckusick if (error) 21162976Smckusick goto out; 21262976Smckusick ip->i_size = lblktosize(fs, (off_t)numblks); 21398542Smckusick DIP(ip, i_size) = ip->i_size; 21462976Smckusick ip->i_flag |= IN_CHANGE | IN_UPDATE; 21562976Smckusick if ((error = readblock(bp, numblks - 1)) != 0) 21662976Smckusick goto out; 21762976Smckusick bawrite(bp); 21862976Smckusick /* 21962976Smckusick * Preallocate critical data structures so that we can copy 22062976Smckusick * them in without further allocation after we suspend all 22162976Smckusick * operations on the filesystem. We would like to just release 22262976Smckusick * the allocated buffers without writing them since they will 22362976Smckusick * be filled in below once we are ready to go, but this upsets 22462976Smckusick * the soft update code, so we go ahead and write the new buffers. 22562976Smckusick * 22675993Smckusick * Allocate all indirect blocks and mark all of them as not 22775993Smckusick * needing to be copied. 22862976Smckusick */ 22962976Smckusick for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 23076132Sphk error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), 23198658Sdillon fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp); 23262976Smckusick if (error) 23362976Smckusick goto out; 234107406Smckusick bawrite(ibp); 23562976Smckusick } 23662976Smckusick /* 23762976Smckusick * Allocate copies for the superblock and its summary information. 23862976Smckusick */ 239107294Smckusick error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 240107294Smckusick 0, &nbp); 24176269Smckusick if (error) 24262976Smckusick goto out; 24362976Smckusick bawrite(nbp); 24462976Smckusick blkno = fragstoblks(fs, fs->fs_csaddr); 24562976Smckusick len = howmany(fs->fs_cssize, fs->fs_bsize); 24662976Smckusick for (loc = 0; loc < len; loc++) { 24776132Sphk error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), 24862976Smckusick fs->fs_bsize, KERNCRED, 0, &nbp); 24962976Smckusick if (error) 25062976Smckusick goto out; 25162976Smckusick bawrite(nbp); 25262976Smckusick } 25362976Smckusick /* 25487827Smckusick * Allocate all cylinder group blocks. 25587827Smckusick */ 25687827Smckusick for (cg = 0; cg < fs->fs_ncg; cg++) { 25787827Smckusick error = UFS_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift, 25887827Smckusick fs->fs_bsize, KERNCRED, 0, &nbp); 25987827Smckusick if (error) 26087827Smckusick goto out; 261107406Smckusick bawrite(nbp); 26287827Smckusick } 26387827Smckusick /* 26487827Smckusick * Copy all the cylinder group maps. Although the 26587827Smckusick * filesystem is still active, we hope that only a few 26687827Smckusick * cylinder groups will change between now and when we 26787827Smckusick * suspend operations. Thus, we will be able to quickly 26887827Smckusick * touch up the few cylinder groups that changed during 26987827Smckusick * the suspension period. 27087827Smckusick */ 27189450Smckusick len = howmany(fs->fs_ncg, NBBY); 27288138Smckusick MALLOC(fs->fs_active, int *, len, M_DEVBUF, M_WAITOK); 27387827Smckusick bzero(fs->fs_active, len); 27487827Smckusick for (cg = 0; cg < fs->fs_ncg; cg++) { 275107558Smckusick error = UFS_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift, 276107558Smckusick fs->fs_bsize, KERNCRED, 0, &nbp); 277107558Smckusick if (error) 27887827Smckusick goto out; 27987827Smckusick error = cgaccount(cg, vp, nbp, 1); 28087827Smckusick bawrite(nbp); 28187827Smckusick if (error) 28287827Smckusick goto out; 28387827Smckusick } 28487827Smckusick /* 28562976Smckusick * Change inode to snapshot type file. 28662976Smckusick */ 28763897Smckusick ip->i_flags |= SF_SNAPSHOT; 28898542Smckusick DIP(ip, i_flags) = ip->i_flags; 28962976Smckusick ip->i_flag |= IN_CHANGE | IN_UPDATE; 29062976Smckusick /* 29162976Smckusick * Ensure that the snapshot is completely on disk. 292107406Smckusick * Since we have marked it as a snapshot it is safe to 293107406Smckusick * unlock it as no process will be allowed to write to it. 29462976Smckusick */ 29583366Sjulian if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td)) != 0) 29662976Smckusick goto out; 297107406Smckusick VOP_UNLOCK(vp, 0, td); 29862976Smckusick /* 29962976Smckusick * All allocations are done, so we can now snapshot the system. 30062976Smckusick * 30187827Smckusick * Recind nice scheduling while running with the filesystem suspended. 30287827Smckusick */ 30387827Smckusick if (td->td_ksegrp->kg_nice > 0) { 30487827Smckusick saved_nice = td->td_ksegrp->kg_nice; 30587827Smckusick td->td_ksegrp->kg_nice = 0; 30687827Smckusick } 30787827Smckusick /* 30862976Smckusick * Suspend operation on filesystem. 30962976Smckusick */ 31062976Smckusick for (;;) { 31162976Smckusick vn_finished_write(wrtmp); 312105902Smckusick if ((error = vfs_write_suspend(vp->v_mount)) != 0) { 313105902Smckusick vn_start_write(NULL, &wrtmp, V_WAIT); 314107406Smckusick vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 315105902Smckusick goto out; 316105902Smckusick } 31762976Smckusick if (mp->mnt_kern_flag & MNTK_SUSPENDED) 31862976Smckusick break; 31962985Smckusick vn_start_write(NULL, &wrtmp, V_WAIT); 32062976Smckusick } 321107406Smckusick vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 32290098Smckusick if (collectsnapstats) 32390098Smckusick nanotime(&starttime); 32462976Smckusick /* 32587827Smckusick * First, copy all the cylinder group maps that have changed. 32662976Smckusick */ 32762976Smckusick for (cg = 0; cg < fs->fs_ncg; cg++) { 32888138Smckusick if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0) 32987827Smckusick continue; 33087827Smckusick redo++; 331107558Smckusick error = UFS_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift, 332107558Smckusick fs->fs_bsize, KERNCRED, 0, &nbp); 333107558Smckusick if (error) 33462976Smckusick goto out1; 33587827Smckusick error = cgaccount(cg, vp, nbp, 2); 33689450Smckusick bawrite(nbp); 33787827Smckusick if (error) 33862976Smckusick goto out1; 33962976Smckusick } 34062976Smckusick /* 34176269Smckusick * Grab a copy of the superblock and its summary information. 34276269Smckusick * We delay writing it until the suspension is released below. 34376269Smckusick */ 344107294Smckusick error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, 34598542Smckusick KERNCRED, &sbp); 34690098Smckusick if (error) { 34790098Smckusick brelse(sbp); 34890098Smckusick sbp = NULL; 34976269Smckusick goto out1; 35090098Smckusick } 351107294Smckusick loc = blkoff(fs, fs->fs_sblockloc); 35298542Smckusick copy_fs = (struct fs *)(sbp->b_data + loc); 35376269Smckusick bcopy(fs, copy_fs, fs->fs_sbsize); 35476269Smckusick if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) 35576269Smckusick copy_fs->fs_clean = 1; 35698542Smckusick if (fs->fs_sbsize < SBLOCKSIZE) 35798542Smckusick bzero(&sbp->b_data[loc + fs->fs_sbsize], 35898542Smckusick SBLOCKSIZE - fs->fs_sbsize); 35976269Smckusick size = blkroundup(fs, fs->fs_cssize); 36076269Smckusick if (fs->fs_contigsumsize > 0) 36176269Smckusick size += fs->fs_ncg * sizeof(int32_t); 36276269Smckusick space = malloc((u_long)size, M_UFSMNT, M_WAITOK); 36376269Smckusick copy_fs->fs_csp = space; 36476269Smckusick bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); 36576269Smckusick (char *)space += fs->fs_cssize; 36676269Smckusick loc = howmany(fs->fs_cssize, fs->fs_fsize); 36776356Smckusick i = fs->fs_frag - loc % fs->fs_frag; 36876356Smckusick len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; 36976356Smckusick if (len > 0) { 370107414Smckusick if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), 37176269Smckusick len, KERNCRED, &bp)) != 0) { 37290098Smckusick brelse(bp); 37376269Smckusick free(copy_fs->fs_csp, M_UFSMNT); 37490098Smckusick bawrite(sbp); 37590098Smckusick sbp = NULL; 37676269Smckusick goto out1; 37762976Smckusick } 37876269Smckusick bcopy(bp->b_data, space, (u_int)len); 37976269Smckusick (char *)space += len; 38076269Smckusick bp->b_flags |= B_INVAL | B_NOCACHE; 38176269Smckusick brelse(bp); 38262976Smckusick } 38376269Smckusick if (fs->fs_contigsumsize > 0) { 38476269Smckusick copy_fs->fs_maxcluster = lp = space; 38576269Smckusick for (i = 0; i < fs->fs_ncg; i++) 38676269Smckusick *lp++ = fs->fs_contigsumsize; 38776269Smckusick } 38862976Smckusick /* 38990098Smckusick * We must check for active files that have been unlinked 39090098Smckusick * (e.g., with a zero link count). We have to expunge all 39190098Smckusick * trace of these files from the snapshot so that they are 39290098Smckusick * not reclaimed prematurely by fsck or unnecessarily dumped. 39390098Smckusick * We turn off the MNTK_SUSPENDED flag to avoid a panic from 39490098Smckusick * spec_strategy about writing on a suspended filesystem. 395104698Smckusick * Note that we skip unlinked snapshot files as they will 396104698Smckusick * be handled separately below. 39790098Smckusick */ 39890098Smckusick mp->mnt_kern_flag &= ~MNTK_SUSPENDED; 39990098Smckusick mtx_lock(&mntvnode_mtx); 40090098Smckusickloop: 40190098Smckusick for (xvp = TAILQ_FIRST(&mp->mnt_nvnodelist); xvp; xvp = nvp) { 40290098Smckusick /* 40390098Smckusick * Make sure this vnode wasn't reclaimed in getnewvnode(). 40490098Smckusick * Start over if it has (it won't be on the list anymore). 40590098Smckusick */ 40690098Smckusick if (xvp->v_mount != mp) 40790098Smckusick goto loop; 40890098Smckusick nvp = TAILQ_NEXT(xvp, v_nmntvnodes); 40990098Smckusick mtx_unlock(&mntvnode_mtx); 410103945Sjeff mp_fixme("Unlocked GETATTR."); 411103945Sjeff if (vrefcnt(xvp) == 0 || xvp->v_type == VNON || 412104698Smckusick (VTOI(xvp)->i_flags & SF_SNAPSHOT) || 41390098Smckusick (VOP_GETATTR(xvp, &vat, td->td_proc->p_ucred, td) == 0 && 41490098Smckusick vat.va_nlink > 0)) { 41590098Smckusick mtx_lock(&mntvnode_mtx); 41690098Smckusick continue; 41790098Smckusick } 41890098Smckusick if (snapdebug) 41990098Smckusick vprint("ffs_snapshot: busy vnode", xvp); 420104688Sjeff if (vn_lock(xvp, LK_EXCLUSIVE, td) != 0) 42190098Smckusick goto loop; 42290098Smckusick xp = VTOI(xvp); 42390098Smckusick /* 42490098Smckusick * If there is a fragment, clear it here. 42590098Smckusick */ 42690098Smckusick blkno = 0; 42790098Smckusick loc = howmany(xp->i_size, fs->fs_bsize) - 1; 42890098Smckusick if (loc < NDADDR) { 42990098Smckusick len = fragroundup(fs, blkoff(fs, xp->i_size)); 43090098Smckusick if (len < fs->fs_bsize) { 43198542Smckusick ffs_blkfree(copy_fs, vp, DIP(xp, i_db[loc]), 43298542Smckusick len, xp->i_number); 43398542Smckusick blkno = DIP(xp, i_db[loc]); 43498542Smckusick DIP(xp, i_db[loc]) = 0; 43590098Smckusick } 43690098Smckusick } 43798542Smckusick if (xp->i_ump->um_fstype == UFS1) 43898542Smckusick error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, 43998542Smckusick BLK_NOCOPY); 44098542Smckusick else 44198542Smckusick error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, 44298542Smckusick BLK_NOCOPY); 44390098Smckusick if (blkno) 44498542Smckusick DIP(xp, i_db[loc]) = blkno; 44590098Smckusick if (!error) 44690098Smckusick error = ffs_freefile(copy_fs, vp, xp->i_number, 44790098Smckusick xp->i_mode); 44890098Smckusick VOP_UNLOCK(xvp, 0, td); 44990098Smckusick if (error) { 45090098Smckusick free(copy_fs->fs_csp, M_UFSMNT); 45190098Smckusick bawrite(sbp); 45290098Smckusick sbp = NULL; 45390098Smckusick goto out1; 45490098Smckusick } 45590098Smckusick mtx_lock(&mntvnode_mtx); 45690098Smckusick } 45790098Smckusick mtx_unlock(&mntvnode_mtx); 45890098Smckusick /* 459105191Smckusick * If there already exist snapshots on this filesystem, grab a 460105191Smckusick * reference to their shared lock. If this is the first snapshot 461105191Smckusick * on this filesystem, we need to allocate a lock for the snapshots 462105191Smckusick * to share. In either case, acquire the snapshot lock and give 463105191Smckusick * up our original private lock. 464105191Smckusick */ 465107414Smckusick VI_LOCK(devvp); 466107414Smckusick snaphead = &devvp->v_rdev->si_snapshots; 467105191Smckusick if ((xp = TAILQ_FIRST(snaphead)) != NULL) { 468105191Smckusick VI_LOCK(vp); 469105191Smckusick vp->v_vnlock = ITOV(xp)->v_vnlock; 470107414Smckusick VI_UNLOCK(devvp); 471105191Smckusick } else { 472105191Smckusick struct lock *lkp; 473105191Smckusick 474107414Smckusick VI_UNLOCK(devvp); 475105191Smckusick MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT, 476105191Smckusick M_WAITOK); 477105191Smckusick lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT, 478105191Smckusick LK_CANRECURSE | LK_NOPAUSE); 479105191Smckusick VI_LOCK(vp); 480105191Smckusick vp->v_vnlock = lkp; 481105191Smckusick } 482105191Smckusick vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td); 483107414Smckusick transferlockers(&vp->v_lock, vp->v_vnlock); 484107414Smckusick lockmgr(&vp->v_lock, LK_RELEASE, NULL, td); 485105191Smckusick /* 48662976Smckusick * Record snapshot inode. Since this is the newest snapshot, 48762976Smckusick * it must be placed at the end of the list. 48862976Smckusick */ 489107414Smckusick VI_LOCK(devvp); 49062976Smckusick fs->fs_snapinum[snaploc] = ip->i_number; 49173942Smckusick if (ip->i_nextsnap.tqe_prev != 0) 49262976Smckusick panic("ffs_snapshot: %d already on list", ip->i_number); 49373942Smckusick TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap); 494107414Smckusick devvp->v_rdev->si_copyonwrite = ffs_copyonwrite; 495107414Smckusick devvp->v_vflag |= VV_COPYONWRITE; 496107414Smckusick VI_UNLOCK(devvp); 497101308Sjeff ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp"); 498101308Sjeff vp->v_vflag |= VV_SYSTEM; 49987827Smckusickout1: 50062976Smckusick /* 50162976Smckusick * Resume operation on filesystem. 50262976Smckusick */ 50362976Smckusick vfs_write_resume(vp->v_mount); 50462985Smckusick vn_start_write(NULL, &wrtmp, V_WAIT); 50587827Smckusick if (collectsnapstats && starttime.tv_sec > 0) { 50687827Smckusick nanotime(&endtime); 50787827Smckusick timespecsub(&endtime, &starttime); 508106965Speter printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", 509106965Speter vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, 51087827Smckusick endtime.tv_nsec / 1000000, redo, fs->fs_ncg); 51187827Smckusick } 51290098Smckusick if (sbp == NULL) 51390098Smckusick goto out; 51490098Smckusick /* 51590098Smckusick * Copy allocation information from all the snapshots in 51690098Smckusick * this snapshot and then expunge them from its view. 51790098Smckusick */ 518107414Smckusick snaphead = &devvp->v_rdev->si_snapshots; 51990098Smckusick TAILQ_FOREACH(xp, snaphead, i_nextsnap) { 52090098Smckusick if (xp == ip) 52190098Smckusick break; 52298542Smckusick if (xp->i_ump->um_fstype == UFS1) 52398542Smckusick error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, 52498542Smckusick BLK_SNAP); 52598542Smckusick else 52698542Smckusick error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, 52798542Smckusick BLK_SNAP); 52898542Smckusick if (error) { 52990098Smckusick fs->fs_snapinum[snaploc] = 0; 53090098Smckusick goto done; 53187827Smckusick } 53290098Smckusick } 53390098Smckusick /* 534104698Smckusick * Allocate the space for the list of preallocated snapshot blocks. 535104698Smckusick */ 536107848Smckusick snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + 537107848Smckusick FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; 538107848Smckusick MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t), 539104698Smckusick M_UFSMNT, M_WAITOK); 540107915Smckusick ip->i_snapblklist = &snapblklist[1]; 541104698Smckusick /* 54290098Smckusick * Expunge the blocks used by the snapshots from the set of 543104698Smckusick * blocks marked as used in the snapshot bitmaps. Also, collect 544107915Smckusick * the list of allocated blocks in i_snapblklist. 54590098Smckusick */ 54698542Smckusick if (ip->i_ump->um_fstype == UFS1) 54798542Smckusick error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP); 54898542Smckusick else 54998542Smckusick error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP); 55098542Smckusick if (error) { 55190098Smckusick fs->fs_snapinum[snaploc] = 0; 552107848Smckusick FREE(snapblklist, M_UFSMNT); 55390098Smckusick goto done; 55490098Smckusick } 555107915Smckusick snaplistsize = ip->i_snapblklist - snapblklist; 556107848Smckusick snapblklist[0] = snaplistsize; 557107915Smckusick ip->i_snapblklist = 0; 55890098Smckusick /* 559104698Smckusick * Write out the list of allocated blocks to the end of the snapshot. 560104698Smckusick */ 561104698Smckusick auio.uio_iov = &aiov; 562104698Smckusick auio.uio_iovcnt = 1; 563107848Smckusick aiov.iov_base = (void *)snapblklist; 564107848Smckusick aiov.iov_len = snaplistsize * sizeof(daddr_t); 565104698Smckusick auio.uio_resid = aiov.iov_len;; 566104698Smckusick auio.uio_offset = ip->i_size; 567104698Smckusick auio.uio_segflg = UIO_SYSSPACE; 568104698Smckusick auio.uio_rw = UIO_WRITE; 569104698Smckusick auio.uio_td = td; 570104698Smckusick if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { 571104698Smckusick fs->fs_snapinum[snaploc] = 0; 572107848Smckusick FREE(snapblklist, M_UFSMNT); 573104698Smckusick goto done; 574104698Smckusick } 575104698Smckusick /* 57690098Smckusick * Write the superblock and its summary information 57790098Smckusick * to the snapshot. 57890098Smckusick */ 57990098Smckusick blkno = fragstoblks(fs, fs->fs_csaddr); 58090098Smckusick len = howmany(fs->fs_cssize, fs->fs_bsize); 58190098Smckusick space = copy_fs->fs_csp; 58290098Smckusick for (loc = 0; loc < len; loc++) { 58390098Smckusick error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); 58490098Smckusick if (error) { 58590098Smckusick brelse(nbp); 58690098Smckusick fs->fs_snapinum[snaploc] = 0; 587107848Smckusick FREE(snapblklist, M_UFSMNT); 58890098Smckusick goto done; 58976269Smckusick } 59090098Smckusick bcopy(space, nbp->b_data, fs->fs_bsize); 59190098Smckusick space = (char *)space + fs->fs_bsize; 59290098Smckusick bawrite(nbp); 59376269Smckusick } 594107848Smckusick /* 595107848Smckusick * As this is the newest list, it is the most inclusive, so 596107848Smckusick * should replace the previous list. 597107848Smckusick */ 598107848Smckusick VI_LOCK(devvp); 599107848Smckusick FREE(devvp->v_rdev->si_snapblklist, M_UFSMNT); 600107848Smckusick devvp->v_rdev->si_snapblklist = snapblklist; 601107848Smckusick devvp->v_rdev->si_snaplistsize = snaplistsize; 602107848Smckusick VI_UNLOCK(devvp); 60390098Smckusickdone: 60490098Smckusick free(copy_fs->fs_csp, M_UFSMNT); 60590098Smckusick bawrite(sbp); 60662976Smckusickout: 607105667Smckusick if (saved_nice > 0) 608105667Smckusick td->td_ksegrp->kg_nice = saved_nice; 60987827Smckusick if (fs->fs_active != 0) { 61087827Smckusick FREE(fs->fs_active, M_DEVBUF); 61187827Smckusick fs->fs_active = 0; 61287827Smckusick } 61362976Smckusick mp->mnt_flag = flag; 61476269Smckusick if (error) 61583366Sjulian (void) UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td); 61683366Sjulian (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 61762976Smckusick if (error) 61862976Smckusick vput(vp); 61962976Smckusick else 62083366Sjulian VOP_UNLOCK(vp, 0, td); 62162976Smckusick vn_finished_write(wrtmp); 62262976Smckusick return (error); 62362976Smckusick} 62462976Smckusick 62562976Smckusick/* 62687827Smckusick * Copy a cylinder group map. All the unallocated blocks are marked 62787827Smckusick * BLK_NOCOPY so that the snapshot knows that it need not copy them 62892363Smckusick * if they are later written. If passno is one, then this is a first 62992363Smckusick * pass, so only setting needs to be done. If passno is 2, then this 63087827Smckusick * is a revision to a previous pass which must be undone as the 63187827Smckusick * replacement pass is done. 63287827Smckusick */ 63387827Smckusickstatic int 63487827Smckusickcgaccount(cg, vp, nbp, passno) 63587827Smckusick int cg; 63687827Smckusick struct vnode *vp; 63787827Smckusick struct buf *nbp; 63887827Smckusick int passno; 63987827Smckusick{ 64087827Smckusick struct buf *bp, *ibp; 64187827Smckusick struct inode *ip; 64287827Smckusick struct cg *cgp; 64387827Smckusick struct fs *fs; 64498542Smckusick ufs2_daddr_t base, numblks; 64598542Smckusick int error, len, loc, indiroff; 64687827Smckusick 64787827Smckusick ip = VTOI(vp); 64887827Smckusick fs = ip->i_fs; 64987827Smckusick error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), 65087827Smckusick (int)fs->fs_cgsize, KERNCRED, &bp); 65187827Smckusick if (error) { 65287827Smckusick brelse(bp); 65387827Smckusick return (error); 65487827Smckusick } 65587827Smckusick cgp = (struct cg *)bp->b_data; 65687827Smckusick if (!cg_chkmagic(cgp)) { 65787827Smckusick brelse(bp); 65887827Smckusick return (EIO); 65987827Smckusick } 66088138Smckusick atomic_set_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg)); 66187827Smckusick bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize); 66287827Smckusick if (fs->fs_cgsize < fs->fs_bsize) 66387827Smckusick bzero(&nbp->b_data[fs->fs_cgsize], 66487827Smckusick fs->fs_bsize - fs->fs_cgsize); 66587827Smckusick if (passno == 2) 66687827Smckusick nbp->b_flags |= B_VALIDSUSPWRT; 66787827Smckusick numblks = howmany(fs->fs_size, fs->fs_frag); 66887827Smckusick len = howmany(fs->fs_fpg, fs->fs_frag); 66987827Smckusick base = cg * fs->fs_fpg / fs->fs_frag; 67087827Smckusick if (base + len >= numblks) 67187827Smckusick len = numblks - base - 1; 67287827Smckusick loc = 0; 67387827Smckusick if (base < NDADDR) { 67487827Smckusick for ( ; loc < NDADDR; loc++) { 67587827Smckusick if (ffs_isblock(fs, cg_blksfree(cgp), loc)) 67698542Smckusick DIP(ip, i_db[loc]) = BLK_NOCOPY; 67798542Smckusick else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY) 67898542Smckusick DIP(ip, i_db[loc]) = 0; 67998542Smckusick else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY) 68087827Smckusick panic("ffs_snapshot: lost direct block"); 68187827Smckusick } 68287827Smckusick } 68387827Smckusick error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), 68498658Sdillon fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 68587827Smckusick if (error) { 68687827Smckusick brelse(bp); 68787827Smckusick return (error); 68887827Smckusick } 68987827Smckusick indiroff = (base + loc - NDADDR) % NINDIR(fs); 69087827Smckusick for ( ; loc < len; loc++, indiroff++) { 69187827Smckusick if (indiroff >= NINDIR(fs)) { 69287827Smckusick if (passno == 2) 69387827Smckusick ibp->b_flags |= B_VALIDSUSPWRT; 69487827Smckusick bawrite(ibp); 69587827Smckusick error = UFS_BALLOC(vp, 69687827Smckusick lblktosize(fs, (off_t)(base + loc)), 69798658Sdillon fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 69887827Smckusick if (error) { 69987827Smckusick brelse(bp); 70087827Smckusick return (error); 70187827Smckusick } 70287827Smckusick indiroff = 0; 70387827Smckusick } 70498542Smckusick if (ip->i_ump->um_fstype == UFS1) { 70598542Smckusick if (ffs_isblock(fs, cg_blksfree(cgp), loc)) 70698542Smckusick ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 70798542Smckusick BLK_NOCOPY; 70898542Smckusick else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data)) 70998542Smckusick [indiroff] == BLK_NOCOPY) 71098542Smckusick ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0; 71198542Smckusick else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data)) 71298542Smckusick [indiroff] == BLK_NOCOPY) 71398542Smckusick panic("ffs_snapshot: lost indirect block"); 71498542Smckusick continue; 71598542Smckusick } 71687827Smckusick if (ffs_isblock(fs, cg_blksfree(cgp), loc)) 71798542Smckusick ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; 71887827Smckusick else if (passno == 2 && 71998542Smckusick ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) 72098542Smckusick ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0; 72187827Smckusick else if (passno == 1 && 72298542Smckusick ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) 72387827Smckusick panic("ffs_snapshot: lost indirect block"); 72487827Smckusick } 72587827Smckusick bqrelse(bp); 72687827Smckusick if (passno == 2) 72787827Smckusick ibp->b_flags |= B_VALIDSUSPWRT; 72887827Smckusick bdwrite(ibp); 72987827Smckusick return (0); 73087827Smckusick} 73187827Smckusick 73287827Smckusick/* 73376269Smckusick * Before expunging a snapshot inode, note all the 73476269Smckusick * blocks that it claims with BLK_SNAP so that fsck will 73576269Smckusick * be able to account for those blocks properly and so 73676269Smckusick * that this snapshot knows that it need not copy them 73798542Smckusick * if the other snapshot holding them is freed. This code 73898542Smckusick * is reproduced once each for UFS1 and UFS2. 73976269Smckusick */ 74076269Smckusickstatic int 74198542Smckusickexpunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype) 74290098Smckusick struct vnode *snapvp; 74390098Smckusick struct inode *cancelip; 74476269Smckusick struct fs *fs; 74598542Smckusick int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 74698542Smckusick struct fs *, ufs_lbn_t, int); 74790098Smckusick int expungetype; 74876269Smckusick{ 74998542Smckusick int i, error, indiroff; 75098542Smckusick ufs_lbn_t lbn, rlbn; 75198542Smckusick ufs2_daddr_t len, blkno, numblks, blksperindir; 75298542Smckusick struct ufs1_dinode *dip; 75390098Smckusick struct thread *td = curthread; 75476269Smckusick struct buf *bp; 75576269Smckusick 75676269Smckusick /* 75790098Smckusick * Prepare to expunge the inode. If its inode block has not 75890098Smckusick * yet been copied, then allocate and fill the copy. 75976269Smckusick */ 76090098Smckusick lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 76190098Smckusick blkno = 0; 76290098Smckusick if (lbn < NDADDR) { 763107558Smckusick blkno = VTOI(snapvp)->i_din1->di_db[lbn]; 76490098Smckusick } else { 76590098Smckusick td->td_proc->p_flag |= P_COWINPROGRESS; 76690098Smckusick error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 76798658Sdillon fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); 76890098Smckusick td->td_proc->p_flag &= ~P_COWINPROGRESS; 76990098Smckusick if (error) 77090098Smckusick return (error); 77190098Smckusick indiroff = (lbn - NDADDR) % NINDIR(fs); 77298542Smckusick blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff]; 77390098Smckusick bqrelse(bp); 77490098Smckusick } 775107558Smckusick if (blkno != 0) { 776107558Smckusick if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) 777107558Smckusick return (error); 778107558Smckusick } else { 779107558Smckusick error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 780107558Smckusick fs->fs_bsize, KERNCRED, 0, &bp); 781107558Smckusick if (error) 782107558Smckusick return (error); 783107558Smckusick if ((error = readblock(bp, lbn)) != 0) 784107558Smckusick return (error); 785107558Smckusick } 78690098Smckusick /* 78790098Smckusick * Set a snapshot inode to be a zero length file, regular files 78890098Smckusick * to be completely unallocated. 78990098Smckusick */ 79098542Smckusick dip = (struct ufs1_dinode *)bp->b_data + 79198542Smckusick ino_to_fsbo(fs, cancelip->i_number); 79290098Smckusick if (expungetype == BLK_NOCOPY) 79390098Smckusick dip->di_mode = 0; 79476269Smckusick dip->di_size = 0; 79576269Smckusick dip->di_blocks = 0; 79676269Smckusick dip->di_flags &= ~SF_SNAPSHOT; 79798542Smckusick bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t)); 79876269Smckusick bdwrite(bp); 799107848Smckusick /* 800107848Smckusick * Now go through and expunge all the blocks in the file 801107848Smckusick * using the function requested. 802107848Smckusick */ 803107848Smckusick numblks = howmany(cancelip->i_size, fs->fs_bsize); 804107848Smckusick if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0], 805107848Smckusick &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype))) 806107848Smckusick return (error); 807107848Smckusick if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0], 808107848Smckusick &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype))) 809107848Smckusick return (error); 810107848Smckusick blksperindir = 1; 811107848Smckusick lbn = -NDADDR; 812107848Smckusick len = numblks - NDADDR; 813107848Smckusick rlbn = NDADDR; 814107848Smckusick for (i = 0; len > 0 && i < NIADDR; i++) { 815107848Smckusick error = indiracct_ufs1(snapvp, ITOV(cancelip), i, 816107848Smckusick cancelip->i_din1->di_ib[i], lbn, rlbn, len, 817107848Smckusick blksperindir, fs, acctfunc, expungetype); 818107848Smckusick if (error) 819107848Smckusick return (error); 820107848Smckusick blksperindir *= NINDIR(fs); 821107848Smckusick lbn -= blksperindir + 1; 822107848Smckusick len -= blksperindir; 823107848Smckusick rlbn += blksperindir; 824107848Smckusick } 82576269Smckusick return (0); 82676269Smckusick} 82776269Smckusick 82876269Smckusick/* 82962976Smckusick * Descend an indirect block chain for vnode cancelvp accounting for all 83062976Smckusick * its indirect blocks in snapvp. 83162976Smckusick */ 83262976Smckusickstatic int 83398542Smckusickindiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, 83498542Smckusick blksperindir, fs, acctfunc, expungetype) 83562976Smckusick struct vnode *snapvp; 83662976Smckusick struct vnode *cancelvp; 83762976Smckusick int level; 83898542Smckusick ufs1_daddr_t blkno; 83998542Smckusick ufs_lbn_t lbn; 84098542Smckusick ufs_lbn_t rlbn; 84198542Smckusick ufs_lbn_t remblks; 84298542Smckusick ufs_lbn_t blksperindir; 84376269Smckusick struct fs *fs; 84498542Smckusick int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 84598542Smckusick struct fs *, ufs_lbn_t, int); 84690098Smckusick int expungetype; 84762976Smckusick{ 84898542Smckusick int error, num, i; 84998542Smckusick ufs_lbn_t subblksperindir; 85062976Smckusick struct indir indirs[NIADDR + 2]; 85198542Smckusick ufs1_daddr_t last, *bap; 85262976Smckusick struct buf *bp; 85362976Smckusick 85462976Smckusick if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 85562976Smckusick return (error); 85662976Smckusick if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2) 85762976Smckusick panic("indiracct: botched params"); 85862976Smckusick /* 85962976Smckusick * We have to expand bread here since it will deadlock looking 86062976Smckusick * up the block number for any blocks that are not in the cache. 86162976Smckusick */ 86262976Smckusick bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0); 86362976Smckusick bp->b_blkno = fsbtodb(fs, blkno); 86462976Smckusick if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && 86562976Smckusick (error = readblock(bp, fragstoblks(fs, blkno)))) { 86662976Smckusick brelse(bp); 86762976Smckusick return (error); 86862976Smckusick } 86962976Smckusick /* 87062976Smckusick * Account for the block pointers in this indirect block. 87162976Smckusick */ 87262976Smckusick last = howmany(remblks, blksperindir); 87362976Smckusick if (last > NINDIR(fs)) 87462976Smckusick last = NINDIR(fs); 87598542Smckusick MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); 87676269Smckusick bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 87776269Smckusick bqrelse(bp); 878107848Smckusick error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 879107848Smckusick level == 0 ? rlbn : -1, expungetype); 88062976Smckusick if (error || level == 0) 88162976Smckusick goto out; 88262976Smckusick /* 88362976Smckusick * Account for the block pointers in each of the indirect blocks 88462976Smckusick * in the levels below us. 88562976Smckusick */ 88662976Smckusick subblksperindir = blksperindir / NINDIR(fs); 88762976Smckusick for (lbn++, level--, i = 0; i < last; i++) { 88898542Smckusick error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn, 88990098Smckusick rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); 89062976Smckusick if (error) 89162976Smckusick goto out; 89262976Smckusick rlbn += blksperindir; 89362976Smckusick lbn -= blksperindir; 89462976Smckusick remblks -= blksperindir; 89562976Smckusick } 89662976Smckusickout: 89776269Smckusick FREE(bap, M_DEVBUF); 89862976Smckusick return (error); 89962976Smckusick} 90062976Smckusick 90162976Smckusick/* 90290098Smckusick * Do both snap accounting and map accounting. 90390098Smckusick */ 90490098Smckusickstatic int 90598542Smckusickfullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype) 90690098Smckusick struct vnode *vp; 90798542Smckusick ufs1_daddr_t *oldblkp, *lastblkp; 90890098Smckusick struct fs *fs; 90998542Smckusick ufs_lbn_t lblkno; 91098542Smckusick int exptype; /* BLK_SNAP or BLK_NOCOPY */ 91198542Smckusick{ 91298542Smckusick int error; 91398542Smckusick 91498542Smckusick if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 91598542Smckusick return (error); 91698542Smckusick return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 91798542Smckusick} 91898542Smckusick 91998542Smckusick/* 92098542Smckusick * Identify a set of blocks allocated in a snapshot inode. 92198542Smckusick */ 92298542Smckusickstatic int 92398542Smckusicksnapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 92498542Smckusick struct vnode *vp; 92598542Smckusick ufs1_daddr_t *oldblkp, *lastblkp; 92698542Smckusick struct fs *fs; 92798542Smckusick ufs_lbn_t lblkno; 92890098Smckusick int expungetype; /* BLK_SNAP or BLK_NOCOPY */ 92990098Smckusick{ 93098542Smckusick struct inode *ip = VTOI(vp); 93198542Smckusick ufs1_daddr_t blkno, *blkp; 93298542Smckusick ufs_lbn_t lbn; 93398542Smckusick struct buf *ibp; 93490098Smckusick int error; 93590098Smckusick 93698542Smckusick for ( ; oldblkp < lastblkp; oldblkp++) { 93798542Smckusick blkno = *oldblkp; 93898542Smckusick if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 93998542Smckusick continue; 94098542Smckusick lbn = fragstoblks(fs, blkno); 94198542Smckusick if (lbn < NDADDR) { 94298542Smckusick blkp = &ip->i_din1->di_db[lbn]; 94398542Smckusick ip->i_flag |= IN_CHANGE | IN_UPDATE; 94498542Smckusick } else { 94598542Smckusick error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 94698658Sdillon fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 94798542Smckusick if (error) 94898542Smckusick return (error); 94998542Smckusick blkp = &((ufs1_daddr_t *)(ibp->b_data)) 95098542Smckusick [(lbn - NDADDR) % NINDIR(fs)]; 95198542Smckusick } 95298542Smckusick /* 95398542Smckusick * If we are expunging a snapshot vnode and we 95498542Smckusick * find a block marked BLK_NOCOPY, then it is 95598542Smckusick * one that has been allocated to this snapshot after 95698542Smckusick * we took our current snapshot and can be ignored. 95798542Smckusick */ 95898542Smckusick if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { 95998542Smckusick if (lbn >= NDADDR) 96098542Smckusick brelse(ibp); 96198542Smckusick } else { 96298542Smckusick if (*blkp != 0) 96398542Smckusick panic("snapacct: bad block"); 96498542Smckusick *blkp = expungetype; 96598542Smckusick if (lbn >= NDADDR) 96698542Smckusick bdwrite(ibp); 96798542Smckusick } 96898542Smckusick } 96998542Smckusick return (0); 97098542Smckusick} 97198542Smckusick 97298542Smckusick/* 97398542Smckusick * Account for a set of blocks allocated in a snapshot inode. 97498542Smckusick */ 97598542Smckusickstatic int 97698542Smckusickmapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 97798542Smckusick struct vnode *vp; 97898542Smckusick ufs1_daddr_t *oldblkp, *lastblkp; 97998542Smckusick struct fs *fs; 98098542Smckusick ufs_lbn_t lblkno; 98198542Smckusick int expungetype; 98298542Smckusick{ 98398542Smckusick ufs1_daddr_t blkno; 984104698Smckusick struct inode *ip; 98598542Smckusick ino_t inum; 98698542Smckusick 987107848Smckusick /* 988107848Smckusick * We only care about the leaf block numbers, not the 989107848Smckusick * meta-block numbers. 990107848Smckusick */ 991107848Smckusick if (lblkno == -1) 992107848Smckusick return (0); 993104698Smckusick ip = VTOI(vp); 994104698Smckusick inum = ip->i_number; 99598542Smckusick for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { 99698542Smckusick blkno = *oldblkp; 99798542Smckusick if (blkno == 0 || blkno == BLK_NOCOPY) 99898542Smckusick continue; 999104698Smckusick if (expungetype == BLK_SNAP && blkno != BLK_SNAP) 1000107915Smckusick *ip->i_snapblklist++ = lblkno; 100198542Smckusick if (blkno == BLK_SNAP) 100298542Smckusick blkno = blkstofrags(fs, lblkno); 100398542Smckusick ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum); 100498542Smckusick } 100598542Smckusick return (0); 100698542Smckusick} 100798542Smckusick 100898542Smckusick/* 100998542Smckusick * Before expunging a snapshot inode, note all the 101098542Smckusick * blocks that it claims with BLK_SNAP so that fsck will 101198542Smckusick * be able to account for those blocks properly and so 101298542Smckusick * that this snapshot knows that it need not copy them 101398542Smckusick * if the other snapshot holding them is freed. This code 101498542Smckusick * is reproduced once each for UFS1 and UFS2. 101598542Smckusick */ 101698542Smckusickstatic int 101798542Smckusickexpunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype) 101898542Smckusick struct vnode *snapvp; 101998542Smckusick struct inode *cancelip; 102098542Smckusick struct fs *fs; 102198542Smckusick int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 102298542Smckusick struct fs *, ufs_lbn_t, int); 102398542Smckusick int expungetype; 102498542Smckusick{ 102598542Smckusick int i, error, indiroff; 102698542Smckusick ufs_lbn_t lbn, rlbn; 102798542Smckusick ufs2_daddr_t len, blkno, numblks, blksperindir; 102898542Smckusick struct ufs2_dinode *dip; 102998542Smckusick struct thread *td = curthread; 103098542Smckusick struct buf *bp; 103198542Smckusick 103298542Smckusick /* 103398542Smckusick * Prepare to expunge the inode. If its inode block has not 103498542Smckusick * yet been copied, then allocate and fill the copy. 103598542Smckusick */ 103698542Smckusick lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 103798542Smckusick blkno = 0; 103898542Smckusick if (lbn < NDADDR) { 1039107558Smckusick blkno = VTOI(snapvp)->i_din2->di_db[lbn]; 104098542Smckusick } else { 104198542Smckusick td->td_proc->p_flag |= P_COWINPROGRESS; 104298542Smckusick error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 104398658Sdillon fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); 104498542Smckusick td->td_proc->p_flag &= ~P_COWINPROGRESS; 104598542Smckusick if (error) 104698542Smckusick return (error); 104798542Smckusick indiroff = (lbn - NDADDR) % NINDIR(fs); 104898542Smckusick blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; 104998542Smckusick bqrelse(bp); 105098542Smckusick } 1051107558Smckusick if (blkno != 0) { 1052107558Smckusick if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) 1053107558Smckusick return (error); 1054107558Smckusick } else { 1055107558Smckusick error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 1056107558Smckusick fs->fs_bsize, KERNCRED, 0, &bp); 1057107558Smckusick if (error) 1058107558Smckusick return (error); 1059107558Smckusick if ((error = readblock(bp, lbn)) != 0) 1060107558Smckusick return (error); 1061107558Smckusick } 106298542Smckusick /* 106398542Smckusick * Set a snapshot inode to be a zero length file, regular files 106498542Smckusick * to be completely unallocated. 106598542Smckusick */ 106698542Smckusick dip = (struct ufs2_dinode *)bp->b_data + 106798542Smckusick ino_to_fsbo(fs, cancelip->i_number); 106898542Smckusick if (expungetype == BLK_NOCOPY) 106998542Smckusick dip->di_mode = 0; 107098542Smckusick dip->di_size = 0; 107198542Smckusick dip->di_blocks = 0; 107298542Smckusick dip->di_flags &= ~SF_SNAPSHOT; 107398542Smckusick bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t)); 107498542Smckusick bdwrite(bp); 1075107848Smckusick /* 1076107848Smckusick * Now go through and expunge all the blocks in the file 1077107848Smckusick * using the function requested. 1078107848Smckusick */ 1079107848Smckusick numblks = howmany(cancelip->i_size, fs->fs_bsize); 1080107848Smckusick if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0], 1081107848Smckusick &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype))) 1082107848Smckusick return (error); 1083107848Smckusick if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0], 1084107848Smckusick &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype))) 1085107848Smckusick return (error); 1086107848Smckusick blksperindir = 1; 1087107848Smckusick lbn = -NDADDR; 1088107848Smckusick len = numblks - NDADDR; 1089107848Smckusick rlbn = NDADDR; 1090107848Smckusick for (i = 0; len > 0 && i < NIADDR; i++) { 1091107848Smckusick error = indiracct_ufs2(snapvp, ITOV(cancelip), i, 1092107848Smckusick cancelip->i_din2->di_ib[i], lbn, rlbn, len, 1093107848Smckusick blksperindir, fs, acctfunc, expungetype); 1094107848Smckusick if (error) 1095107848Smckusick return (error); 1096107848Smckusick blksperindir *= NINDIR(fs); 1097107848Smckusick lbn -= blksperindir + 1; 1098107848Smckusick len -= blksperindir; 1099107848Smckusick rlbn += blksperindir; 1100107848Smckusick } 110198542Smckusick return (0); 110290098Smckusick} 110390098Smckusick 110490098Smckusick/* 110598542Smckusick * Descend an indirect block chain for vnode cancelvp accounting for all 110698542Smckusick * its indirect blocks in snapvp. 110798542Smckusick */ 110898542Smckusickstatic int 110998542Smckusickindiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, 111098542Smckusick blksperindir, fs, acctfunc, expungetype) 111198542Smckusick struct vnode *snapvp; 111298542Smckusick struct vnode *cancelvp; 111398542Smckusick int level; 111498542Smckusick ufs2_daddr_t blkno; 111598542Smckusick ufs_lbn_t lbn; 111698542Smckusick ufs_lbn_t rlbn; 111798542Smckusick ufs_lbn_t remblks; 111898542Smckusick ufs_lbn_t blksperindir; 111998542Smckusick struct fs *fs; 112098542Smckusick int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 112198542Smckusick struct fs *, ufs_lbn_t, int); 112298542Smckusick int expungetype; 112398542Smckusick{ 112498542Smckusick int error, num, i; 112598542Smckusick ufs_lbn_t subblksperindir; 112698542Smckusick struct indir indirs[NIADDR + 2]; 112798542Smckusick ufs2_daddr_t last, *bap; 112898542Smckusick struct buf *bp; 112998542Smckusick 113098542Smckusick if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 113198542Smckusick return (error); 113298542Smckusick if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2) 113398542Smckusick panic("indiracct: botched params"); 113498542Smckusick /* 113598542Smckusick * We have to expand bread here since it will deadlock looking 113698542Smckusick * up the block number for any blocks that are not in the cache. 113798542Smckusick */ 113898542Smckusick bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0); 113998542Smckusick bp->b_blkno = fsbtodb(fs, blkno); 114098542Smckusick if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && 114198542Smckusick (error = readblock(bp, fragstoblks(fs, blkno)))) { 114298542Smckusick brelse(bp); 114398542Smckusick return (error); 114498542Smckusick } 114598542Smckusick /* 114698542Smckusick * Account for the block pointers in this indirect block. 114798542Smckusick */ 114898542Smckusick last = howmany(remblks, blksperindir); 114998542Smckusick if (last > NINDIR(fs)) 115098542Smckusick last = NINDIR(fs); 115198542Smckusick MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); 115298542Smckusick bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 115398542Smckusick bqrelse(bp); 1154107848Smckusick error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 1155107848Smckusick level == 0 ? rlbn : -1, expungetype); 115698542Smckusick if (error || level == 0) 115798542Smckusick goto out; 115898542Smckusick /* 115998542Smckusick * Account for the block pointers in each of the indirect blocks 116098542Smckusick * in the levels below us. 116198542Smckusick */ 116298542Smckusick subblksperindir = blksperindir / NINDIR(fs); 116398542Smckusick for (lbn++, level--, i = 0; i < last; i++) { 116498542Smckusick error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn, 116598542Smckusick rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); 116698542Smckusick if (error) 116798542Smckusick goto out; 116898542Smckusick rlbn += blksperindir; 116998542Smckusick lbn -= blksperindir; 117098542Smckusick remblks -= blksperindir; 117198542Smckusick } 117298542Smckusickout: 117398542Smckusick FREE(bap, M_DEVBUF); 117498542Smckusick return (error); 117598542Smckusick} 117698542Smckusick 117798542Smckusick/* 117898542Smckusick * Do both snap accounting and map accounting. 117998542Smckusick */ 118098542Smckusickstatic int 118198542Smckusickfullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype) 118298542Smckusick struct vnode *vp; 118398542Smckusick ufs2_daddr_t *oldblkp, *lastblkp; 118498542Smckusick struct fs *fs; 118598542Smckusick ufs_lbn_t lblkno; 118698542Smckusick int exptype; /* BLK_SNAP or BLK_NOCOPY */ 118798542Smckusick{ 118898542Smckusick int error; 118998542Smckusick 119098542Smckusick if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 119198542Smckusick return (error); 119298542Smckusick return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 119398542Smckusick} 119498542Smckusick 119598542Smckusick/* 119687827Smckusick * Identify a set of blocks allocated in a snapshot inode. 119762976Smckusick */ 119862976Smckusickstatic int 119998542Smckusicksnapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 120062976Smckusick struct vnode *vp; 120198542Smckusick ufs2_daddr_t *oldblkp, *lastblkp; 120276269Smckusick struct fs *fs; 120398542Smckusick ufs_lbn_t lblkno; 120490098Smckusick int expungetype; /* BLK_SNAP or BLK_NOCOPY */ 120562976Smckusick{ 120662976Smckusick struct inode *ip = VTOI(vp); 120798542Smckusick ufs2_daddr_t blkno, *blkp; 120898542Smckusick ufs_lbn_t lbn; 120962976Smckusick struct buf *ibp; 121062976Smckusick int error; 121162976Smckusick 121262976Smckusick for ( ; oldblkp < lastblkp; oldblkp++) { 121362976Smckusick blkno = *oldblkp; 121462976Smckusick if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 121562976Smckusick continue; 121662976Smckusick lbn = fragstoblks(fs, blkno); 121762976Smckusick if (lbn < NDADDR) { 121898542Smckusick blkp = &ip->i_din2->di_db[lbn]; 121962976Smckusick ip->i_flag |= IN_CHANGE | IN_UPDATE; 122062976Smckusick } else { 122176132Sphk error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 122298658Sdillon fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 122362976Smckusick if (error) 122462976Smckusick return (error); 122598542Smckusick blkp = &((ufs2_daddr_t *)(ibp->b_data)) 122662976Smckusick [(lbn - NDADDR) % NINDIR(fs)]; 122762976Smckusick } 122887827Smckusick /* 122990098Smckusick * If we are expunging a snapshot vnode and we 123090098Smckusick * find a block marked BLK_NOCOPY, then it is 123187827Smckusick * one that has been allocated to this snapshot after 123287827Smckusick * we took our current snapshot and can be ignored. 123387827Smckusick */ 123490098Smckusick if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { 123587827Smckusick if (lbn >= NDADDR) 123687827Smckusick brelse(ibp); 123787827Smckusick } else { 123887827Smckusick if (*blkp != 0) 123987827Smckusick panic("snapacct: bad block"); 124090098Smckusick *blkp = expungetype; 124187827Smckusick if (lbn >= NDADDR) 124287827Smckusick bdwrite(ibp); 124363788Smckusick } 124462976Smckusick } 124562976Smckusick return (0); 124662976Smckusick} 124762976Smckusick 124862976Smckusick/* 124976269Smckusick * Account for a set of blocks allocated in a snapshot inode. 125076269Smckusick */ 125176269Smckusickstatic int 125298542Smckusickmapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 125376269Smckusick struct vnode *vp; 125498542Smckusick ufs2_daddr_t *oldblkp, *lastblkp; 125576269Smckusick struct fs *fs; 125698542Smckusick ufs_lbn_t lblkno; 125790098Smckusick int expungetype; 125876269Smckusick{ 125998542Smckusick ufs2_daddr_t blkno; 1260104698Smckusick struct inode *ip; 126190098Smckusick ino_t inum; 126276269Smckusick 1263107848Smckusick /* 1264107848Smckusick * We only care about the leaf block numbers, not the 1265107848Smckusick * meta-block numbers. 1266107848Smckusick */ 1267107848Smckusick if (lblkno == -1) 1268107848Smckusick return (0); 1269104698Smckusick ip = VTOI(vp); 1270104698Smckusick inum = ip->i_number; 127176269Smckusick for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { 127276269Smckusick blkno = *oldblkp; 127376269Smckusick if (blkno == 0 || blkno == BLK_NOCOPY) 127476269Smckusick continue; 1275104698Smckusick if (expungetype == BLK_SNAP && blkno != BLK_SNAP) 1276107915Smckusick *ip->i_snapblklist++ = lblkno; 127776269Smckusick if (blkno == BLK_SNAP) 127876269Smckusick blkno = blkstofrags(fs, lblkno); 127990098Smckusick ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum); 128076269Smckusick } 128176269Smckusick return (0); 128276269Smckusick} 128376269Smckusick 128476269Smckusick/* 128570183Smckusick * Decrement extra reference on snapshot when last name is removed. 128670183Smckusick * It will not be freed until the last open reference goes away. 128770183Smckusick */ 128870183Smckusickvoid 128970183Smckusickffs_snapgone(ip) 129070183Smckusick struct inode *ip; 129170183Smckusick{ 129270183Smckusick struct inode *xp; 129374547Smckusick struct fs *fs; 129474547Smckusick int snaploc; 129570183Smckusick 129670183Smckusick /* 129770183Smckusick * Find snapshot in incore list. 129870183Smckusick */ 129973942Smckusick TAILQ_FOREACH(xp, &ip->i_devvp->v_rdev->si_snapshots, i_nextsnap) 130073942Smckusick if (xp == ip) 130170183Smckusick break; 1302107848Smckusick if (xp != NULL) 1303107848Smckusick vrele(ITOV(ip)); 1304107848Smckusick else if (snapdebug) 130570183Smckusick printf("ffs_snapgone: lost snapshot vnode %d\n", 130670183Smckusick ip->i_number); 130774547Smckusick /* 130874547Smckusick * Delete snapshot inode from superblock. Keep list dense. 130974547Smckusick */ 131074547Smckusick fs = ip->i_fs; 131174547Smckusick for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 131274547Smckusick if (fs->fs_snapinum[snaploc] == ip->i_number) 131374547Smckusick break; 131474547Smckusick if (snaploc < FSMAXSNAP) { 131574547Smckusick for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { 131674547Smckusick if (fs->fs_snapinum[snaploc] == 0) 131774547Smckusick break; 131874547Smckusick fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; 131974547Smckusick } 132074547Smckusick fs->fs_snapinum[snaploc - 1] = 0; 132174547Smckusick } 132270183Smckusick} 132370183Smckusick 132470183Smckusick/* 132562976Smckusick * Prepare a snapshot file for being removed. 132662976Smckusick */ 132762976Smckusickvoid 132862976Smckusickffs_snapremove(vp) 132962976Smckusick struct vnode *vp; 133062976Smckusick{ 133173942Smckusick struct inode *ip; 133262976Smckusick struct vnode *devvp; 1333105191Smckusick struct lock *lkp; 133462976Smckusick struct buf *ibp; 133562976Smckusick struct fs *fs; 1336105191Smckusick struct thread *td = curthread; 1337107848Smckusick ufs2_daddr_t numblks, blkno, dblk, *snapblklist; 133898542Smckusick int error, loc, last; 133962976Smckusick 134062976Smckusick ip = VTOI(vp); 134162976Smckusick fs = ip->i_fs; 1342107414Smckusick devvp = ip->i_devvp; 134362976Smckusick /* 134475943Smckusick * If active, delete from incore list (this snapshot may 134575943Smckusick * already have been in the process of being deleted, so 134675943Smckusick * would not have been active). 134775943Smckusick * 134862976Smckusick * Clear copy-on-write flag if last snapshot. 134962976Smckusick */ 135075943Smckusick if (ip->i_nextsnap.tqe_prev != 0) { 1351107414Smckusick VI_LOCK(devvp); 1352107414Smckusick lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE, 1353107414Smckusick VI_MTX(devvp), td); 1354107414Smckusick VI_LOCK(devvp); 1355107414Smckusick TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap); 1356107414Smckusick ip->i_nextsnap.tqe_prev = 0; 1357105191Smckusick lkp = vp->v_vnlock; 1358105191Smckusick vp->v_vnlock = &vp->v_lock; 1359107414Smckusick lockmgr(lkp, LK_RELEASE, NULL, td); 1360107414Smckusick if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) != 0) { 1361107414Smckusick VI_UNLOCK(devvp); 1362107414Smckusick } else { 1363107848Smckusick snapblklist = devvp->v_rdev->si_snapblklist; 1364107848Smckusick devvp->v_rdev->si_snapblklist = 0; 1365107848Smckusick devvp->v_rdev->si_snaplistsize = 0; 1366107414Smckusick devvp->v_rdev->si_copyonwrite = 0; 1367107414Smckusick devvp->v_vflag &= ~VV_COPYONWRITE; 1368107414Smckusick lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td); 1369107414Smckusick lockmgr(lkp, LK_RELEASE, NULL, td); 1370105191Smckusick lockdestroy(lkp); 1371105191Smckusick FREE(lkp, M_UFSMNT); 1372107848Smckusick FREE(snapblklist, M_UFSMNT); 137373942Smckusick } 137473942Smckusick } 137562976Smckusick /* 137662976Smckusick * Clear all BLK_NOCOPY fields. Pass any block claims to other 137762976Smckusick * snapshots that want them (see ffs_snapblkfree below). 137862976Smckusick */ 137962976Smckusick for (blkno = 1; blkno < NDADDR; blkno++) { 138098542Smckusick dblk = DIP(ip, i_db[blkno]); 138176356Smckusick if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 138298542Smckusick DIP(ip, i_db[blkno]) = 0; 138376356Smckusick else if ((dblk == blkstofrags(fs, blkno) && 138490098Smckusick ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, 138590098Smckusick ip->i_number))) { 138698542Smckusick DIP(ip, i_blocks) -= btodb(fs->fs_bsize); 138798542Smckusick DIP(ip, i_db[blkno]) = 0; 138876356Smckusick } 138962976Smckusick } 139076356Smckusick numblks = howmany(ip->i_size, fs->fs_bsize); 139176356Smckusick for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 139276132Sphk error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), 139398658Sdillon fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 139462976Smckusick if (error) 139562976Smckusick continue; 139698542Smckusick if (fs->fs_size - blkno > NINDIR(fs)) 139762976Smckusick last = NINDIR(fs); 139898542Smckusick else 139998542Smckusick last = fs->fs_size - blkno; 140062976Smckusick for (loc = 0; loc < last; loc++) { 140198542Smckusick if (ip->i_ump->um_fstype == UFS1) { 140298542Smckusick dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc]; 140398542Smckusick if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 140498542Smckusick ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; 140598542Smckusick else if ((dblk == blkstofrags(fs, blkno) && 140698542Smckusick ffs_snapblkfree(fs, ip->i_devvp, dblk, 140798542Smckusick fs->fs_bsize, ip->i_number))) { 140898542Smckusick ip->i_din1->di_blocks -= 140998542Smckusick btodb(fs->fs_bsize); 141098542Smckusick ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; 141198542Smckusick } 141298542Smckusick continue; 141398542Smckusick } 141498542Smckusick dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc]; 141576356Smckusick if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 141698542Smckusick ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; 141776356Smckusick else if ((dblk == blkstofrags(fs, blkno) && 141890098Smckusick ffs_snapblkfree(fs, ip->i_devvp, dblk, 141990098Smckusick fs->fs_bsize, ip->i_number))) { 142098542Smckusick ip->i_din2->di_blocks -= btodb(fs->fs_bsize); 142198542Smckusick ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; 142276356Smckusick } 142362976Smckusick } 142462976Smckusick bawrite(ibp); 142562976Smckusick } 142662976Smckusick /* 142762976Smckusick * Clear snapshot flag and drop reference. 142862976Smckusick */ 142963897Smckusick ip->i_flags &= ~SF_SNAPSHOT; 143098542Smckusick DIP(ip, i_flags) = ip->i_flags; 143162976Smckusick ip->i_flag |= IN_CHANGE | IN_UPDATE; 143262976Smckusick} 143362976Smckusick 143462976Smckusick/* 143562976Smckusick * Notification that a block is being freed. Return zero if the free 143662976Smckusick * should be allowed to proceed. Return non-zero if the snapshot file 143762976Smckusick * wants to claim the block. The block will be claimed if it is an 143862976Smckusick * uncopied part of one of the snapshots. It will be freed if it is 143962976Smckusick * either a BLK_NOCOPY or has already been copied in all of the snapshots. 144062976Smckusick * If a fragment is being freed, then all snapshots that care about 144162976Smckusick * it must make a copy since a snapshot file can only claim full sized 144262976Smckusick * blocks. Note that if more than one snapshot file maps the block, 144362976Smckusick * we can pick one at random to claim it. Since none of the snapshots 144462976Smckusick * can change, we are assurred that they will all see the same unmodified 144562976Smckusick * image. When deleting a snapshot file (see ffs_snapremove above), we 144662976Smckusick * must push any of these claimed blocks to one of the other snapshots 144762976Smckusick * that maps it. These claimed blocks are easily identified as they will 144862976Smckusick * have a block number equal to their logical block number within the 144962976Smckusick * snapshot. A copied block can never have this property because they 145062976Smckusick * must always have been allocated from a BLK_NOCOPY location. 145162976Smckusick */ 145262976Smckusickint 145390098Smckusickffs_snapblkfree(fs, devvp, bno, size, inum) 145490098Smckusick struct fs *fs; 145590098Smckusick struct vnode *devvp; 145698542Smckusick ufs2_daddr_t bno; 145762976Smckusick long size; 145890098Smckusick ino_t inum; 145962976Smckusick{ 146062976Smckusick struct buf *ibp, *cbp, *savedcbp = 0; 146183366Sjulian struct thread *td = curthread; 146262976Smckusick struct inode *ip; 1463107414Smckusick struct vnode *vp = NULL; 146498542Smckusick ufs_lbn_t lbn; 146598542Smckusick ufs2_daddr_t blkno; 1466107414Smckusick int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0; 146773942Smckusick struct snaphead *snaphead; 146862976Smckusick 146962976Smckusick lbn = fragstoblks(fs, bno); 1470107414Smckusickretry: 1471107414Smckusick VI_LOCK(devvp); 147290098Smckusick snaphead = &devvp->v_rdev->si_snapshots; 147373942Smckusick TAILQ_FOREACH(ip, snaphead, i_nextsnap) { 147462976Smckusick vp = ITOV(ip); 147562976Smckusick /* 147662976Smckusick * Lookup block being written. 147762976Smckusick */ 147862976Smckusick if (lbn < NDADDR) { 147998542Smckusick blkno = DIP(ip, i_db[lbn]); 148062976Smckusick } else { 1481107414Smckusick if (snapshot_locked == 0 && 1482107848Smckusick lockmgr(vp->v_vnlock, 1483107414Smckusick LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 1484107414Smckusick VI_MTX(devvp), td) != 0) 1485107414Smckusick goto retry; 1486107848Smckusick snapshot_locked = 1; 148783366Sjulian td->td_proc->p_flag |= P_COWINPROGRESS; 148876132Sphk error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 148998658Sdillon fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 149083366Sjulian td->td_proc->p_flag &= ~P_COWINPROGRESS; 149162976Smckusick if (error) 149262976Smckusick break; 149362976Smckusick indiroff = (lbn - NDADDR) % NINDIR(fs); 149498542Smckusick if (ip->i_ump->um_fstype == UFS1) 149598542Smckusick blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; 149698542Smckusick else 149798542Smckusick blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; 149862976Smckusick } 149962976Smckusick /* 150062976Smckusick * Check to see if block needs to be copied. 150162976Smckusick */ 150298542Smckusick if (blkno == 0) { 150398542Smckusick /* 150498542Smckusick * A block that we map is being freed. If it has not 150598542Smckusick * been claimed yet, we will claim or copy it (below). 150698542Smckusick */ 150798542Smckusick claimedblk = 1; 150898542Smckusick } else if (blkno == BLK_SNAP) { 150998542Smckusick /* 151098542Smckusick * No previous snapshot claimed the block, 1511107414Smckusick * so it will be freed and become a BLK_NOCOPY 151298542Smckusick * (don't care) for us. 151398542Smckusick */ 151462976Smckusick if (claimedblk) 151562976Smckusick panic("snapblkfree: inconsistent block type"); 1516107414Smckusick if (snapshot_locked == 0 && 1517107414Smckusick lockmgr(vp->v_vnlock, 1518107414Smckusick LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT, 1519107414Smckusick VI_MTX(devvp), td) != 0) { 1520107414Smckusick if (lbn >= NDADDR) 1521107414Smckusick bqrelse(ibp); 1522107414Smckusick vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td); 1523107414Smckusick goto retry; 1524107414Smckusick } 1525107414Smckusick snapshot_locked = 1; 152662976Smckusick if (lbn < NDADDR) { 152798542Smckusick DIP(ip, i_db[lbn]) = BLK_NOCOPY; 152862976Smckusick ip->i_flag |= IN_CHANGE | IN_UPDATE; 152998542Smckusick } else if (ip->i_ump->um_fstype == UFS1) { 153098542Smckusick ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 153198542Smckusick BLK_NOCOPY; 153298542Smckusick bdwrite(ibp); 153362976Smckusick } else { 153498542Smckusick ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 153562976Smckusick BLK_NOCOPY; 153662976Smckusick bdwrite(ibp); 153762976Smckusick } 153862976Smckusick continue; 153998542Smckusick } else /* BLK_NOCOPY or default */ { 154098542Smckusick /* 154198542Smckusick * If the snapshot has already copied the block 154298542Smckusick * (default), or does not care about the block, 154398542Smckusick * it is not needed. 154498542Smckusick */ 154598542Smckusick if (lbn >= NDADDR) 154698542Smckusick bqrelse(ibp); 154798542Smckusick continue; 154862976Smckusick } 154962976Smckusick /* 155062976Smckusick * If this is a full size block, we will just grab it 155162976Smckusick * and assign it to the snapshot inode. Otherwise we 155262976Smckusick * will proceed to copy it. See explanation for this 155362976Smckusick * routine as to why only a single snapshot needs to 155462976Smckusick * claim this block. 155562976Smckusick */ 1556107414Smckusick if (snapshot_locked == 0 && 1557107414Smckusick lockmgr(vp->v_vnlock, 1558107414Smckusick LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT, 1559107414Smckusick VI_MTX(devvp), td) != 0) { 1560107414Smckusick if (lbn >= NDADDR) 1561107414Smckusick bqrelse(ibp); 1562107414Smckusick vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td); 1563107414Smckusick goto retry; 1564107414Smckusick } 1565107414Smckusick snapshot_locked = 1; 156662976Smckusick if (size == fs->fs_bsize) { 156762976Smckusick#ifdef DEBUG 156862976Smckusick if (snapdebug) 156998687Smux printf("%s %d lbn %jd from inum %d\n", 157098542Smckusick "Grabonremove: snapino", ip->i_number, 157198542Smckusick (intmax_t)lbn, inum); 157262976Smckusick#endif 157362976Smckusick if (lbn < NDADDR) { 157498542Smckusick DIP(ip, i_db[lbn]) = bno; 157598542Smckusick } else if (ip->i_ump->um_fstype == UFS1) { 157698542Smckusick ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno; 157798542Smckusick bdwrite(ibp); 157862976Smckusick } else { 157998542Smckusick ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno; 158062976Smckusick bdwrite(ibp); 158162976Smckusick } 158298542Smckusick DIP(ip, i_blocks) += btodb(size); 158362976Smckusick ip->i_flag |= IN_CHANGE | IN_UPDATE; 158483366Sjulian VOP_UNLOCK(vp, 0, td); 158562976Smckusick return (1); 158662976Smckusick } 158762976Smckusick if (lbn >= NDADDR) 158863788Smckusick bqrelse(ibp); 158962976Smckusick /* 159062976Smckusick * Allocate the block into which to do the copy. Note that this 159162976Smckusick * allocation will never require any additional allocations for 159262976Smckusick * the snapshot inode. 159362976Smckusick */ 159483366Sjulian td->td_proc->p_flag |= P_COWINPROGRESS; 159576132Sphk error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 159662976Smckusick fs->fs_bsize, KERNCRED, 0, &cbp); 159783366Sjulian td->td_proc->p_flag &= ~P_COWINPROGRESS; 1598107414Smckusick if (error) 159962976Smckusick break; 160062976Smckusick#ifdef DEBUG 160162976Smckusick if (snapdebug) 160298687Smux printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n", 160398542Smckusick "Copyonremove: snapino ", ip->i_number, 160498542Smckusick (intmax_t)lbn, "for inum", inum, size, 160598542Smckusick (intmax_t)cbp->b_blkno); 160662976Smckusick#endif 160762976Smckusick /* 160862976Smckusick * If we have already read the old block contents, then 160975943Smckusick * simply copy them to the new block. Note that we need 161075943Smckusick * to synchronously write snapshots that have not been 161175943Smckusick * unlinked, and hence will be visible after a crash, 161275943Smckusick * to ensure their integrity. 161362976Smckusick */ 161462976Smckusick if (savedcbp != 0) { 161562976Smckusick bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); 161662976Smckusick bawrite(cbp); 161776580Smckusick if (dopersistence && ip->i_effnlink > 0) 161883366Sjulian (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 161962976Smckusick continue; 162062976Smckusick } 162162976Smckusick /* 162262976Smckusick * Otherwise, read the old block contents into the buffer. 162362976Smckusick */ 162475943Smckusick if ((error = readblock(cbp, lbn)) != 0) { 162575943Smckusick bzero(cbp->b_data, fs->fs_bsize); 162675943Smckusick bawrite(cbp); 162776580Smckusick if (dopersistence && ip->i_effnlink > 0) 162883366Sjulian (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 162962976Smckusick break; 163075943Smckusick } 163162976Smckusick savedcbp = cbp; 163262976Smckusick } 163375943Smckusick /* 163475943Smckusick * Note that we need to synchronously write snapshots that 163575943Smckusick * have not been unlinked, and hence will be visible after 163675943Smckusick * a crash, to ensure their integrity. 163775943Smckusick */ 163875943Smckusick if (savedcbp) { 163975943Smckusick vp = savedcbp->b_vp; 164062976Smckusick bawrite(savedcbp); 1641107414Smckusick if (dopersistence && VTOI(vp)->i_effnlink > 0) 164283366Sjulian (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 164375943Smckusick } 164462976Smckusick /* 164562976Smckusick * If we have been unable to allocate a block in which to do 164662976Smckusick * the copy, then return non-zero so that the fragment will 164762976Smckusick * not be freed. Although space will be lost, the snapshot 164862976Smckusick * will stay consistent. 164962976Smckusick */ 1650107414Smckusick if (snapshot_locked) 1651107414Smckusick VOP_UNLOCK(vp, 0, td); 1652107414Smckusick else 1653107414Smckusick VI_UNLOCK(devvp); 165462976Smckusick return (error); 165562976Smckusick} 165662976Smckusick 165762976Smckusick/* 165862976Smckusick * Associate snapshot files when mounting. 165962976Smckusick */ 166062976Smckusickvoid 166162976Smckusickffs_snapshot_mount(mp) 166262976Smckusick struct mount *mp; 166362976Smckusick{ 166462976Smckusick struct ufsmount *ump = VFSTOUFS(mp); 1665107414Smckusick struct vnode *devvp = ump->um_devvp; 166662976Smckusick struct fs *fs = ump->um_fs; 166783366Sjulian struct thread *td = curthread; 166873942Smckusick struct snaphead *snaphead; 166962976Smckusick struct vnode *vp; 1670105191Smckusick struct inode *ip, *xp; 1671104698Smckusick struct uio auio; 1672104698Smckusick struct iovec aiov; 1673107848Smckusick void *snapblklist; 1674104698Smckusick char *reason; 1675107848Smckusick daddr_t snaplistsize; 167662976Smckusick int error, snaploc, loc; 167762976Smckusick 1678104698Smckusick /* 1679104698Smckusick * XXX The following needs to be set before UFS_TRUNCATE or 1680104698Smckusick * VOP_READ can be called. 1681104698Smckusick */ 1682104698Smckusick mp->mnt_stat.f_iosize = fs->fs_bsize; 1683104698Smckusick /* 1684104698Smckusick * Process each snapshot listed in the superblock. 1685104698Smckusick */ 1686107848Smckusick vp = NULL; 1687107414Smckusick snaphead = &devvp->v_rdev->si_snapshots; 168862976Smckusick for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { 168962976Smckusick if (fs->fs_snapinum[snaploc] == 0) 1690107848Smckusick break; 169192462Smckusick if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], 169292462Smckusick LK_EXCLUSIVE, &vp)) != 0){ 169362976Smckusick printf("ffs_snapshot_mount: vget failed %d\n", error); 169462976Smckusick continue; 169562976Smckusick } 169662976Smckusick ip = VTOI(vp); 1697104698Smckusick if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size == 1698104698Smckusick lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) { 1699104698Smckusick if ((ip->i_flags & SF_SNAPSHOT) == 0) { 1700104698Smckusick reason = "non-snapshot"; 1701104698Smckusick } else { 1702104698Smckusick reason = "old format snapshot"; 1703104698Smckusick (void)UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td); 1704104698Smckusick (void)VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 1705104698Smckusick } 1706104698Smckusick printf("ffs_snapshot_mount: %s inode %d\n", 1707104698Smckusick reason, fs->fs_snapinum[snaploc]); 170862976Smckusick vput(vp); 1709107848Smckusick vp = NULL; 171062976Smckusick for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { 171162976Smckusick if (fs->fs_snapinum[loc] == 0) 171262976Smckusick break; 171362976Smckusick fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; 171462976Smckusick } 171562976Smckusick fs->fs_snapinum[loc - 1] = 0; 171662976Smckusick snaploc--; 171762976Smckusick continue; 171862976Smckusick } 1719104698Smckusick /* 1720105191Smckusick * If there already exist snapshots on this filesystem, grab a 1721105191Smckusick * reference to their shared lock. If this is the first snapshot 1722105191Smckusick * on this filesystem, we need to allocate a lock for the 1723105191Smckusick * snapshots to share. In either case, acquire the snapshot 1724105191Smckusick * lock and give up our original private lock. 1725105191Smckusick */ 1726107414Smckusick VI_LOCK(devvp); 1727105191Smckusick if ((xp = TAILQ_FIRST(snaphead)) != NULL) { 1728105191Smckusick VI_LOCK(vp); 1729105191Smckusick vp->v_vnlock = ITOV(xp)->v_vnlock; 1730107414Smckusick VI_UNLOCK(devvp); 1731105191Smckusick } else { 1732105191Smckusick struct lock *lkp; 1733105191Smckusick 1734107414Smckusick VI_UNLOCK(devvp); 1735105191Smckusick MALLOC(lkp, struct lock *, sizeof(struct lock), 1736105191Smckusick M_UFSMNT, M_WAITOK); 1737105191Smckusick lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT, 1738105191Smckusick LK_CANRECURSE | LK_NOPAUSE); 1739105191Smckusick VI_LOCK(vp); 1740105191Smckusick vp->v_vnlock = lkp; 1741105191Smckusick } 1742105191Smckusick vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td); 1743107414Smckusick transferlockers(&vp->v_lock, vp->v_vnlock); 1744107414Smckusick lockmgr(&vp->v_lock, LK_RELEASE, NULL, td); 1745105191Smckusick /* 1746104698Smckusick * Link it onto the active snapshot list. 1747104698Smckusick */ 1748107414Smckusick VI_LOCK(devvp); 174973942Smckusick if (ip->i_nextsnap.tqe_prev != 0) 175062976Smckusick panic("ffs_snapshot_mount: %d already on list", 175162976Smckusick ip->i_number); 175273942Smckusick else 175373942Smckusick TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap); 1754101308Sjeff vp->v_vflag |= VV_SYSTEM; 1755107414Smckusick VI_UNLOCK(devvp); 175683366Sjulian VOP_UNLOCK(vp, 0, td); 175762976Smckusick } 1758107848Smckusick /* 1759107848Smckusick * No usable snapshots found. 1760107848Smckusick */ 1761107848Smckusick if (vp == NULL) 1762107848Smckusick return; 1763107848Smckusick /* 1764107848Smckusick * Allocate the space for the block hints list. We always want to 1765107848Smckusick * use the list from the newest snapshot. 1766107848Smckusick */ 1767107848Smckusick auio.uio_iov = &aiov; 1768107848Smckusick auio.uio_iovcnt = 1; 1769107848Smckusick aiov.iov_base = (void *)&snaplistsize; 1770107848Smckusick aiov.iov_len = sizeof(snaplistsize); 1771107848Smckusick auio.uio_resid = aiov.iov_len; 1772107848Smckusick auio.uio_offset = 1773107848Smckusick lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)); 1774107848Smckusick auio.uio_segflg = UIO_SYSSPACE; 1775107848Smckusick auio.uio_rw = UIO_READ; 1776107848Smckusick auio.uio_td = td; 1777107848Smckusick vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 1778107848Smckusick if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { 1779107848Smckusick printf("ffs_snapshot_mount: read_1 failed %d\n", error); 1780107848Smckusick VOP_UNLOCK(vp, 0, td); 1781107848Smckusick return; 1782107848Smckusick } 1783107848Smckusick MALLOC(snapblklist, void *, snaplistsize * sizeof(daddr_t), 1784107848Smckusick M_UFSMNT, M_WAITOK); 1785107848Smckusick auio.uio_iovcnt = 1; 1786107848Smckusick aiov.iov_base = snapblklist; 1787107848Smckusick aiov.iov_len = snaplistsize * sizeof (daddr_t); 1788107848Smckusick auio.uio_resid = aiov.iov_len; 1789107848Smckusick auio.uio_offset -= sizeof(snaplistsize); 1790107848Smckusick if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { 1791107848Smckusick printf("ffs_snapshot_mount: read_2 failed %d\n", error); 1792107848Smckusick VOP_UNLOCK(vp, 0, td); 1793107848Smckusick FREE(snapblklist, M_UFSMNT); 1794107848Smckusick return; 1795107848Smckusick } 1796107848Smckusick VOP_UNLOCK(vp, 0, td); 1797107848Smckusick VI_LOCK(devvp); 1798107848Smckusick ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount"); 1799107848Smckusick devvp->v_rdev->si_snaplistsize = snaplistsize; 1800107848Smckusick devvp->v_rdev->si_snapblklist = (daddr_t *)snapblklist; 1801107848Smckusick devvp->v_rdev->si_copyonwrite = ffs_copyonwrite; 1802107848Smckusick devvp->v_vflag |= VV_COPYONWRITE; 1803107848Smckusick VI_UNLOCK(devvp); 180462976Smckusick} 180562976Smckusick 180662976Smckusick/* 180762976Smckusick * Disassociate snapshot files when unmounting. 180862976Smckusick */ 180962976Smckusickvoid 181062976Smckusickffs_snapshot_unmount(mp) 181162976Smckusick struct mount *mp; 181262976Smckusick{ 1813107414Smckusick struct vnode *devvp = VFSTOUFS(mp)->um_devvp; 1814107414Smckusick struct snaphead *snaphead = &devvp->v_rdev->si_snapshots; 1815105191Smckusick struct lock *lkp = NULL; 181662976Smckusick struct inode *xp; 1817105191Smckusick struct vnode *vp; 181862976Smckusick 1819107414Smckusick VI_LOCK(devvp); 182073942Smckusick while ((xp = TAILQ_FIRST(snaphead)) != 0) { 1821105191Smckusick vp = ITOV(xp); 1822105191Smckusick lkp = vp->v_vnlock; 1823105191Smckusick vp->v_vnlock = &vp->v_lock; 182473942Smckusick TAILQ_REMOVE(snaphead, xp, i_nextsnap); 182573942Smckusick xp->i_nextsnap.tqe_prev = 0; 1826107414Smckusick if (xp->i_effnlink > 0) { 1827107414Smckusick VI_UNLOCK(devvp); 1828105191Smckusick vrele(vp); 1829107414Smckusick VI_LOCK(devvp); 1830107414Smckusick } 183162976Smckusick } 1832107848Smckusick if (devvp->v_rdev->si_snapblklist != NULL) { 1833107848Smckusick FREE(devvp->v_rdev->si_snapblklist, M_UFSMNT); 1834107848Smckusick devvp->v_rdev->si_snapblklist = NULL; 1835107848Smckusick devvp->v_rdev->si_snaplistsize = 0; 1836107848Smckusick } 1837105191Smckusick if (lkp != NULL) { 1838105191Smckusick lockdestroy(lkp); 1839105191Smckusick FREE(lkp, M_UFSMNT); 1840105191Smckusick } 1841107414Smckusick ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount"); 1842107414Smckusick devvp->v_rdev->si_copyonwrite = 0; 1843107414Smckusick devvp->v_vflag &= ~VV_COPYONWRITE; 1844107414Smckusick VI_UNLOCK(devvp); 184562976Smckusick} 184662976Smckusick 184762976Smckusick/* 184862976Smckusick * Check for need to copy block that is about to be written, 184962976Smckusick * copying the block if necessary. 185062976Smckusick */ 185173942Smckusickstatic int 185273942Smckusickffs_copyonwrite(devvp, bp) 185373942Smckusick struct vnode *devvp; 185473942Smckusick struct buf *bp; 185562976Smckusick{ 1856105191Smckusick struct snaphead *snaphead; 185773942Smckusick struct buf *ibp, *cbp, *savedcbp = 0; 185883366Sjulian struct thread *td = curthread; 185973942Smckusick struct fs *fs; 186062976Smckusick struct inode *ip; 1861105670Smckusick struct vnode *vp = 0; 1862107848Smckusick ufs2_daddr_t lbn, blkno, *snapblklist; 1863105670Smckusick int lower, upper, mid, indiroff, snapshot_locked = 0, error = 0; 186462976Smckusick 186583366Sjulian if (td->td_proc->p_flag & P_COWINPROGRESS) 186662976Smckusick panic("ffs_copyonwrite: recursive call"); 1867107848Smckusick /* 1868107848Smckusick * First check to see if it is in the preallocated list. 1869107848Smckusick * By doing this check we avoid several potential deadlocks. 1870107848Smckusick */ 1871107414Smckusick VI_LOCK(devvp); 1872105191Smckusick snaphead = &devvp->v_rdev->si_snapshots; 1873105191Smckusick ip = TAILQ_FIRST(snaphead); 1874105191Smckusick fs = ip->i_fs; 1875105191Smckusick lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); 1876107848Smckusick snapblklist = devvp->v_rdev->si_snapblklist; 1877107848Smckusick upper = devvp->v_rdev->si_snaplistsize - 1; 1878107848Smckusick lower = 1; 1879107848Smckusick while (lower <= upper) { 1880107848Smckusick mid = (lower + upper) / 2; 1881107848Smckusick if (snapblklist[mid] == lbn) 1882107848Smckusick break; 1883107848Smckusick if (snapblklist[mid] < lbn) 1884107848Smckusick lower = mid + 1; 1885107848Smckusick else 1886107848Smckusick upper = mid - 1; 1887107848Smckusick } 1888107848Smckusick if (lower <= upper) { 1889107848Smckusick VI_UNLOCK(devvp); 1890107848Smckusick return (0); 1891107848Smckusick } 1892107848Smckusick /* 1893107848Smckusick * Not in the precomputed list, so check the snapshots. 1894107848Smckusick */ 1895107414Smckusickretry: 1896105191Smckusick TAILQ_FOREACH(ip, snaphead, i_nextsnap) { 189762976Smckusick vp = ITOV(ip); 189862976Smckusick /* 189962976Smckusick * We ensure that everything of our own that needs to be 190062976Smckusick * copied will be done at the time that ffs_snapshot is 190162976Smckusick * called. Thus we can skip the check here which can 190276132Sphk * deadlock in doing the lookup in UFS_BALLOC. 190362976Smckusick */ 190462976Smckusick if (bp->b_vp == vp) 190562976Smckusick continue; 190662976Smckusick /* 1907105670Smckusick * Check to see if block needs to be copied. We do not have 1908105670Smckusick * to hold the snapshot lock while doing this lookup as it 1909105670Smckusick * will never require any additional allocations for the 1910105670Smckusick * snapshot inode. 191162976Smckusick */ 191262976Smckusick if (lbn < NDADDR) { 191398542Smckusick blkno = DIP(ip, i_db[lbn]); 191462976Smckusick } else { 1915107414Smckusick if (snapshot_locked == 0 && 1916107848Smckusick lockmgr(vp->v_vnlock, 1917107414Smckusick LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 1918107414Smckusick VI_MTX(devvp), td) != 0) { 1919107414Smckusick VI_LOCK(devvp); 1920107414Smckusick goto retry; 1921107414Smckusick } 1922107848Smckusick snapshot_locked = 1; 192383366Sjulian td->td_proc->p_flag |= P_COWINPROGRESS; 192476132Sphk error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 1925105191Smckusick fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 192683366Sjulian td->td_proc->p_flag &= ~P_COWINPROGRESS; 1927105191Smckusick if (error) 1928105191Smckusick break; 192962976Smckusick indiroff = (lbn - NDADDR) % NINDIR(fs); 193098542Smckusick if (ip->i_ump->um_fstype == UFS1) 193198542Smckusick blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; 193298542Smckusick else 193398542Smckusick blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; 193463788Smckusick bqrelse(ibp); 193562976Smckusick } 193662976Smckusick#ifdef DIAGNOSTIC 193762976Smckusick if (blkno == BLK_SNAP && bp->b_lblkno >= 0) 193862976Smckusick panic("ffs_copyonwrite: bad copy block"); 193962976Smckusick#endif 1940105191Smckusick if (blkno != 0) 194162976Smckusick continue; 194262976Smckusick /* 1943105670Smckusick * Allocate the block into which to do the copy. Since 1944105670Smckusick * multiple processes may all try to copy the same block, 1945105670Smckusick * we have to recheck our need to do a copy if we sleep 1946105670Smckusick * waiting for the lock. 1947105670Smckusick * 1948105670Smckusick * Because all snapshots on a filesystem share a single 1949105670Smckusick * lock, we ensure that we will never be in competition 1950105670Smckusick * with another process to allocate a block. 195162976Smckusick */ 1952105670Smckusick if (snapshot_locked == 0 && 1953107414Smckusick lockmgr(vp->v_vnlock, 1954107414Smckusick LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 1955107414Smckusick VI_MTX(devvp), td) != 0) { 1956107414Smckusick VI_LOCK(devvp); 1957105670Smckusick goto retry; 1958107414Smckusick } 1959105670Smckusick snapshot_locked = 1; 196083366Sjulian td->td_proc->p_flag |= P_COWINPROGRESS; 196176132Sphk error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 1962105191Smckusick fs->fs_bsize, KERNCRED, 0, &cbp); 196383366Sjulian td->td_proc->p_flag &= ~P_COWINPROGRESS; 1964105191Smckusick if (error) 1965105191Smckusick break; 196662976Smckusick#ifdef DEBUG 196762976Smckusick if (snapdebug) { 196898687Smux printf("Copyonwrite: snapino %d lbn %jd for ", 196998542Smckusick ip->i_number, (intmax_t)lbn); 197073942Smckusick if (bp->b_vp == devvp) 197162976Smckusick printf("fs metadata"); 197262976Smckusick else 197362976Smckusick printf("inum %d", VTOI(bp->b_vp)->i_number); 197498687Smux printf(" lblkno %jd to blkno %jd\n", 197598542Smckusick (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno); 197662976Smckusick } 197762976Smckusick#endif 197862976Smckusick /* 197962976Smckusick * If we have already read the old block contents, then 198075943Smckusick * simply copy them to the new block. Note that we need 198175943Smckusick * to synchronously write snapshots that have not been 198275943Smckusick * unlinked, and hence will be visible after a crash, 198375943Smckusick * to ensure their integrity. 198462976Smckusick */ 198562976Smckusick if (savedcbp != 0) { 198662976Smckusick bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); 198762976Smckusick bawrite(cbp); 198876580Smckusick if (dopersistence && ip->i_effnlink > 0) 198983366Sjulian (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 199062976Smckusick continue; 199162976Smckusick } 199262976Smckusick /* 199362976Smckusick * Otherwise, read the old block contents into the buffer. 199462976Smckusick */ 199575943Smckusick if ((error = readblock(cbp, lbn)) != 0) { 199675943Smckusick bzero(cbp->b_data, fs->fs_bsize); 199775943Smckusick bawrite(cbp); 199876580Smckusick if (dopersistence && ip->i_effnlink > 0) 199983366Sjulian (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 200062976Smckusick break; 200175943Smckusick } 200262976Smckusick savedcbp = cbp; 200362976Smckusick } 200475943Smckusick /* 200575943Smckusick * Note that we need to synchronously write snapshots that 200675943Smckusick * have not been unlinked, and hence will be visible after 200775943Smckusick * a crash, to ensure their integrity. 200875943Smckusick */ 200975943Smckusick if (savedcbp) { 201075943Smckusick vp = savedcbp->b_vp; 201162976Smckusick bawrite(savedcbp); 2012105191Smckusick if (dopersistence && VTOI(vp)->i_effnlink > 0) 201383366Sjulian (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 201475943Smckusick } 2015105670Smckusick if (snapshot_locked) 2016105670Smckusick VOP_UNLOCK(vp, 0, td); 2017107414Smckusick else 2018107414Smckusick VI_UNLOCK(devvp); 201962976Smckusick return (error); 202062976Smckusick} 202162976Smckusick 202262976Smckusick/* 202362976Smckusick * Read the specified block into the given buffer. 202462976Smckusick * Much of this boiler-plate comes from bwrite(). 202562976Smckusick */ 202662976Smckusickstatic int 202762976Smckusickreadblock(bp, lbn) 202862976Smckusick struct buf *bp; 202998542Smckusick ufs2_daddr_t lbn; 203062976Smckusick{ 203162976Smckusick struct uio auio; 203262976Smckusick struct iovec aiov; 203383366Sjulian struct thread *td = curthread; 203462976Smckusick struct inode *ip = VTOI(bp->b_vp); 203562976Smckusick 203662976Smckusick aiov.iov_base = bp->b_data; 203762976Smckusick aiov.iov_len = bp->b_bcount; 203862976Smckusick auio.uio_iov = &aiov; 203962976Smckusick auio.uio_iovcnt = 1; 204062976Smckusick auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn))); 204162976Smckusick auio.uio_resid = bp->b_bcount; 204262976Smckusick auio.uio_rw = UIO_READ; 204362976Smckusick auio.uio_segflg = UIO_SYSSPACE; 204483366Sjulian auio.uio_td = td; 204562976Smckusick return (physio(ip->i_devvp->v_rdev, &auio, 0)); 204662976Smckusick} 2047