1139825Simp/*- 262976Smckusick * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 362976Smckusick * 462976Smckusick * Further information about snapshots can be obtained from: 562976Smckusick * 662976Smckusick * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 762976Smckusick * 1614 Oxford Street mckusick@mckusick.com 862976Smckusick * Berkeley, CA 94709-1608 +1-510-843-9542 962976Smckusick * USA 1062976Smckusick * 1162976Smckusick * Redistribution and use in source and binary forms, with or without 1262976Smckusick * modification, are permitted provided that the following conditions 1362976Smckusick * are met: 1462976Smckusick * 1562976Smckusick * 1. Redistributions of source code must retain the above copyright 1662976Smckusick * notice, this list of conditions and the following disclaimer. 1762976Smckusick * 2. Redistributions in binary form must reproduce the above copyright 1862976Smckusick * notice, this list of conditions and the following disclaimer in the 1962976Smckusick * documentation and/or other materials provided with the distribution. 2062976Smckusick * 2162976Smckusick * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 2262976Smckusick * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 2362976Smckusick * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 2462976Smckusick * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 2562976Smckusick * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 2662976Smckusick * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2762976Smckusick * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2862976Smckusick * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2962976Smckusick * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 3062976Smckusick * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 3162976Smckusick * SUCH DAMAGE. 3262976Smckusick * 3363788Smckusick * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 3462976Smckusick */ 3562976Smckusick 36116192Sobrien#include <sys/cdefs.h> 37116192Sobrien__FBSDID("$FreeBSD$"); 38116192Sobrien 39158322Stegge#include "opt_quota.h" 40158322Stegge 4162976Smckusick#include <sys/param.h> 42105191Smckusick#include <sys/kernel.h> 4362976Smckusick#include <sys/systm.h> 4473942Smckusick#include <sys/conf.h> 4562976Smckusick#include <sys/bio.h> 4662976Smckusick#include <sys/buf.h> 47177785Skib#include <sys/fcntl.h> 4862976Smckusick#include <sys/proc.h> 4962976Smckusick#include <sys/namei.h> 50113376Sjeff#include <sys/sched.h> 5162976Smckusick#include <sys/stat.h> 5262976Smckusick#include <sys/malloc.h> 5362976Smckusick#include <sys/mount.h> 5462976Smckusick#include <sys/resource.h> 5562976Smckusick#include <sys/resourcevar.h> 56251171Sjeff#include <sys/rwlock.h> 5762976Smckusick#include <sys/vnode.h> 5862976Smckusick 59137035Sphk#include <geom/geom.h> 60137035Sphk 6162976Smckusick#include <ufs/ufs/extattr.h> 6262976Smckusick#include <ufs/ufs/quota.h> 6362976Smckusick#include <ufs/ufs/ufsmount.h> 6462976Smckusick#include <ufs/ufs/inode.h> 6562976Smckusick#include <ufs/ufs/ufs_extern.h> 6662976Smckusick 6762976Smckusick#include <ufs/ffs/fs.h> 6862976Smckusick#include <ufs/ffs/ffs_extern.h> 6962976Smckusick 7091420Sjhb#define KERNCRED thread0.td_ucred 7165998Sdes#define DEBUG 1 7262976Smckusick 73154065Simp#include "opt_ffs.h" 74154065Simp 75154065Simp#ifdef NO_FFS_SNAPSHOT 76154065Simpint 77154065Simpffs_snapshot(mp, snapfile) 78154065Simp struct mount *mp; 79154065Simp char *snapfile; 80154065Simp{ 81154065Simp return (EINVAL); 82154065Simp} 83154065Simp 84154065Simpint 85223127Smckusickffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd) 86154065Simp struct fs *fs; 87154065Simp struct vnode *devvp; 88154065Simp ufs2_daddr_t bno; 89154065Simp long size; 90154065Simp ino_t inum; 91223127Smckusick enum vtype vtype; 92223020Smckusick struct workhead *wkhd; 93154065Simp{ 94154065Simp return (EINVAL); 95154065Simp} 96154065Simp 97154065Simpvoid 98154065Simpffs_snapremove(vp) 99154065Simp struct vnode *vp; 100154065Simp{ 101154065Simp} 102154065Simp 103154065Simpvoid 104154065Simpffs_snapshot_mount(mp) 105154065Simp struct mount *mp; 106154065Simp{ 107154065Simp} 108154065Simp 109154065Simpvoid 110154065Simpffs_snapshot_unmount(mp) 111154065Simp struct mount *mp; 112154065Simp{ 113154065Simp} 114154065Simp 115154065Simpvoid 116154065Simpffs_snapgone(ip) 117154065Simp struct inode *ip; 118154065Simp{ 119154065Simp} 120154065Simp 121154065Simpint 122154065Simpffs_copyonwrite(devvp, bp) 123154065Simp struct vnode *devvp; 124154065Simp struct buf *bp; 125154065Simp{ 126154065Simp return (EINVAL); 127154065Simp} 128154065Simp 129223020Smckusickvoid 130223020Smckusickffs_sync_snap(mp, waitfor) 131223020Smckusick struct mount *mp; 132223020Smckusick int waitfor; 133223020Smckusick{ 134223020Smckusick} 135223020Smckusick 136154065Simp#else 137218485SnetchildFEATURE(ffs_snapshot, "FFS snapshot support"); 138154065Simp 139177778SjeffLIST_HEAD(, snapdata) snapfree; 140177778Sjeffstatic struct mtx snapfree_lock; 141177778SjeffMTX_SYSINIT(ffs_snapfree, &snapfree_lock, "snapdata free list", MTX_DEF); 142177778Sjeff 14392728Salfredstatic int cgaccount(int, struct vnode *, struct buf *, int); 14498542Smckusickstatic int expunge_ufs1(struct vnode *, struct inode *, struct fs *, 14598542Smckusick int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 146207141Sjeff ufs_lbn_t, int), int, int); 14798542Smckusickstatic int indiracct_ufs1(struct vnode *, struct vnode *, int, 14898542Smckusick ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 14998542Smckusick int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 15098542Smckusick ufs_lbn_t, int), int); 15198542Smckusickstatic int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 15298542Smckusick struct fs *, ufs_lbn_t, int); 15398542Smckusickstatic int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 15498542Smckusick struct fs *, ufs_lbn_t, int); 15598542Smckusickstatic int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 15698542Smckusick struct fs *, ufs_lbn_t, int); 15798542Smckusickstatic int expunge_ufs2(struct vnode *, struct inode *, struct fs *, 15898542Smckusick int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 159207141Sjeff ufs_lbn_t, int), int, int); 16098542Smckusickstatic int indiracct_ufs2(struct vnode *, struct vnode *, int, 16198542Smckusick ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 16298542Smckusick int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 16398542Smckusick ufs_lbn_t, int), int); 16498542Smckusickstatic int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 16598542Smckusick struct fs *, ufs_lbn_t, int); 16698542Smckusickstatic int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 16798542Smckusick struct fs *, ufs_lbn_t, int); 16898542Smckusickstatic int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 16998542Smckusick struct fs *, ufs_lbn_t, int); 170135138Sphkstatic int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t); 171177778Sjeffstatic void try_free_snapdata(struct vnode *devvp); 172177778Sjeffstatic struct snapdata *ffs_snapdata_acquire(struct vnode *devvp); 173166193Skibstatic int ffs_bp_snapblk(struct vnode *, struct buf *); 17462976Smckusick 17576580Smckusick/* 17676580Smckusick * To ensure the consistency of snapshots across crashes, we must 17776580Smckusick * synchronously write out copied blocks before allowing the 17876580Smckusick * originals to be modified. Because of the rather severe speed 179223127Smckusick * penalty that this imposes, the code normally only ensures 180223127Smckusick * persistence for the filesystem metadata contained within a 181223127Smckusick * snapshot. Setting the following flag allows this crash 182223127Smckusick * persistence to be enabled for file contents. 18376580Smckusick */ 18476580Smckusickint dopersistence = 0; 18576580Smckusick 18662976Smckusick#ifdef DEBUG 18762976Smckusick#include <sys/sysctl.h> 18876580SmckusickSYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, ""); 189114293Smarkmstatic int snapdebug = 0; 19062976SmckusickSYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); 19187827Smckusickint collectsnapstats = 0; 19287827SmckusickSYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats, 19387827Smckusick 0, ""); 19462976Smckusick#endif /* DEBUG */ 19562976Smckusick 19662976Smckusick/* 19762976Smckusick * Create a snapshot file and initialize it for the filesystem. 19862976Smckusick */ 19962976Smckusickint 20062976Smckusickffs_snapshot(mp, snapfile) 20162976Smckusick struct mount *mp; 20262976Smckusick char *snapfile; 20362976Smckusick{ 204111240Smckusick ufs2_daddr_t numblks, blkno, *blkp, *snapblklist; 20598542Smckusick int error, cg, snaploc; 20690098Smckusick int i, size, len, loc; 207232351Smckusick ufs2_daddr_t blockno; 208225806Smckusick uint64_t flag; 20987827Smckusick struct timespec starttime = {0, 0}, endtime; 21087827Smckusick char saved_nice = 0; 211111240Smckusick long redo = 0, snaplistsize = 0; 21276269Smckusick int32_t *lp; 21371073Siedowse void *space; 214140706Sjeff struct fs *copy_fs = NULL, *fs; 21583366Sjulian struct thread *td = curthread; 21673942Smckusick struct inode *ip, *xp; 217225807Smckusick struct buf *bp, *nbp, *ibp; 21862976Smckusick struct nameidata nd; 21962976Smckusick struct mount *wrtmp; 22062976Smckusick struct vattr vat; 221154152Stegge struct vnode *vp, *xvp, *mvp, *devvp; 222104698Smckusick struct uio auio; 223104698Smckusick struct iovec aiov; 224135138Sphk struct snapdata *sn; 225140706Sjeff struct ufsmount *ump; 22662976Smckusick 227140706Sjeff ump = VFSTOUFS(mp); 228140706Sjeff fs = ump->um_fs; 229158632Stegge sn = NULL; 230230250Smckusick /* 231230250Smckusick * At the moment, journaled soft updates cannot support 232230250Smckusick * taking snapshots. 233230250Smckusick */ 234230250Smckusick if (MOUNTEDSUJ(mp)) { 235230250Smckusick vfs_mount_error(mp, "%s: Snapshots are not yet supported when " 236230250Smckusick "running with journaled soft updates", fs->fs_fsmnt); 237230250Smckusick return (EOPNOTSUPP); 238230250Smckusick } 239162647Stegge MNT_ILOCK(mp); 240162647Stegge flag = mp->mnt_flag; 241162647Stegge MNT_IUNLOCK(mp); 242135138Sphk /* 24362976Smckusick * Need to serialize access to snapshot code per filesystem. 24462976Smckusick */ 24562976Smckusick /* 24662976Smckusick * Assign a snapshot slot in the superblock. 24762976Smckusick */ 248140706Sjeff UFS_LOCK(ump); 24962976Smckusick for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 25062976Smckusick if (fs->fs_snapinum[snaploc] == 0) 25162976Smckusick break; 252140706Sjeff UFS_UNLOCK(ump); 25362976Smckusick if (snaploc == FSMAXSNAP) 25462976Smckusick return (ENOSPC); 25562976Smckusick /* 25662976Smckusick * Create the snapshot file. 25762976Smckusick */ 25862976Smckusickrestart: 259138557Sphk NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, snapfile, td); 26062976Smckusick if ((error = namei(&nd)) != 0) 26162976Smckusick return (error); 26262976Smckusick if (nd.ni_vp != NULL) { 26362976Smckusick vput(nd.ni_vp); 26462976Smckusick error = EEXIST; 26562976Smckusick } 26662976Smckusick if (nd.ni_dvp->v_mount != mp) 26762976Smckusick error = EXDEV; 26862976Smckusick if (error) { 26962976Smckusick NDFREE(&nd, NDF_ONLY_PNBUF); 27062976Smckusick if (nd.ni_dvp == nd.ni_vp) 27162976Smckusick vrele(nd.ni_dvp); 27262976Smckusick else 27362976Smckusick vput(nd.ni_dvp); 27462976Smckusick return (error); 27562976Smckusick } 27662976Smckusick VATTR_NULL(&vat); 27762976Smckusick vat.va_type = VREG; 27862976Smckusick vat.va_mode = S_IRUSR; 27962976Smckusick vat.va_vaflags |= VA_EXCLUSIVE; 28062976Smckusick if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) 28162976Smckusick wrtmp = NULL; 28262976Smckusick if (wrtmp != mp) 28362976Smckusick panic("ffs_snapshot: mount mismatch"); 284157325Sjeff vfs_rel(wrtmp); 28562985Smckusick if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { 28662976Smckusick NDFREE(&nd, NDF_ONLY_PNBUF); 28762976Smckusick vput(nd.ni_dvp); 28862985Smckusick if ((error = vn_start_write(NULL, &wrtmp, 28962985Smckusick V_XSLEEP | PCATCH)) != 0) 29062976Smckusick return (error); 29162976Smckusick goto restart; 29262976Smckusick } 29362976Smckusick error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); 294175294Sattilio VOP_UNLOCK(nd.ni_dvp, 0); 29562976Smckusick if (error) { 29662976Smckusick NDFREE(&nd, NDF_ONLY_PNBUF); 29762976Smckusick vn_finished_write(wrtmp); 298156895Stegge vrele(nd.ni_dvp); 29962976Smckusick return (error); 30062976Smckusick } 30162976Smckusick vp = nd.ni_vp; 302166142Smpp vp->v_vflag |= VV_SYSTEM; 30362976Smckusick ip = VTOI(vp); 304107414Smckusick devvp = ip->i_devvp; 30562976Smckusick /* 30662976Smckusick * Allocate and copy the last block contents so as to be able 30762976Smckusick * to set size to that of the filesystem. 30862976Smckusick */ 30962976Smckusick numblks = howmany(fs->fs_size, fs->fs_frag); 31076132Sphk error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), 31198658Sdillon fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); 31262976Smckusick if (error) 31362976Smckusick goto out; 31462976Smckusick ip->i_size = lblktosize(fs, (off_t)numblks); 315132775Skan DIP_SET(ip, i_size, ip->i_size); 31662976Smckusick ip->i_flag |= IN_CHANGE | IN_UPDATE; 317158633Stegge error = readblock(vp, bp, numblks - 1); 318158633Stegge bawrite(bp); 319158633Stegge if (error != 0) 32062976Smckusick goto out; 32162976Smckusick /* 32262976Smckusick * Preallocate critical data structures so that we can copy 32362976Smckusick * them in without further allocation after we suspend all 32462976Smckusick * operations on the filesystem. We would like to just release 32562976Smckusick * the allocated buffers without writing them since they will 32662976Smckusick * be filled in below once we are ready to go, but this upsets 32762976Smckusick * the soft update code, so we go ahead and write the new buffers. 32862976Smckusick * 32975993Smckusick * Allocate all indirect blocks and mark all of them as not 33075993Smckusick * needing to be copied. 33162976Smckusick */ 33262976Smckusick for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 33376132Sphk error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), 33498658Sdillon fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp); 33562976Smckusick if (error) 33662976Smckusick goto out; 337107406Smckusick bawrite(ibp); 33862976Smckusick } 33962976Smckusick /* 34062976Smckusick * Allocate copies for the superblock and its summary information. 34162976Smckusick */ 342107294Smckusick error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 343107294Smckusick 0, &nbp); 34476269Smckusick if (error) 34562976Smckusick goto out; 34662976Smckusick bawrite(nbp); 34762976Smckusick blkno = fragstoblks(fs, fs->fs_csaddr); 34862976Smckusick len = howmany(fs->fs_cssize, fs->fs_bsize); 34962976Smckusick for (loc = 0; loc < len; loc++) { 35076132Sphk error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), 35162976Smckusick fs->fs_bsize, KERNCRED, 0, &nbp); 35262976Smckusick if (error) 35362976Smckusick goto out; 35462976Smckusick bawrite(nbp); 35562976Smckusick } 35662976Smckusick /* 35787827Smckusick * Allocate all cylinder group blocks. 35887827Smckusick */ 35987827Smckusick for (cg = 0; cg < fs->fs_ncg; cg++) { 360111238Smckusick error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 36187827Smckusick fs->fs_bsize, KERNCRED, 0, &nbp); 36287827Smckusick if (error) 36387827Smckusick goto out; 364107406Smckusick bawrite(nbp); 365184934Sambrisko if (cg % 10 == 0) 366233438Smckusick ffs_syncvnode(vp, MNT_WAIT, 0); 36787827Smckusick } 36887827Smckusick /* 36987827Smckusick * Copy all the cylinder group maps. Although the 37087827Smckusick * filesystem is still active, we hope that only a few 37187827Smckusick * cylinder groups will change between now and when we 37287827Smckusick * suspend operations. Thus, we will be able to quickly 37387827Smckusick * touch up the few cylinder groups that changed during 37487827Smckusick * the suspension period. 37587827Smckusick */ 37689450Smckusick len = howmany(fs->fs_ncg, NBBY); 377184205Sdes space = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO); 378140706Sjeff UFS_LOCK(ump); 379140706Sjeff fs->fs_active = space; 380140706Sjeff UFS_UNLOCK(ump); 38187827Smckusick for (cg = 0; cg < fs->fs_ncg; cg++) { 382111238Smckusick error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 383107558Smckusick fs->fs_bsize, KERNCRED, 0, &nbp); 384107558Smckusick if (error) 38587827Smckusick goto out; 38687827Smckusick error = cgaccount(cg, vp, nbp, 1); 38787827Smckusick bawrite(nbp); 388184934Sambrisko if (cg % 10 == 0) 389233438Smckusick ffs_syncvnode(vp, MNT_WAIT, 0); 39087827Smckusick if (error) 39187827Smckusick goto out; 39287827Smckusick } 39387827Smckusick /* 39462976Smckusick * Change inode to snapshot type file. 39562976Smckusick */ 39663897Smckusick ip->i_flags |= SF_SNAPSHOT; 397132775Skan DIP_SET(ip, i_flags, ip->i_flags); 39862976Smckusick ip->i_flag |= IN_CHANGE | IN_UPDATE; 39962976Smckusick /* 40062976Smckusick * Ensure that the snapshot is completely on disk. 401107406Smckusick * Since we have marked it as a snapshot it is safe to 402107406Smckusick * unlock it as no process will be allowed to write to it. 40362976Smckusick */ 404233438Smckusick if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) 40562976Smckusick goto out; 406175294Sattilio VOP_UNLOCK(vp, 0); 40762976Smckusick /* 40862976Smckusick * All allocations are done, so we can now snapshot the system. 40962976Smckusick * 41087827Smckusick * Recind nice scheduling while running with the filesystem suspended. 41187827Smckusick */ 412130551Sjulian if (td->td_proc->p_nice > 0) { 413170307Sjeff struct proc *p; 414170307Sjeff 415170307Sjeff p = td->td_proc; 416170307Sjeff PROC_LOCK(p); 417170307Sjeff saved_nice = p->p_nice; 418170307Sjeff sched_nice(p, 0); 419170307Sjeff PROC_UNLOCK(p); 42087827Smckusick } 42187827Smckusick /* 42262976Smckusick * Suspend operation on filesystem. 42362976Smckusick */ 42462976Smckusick for (;;) { 42562976Smckusick vn_finished_write(wrtmp); 426253106Skib if ((error = vfs_write_suspend(vp->v_mount, 0)) != 0) { 427105902Smckusick vn_start_write(NULL, &wrtmp, V_WAIT); 428175202Sattilio vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 429105902Smckusick goto out; 430105902Smckusick } 43162976Smckusick if (mp->mnt_kern_flag & MNTK_SUSPENDED) 43262976Smckusick break; 43362985Smckusick vn_start_write(NULL, &wrtmp, V_WAIT); 43462976Smckusick } 435175202Sattilio vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 436158262Stegge if (ip->i_effnlink == 0) { 437158262Stegge error = ENOENT; /* Snapshot file unlinked */ 438158262Stegge goto out1; 439158262Stegge } 44090098Smckusick if (collectsnapstats) 44190098Smckusick nanotime(&starttime); 442158634Stegge 443158634Stegge /* The last block might have changed. Copy it again to be sure. */ 444158634Stegge error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), 445158634Stegge fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); 446158634Stegge if (error != 0) 447158634Stegge goto out1; 448158634Stegge error = readblock(vp, bp, numblks - 1); 449158634Stegge bp->b_flags |= B_VALIDSUSPWRT; 450158634Stegge bawrite(bp); 451158634Stegge if (error != 0) 452158634Stegge goto out1; 45362976Smckusick /* 45487827Smckusick * First, copy all the cylinder group maps that have changed. 45562976Smckusick */ 45662976Smckusick for (cg = 0; cg < fs->fs_ncg; cg++) { 45788138Smckusick if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0) 45887827Smckusick continue; 45987827Smckusick redo++; 460111238Smckusick error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 461107558Smckusick fs->fs_bsize, KERNCRED, 0, &nbp); 462107558Smckusick if (error) 46362976Smckusick goto out1; 46487827Smckusick error = cgaccount(cg, vp, nbp, 2); 46589450Smckusick bawrite(nbp); 46687827Smckusick if (error) 46762976Smckusick goto out1; 46862976Smckusick } 46962976Smckusick /* 47076269Smckusick * Grab a copy of the superblock and its summary information. 47176269Smckusick * We delay writing it until the suspension is released below. 47276269Smckusick */ 473225807Smckusick copy_fs = malloc((u_long)fs->fs_bsize, M_UFSMNT, M_WAITOK); 47476269Smckusick bcopy(fs, copy_fs, fs->fs_sbsize); 47576269Smckusick if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) 47676269Smckusick copy_fs->fs_clean = 1; 477111972Smckusick size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; 478111972Smckusick if (fs->fs_sbsize < size) 479225807Smckusick bzero(&((char *)copy_fs)[fs->fs_sbsize], 480225807Smckusick size - fs->fs_sbsize); 48176269Smckusick size = blkroundup(fs, fs->fs_cssize); 48276269Smckusick if (fs->fs_contigsumsize > 0) 48376269Smckusick size += fs->fs_ncg * sizeof(int32_t); 484111119Simp space = malloc((u_long)size, M_UFSMNT, M_WAITOK); 48576269Smckusick copy_fs->fs_csp = space; 48676269Smckusick bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); 487130246Sstefanf space = (char *)space + fs->fs_cssize; 48876269Smckusick loc = howmany(fs->fs_cssize, fs->fs_fsize); 48976356Smckusick i = fs->fs_frag - loc % fs->fs_frag; 49076356Smckusick len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; 49176356Smckusick if (len > 0) { 492107414Smckusick if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), 49376269Smckusick len, KERNCRED, &bp)) != 0) { 49490098Smckusick brelse(bp); 49576269Smckusick free(copy_fs->fs_csp, M_UFSMNT); 496225807Smckusick free(copy_fs, M_UFSMNT); 497225807Smckusick copy_fs = NULL; 49876269Smckusick goto out1; 49962976Smckusick } 50076269Smckusick bcopy(bp->b_data, space, (u_int)len); 501130246Sstefanf space = (char *)space + len; 50276269Smckusick bp->b_flags |= B_INVAL | B_NOCACHE; 50376269Smckusick brelse(bp); 50462976Smckusick } 50576269Smckusick if (fs->fs_contigsumsize > 0) { 50676269Smckusick copy_fs->fs_maxcluster = lp = space; 50776269Smckusick for (i = 0; i < fs->fs_ncg; i++) 50876269Smckusick *lp++ = fs->fs_contigsumsize; 50976269Smckusick } 51062976Smckusick /* 51190098Smckusick * We must check for active files that have been unlinked 51290098Smckusick * (e.g., with a zero link count). We have to expunge all 51390098Smckusick * trace of these files from the snapshot so that they are 51490098Smckusick * not reclaimed prematurely by fsck or unnecessarily dumped. 51590098Smckusick * We turn off the MNTK_SUSPENDED flag to avoid a panic from 51690098Smckusick * spec_strategy about writing on a suspended filesystem. 517104698Smckusick * Note that we skip unlinked snapshot files as they will 518104698Smckusick * be handled separately below. 519111240Smckusick * 520111240Smckusick * We also calculate the needed size for the snapshot list. 52190098Smckusick */ 522111240Smckusick snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + 523111240Smckusick FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; 524140706Sjeff MNT_ILOCK(mp); 52590098Smckusick mp->mnt_kern_flag &= ~MNTK_SUSPENDED; 526234386Smckusick MNT_IUNLOCK(mp); 52790098Smckusickloop: 528234386Smckusick MNT_VNODE_FOREACH_ALL(xvp, mp, mvp) { 529234386Smckusick if ((xvp->v_usecount == 0 && 530156560Stegge (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) || 531156560Stegge xvp->v_type == VNON || 532232351Smckusick IS_SNAPSHOT(VTOI(xvp))) { 533120740Sjeff VI_UNLOCK(xvp); 53490098Smckusick continue; 53590098Smckusick } 536130690Skuriyama /* 537130690Skuriyama * We can skip parent directory vnode because it must have 538130690Skuriyama * this snapshot file in it. 539130690Skuriyama */ 540130690Skuriyama if (xvp == nd.ni_dvp) { 541130690Skuriyama VI_UNLOCK(xvp); 542130690Skuriyama continue; 543130690Skuriyama } 544156560Stegge vholdl(xvp); 545175202Sattilio if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) { 546234386Smckusick MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 547156560Stegge vdrop(xvp); 54890098Smckusick goto loop; 549120740Sjeff } 550156560Stegge VI_LOCK(xvp); 551156560Stegge if (xvp->v_usecount == 0 && 552156560Stegge (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) { 553156560Stegge VI_UNLOCK(xvp); 554175294Sattilio VOP_UNLOCK(xvp, 0); 555156560Stegge vdrop(xvp); 556156560Stegge continue; 557156560Stegge } 558156560Stegge VI_UNLOCK(xvp); 559124119Skan if (snapdebug) 560124119Skan vprint("ffs_snapshot: busy vnode", xvp); 561182371Sattilio if (VOP_GETATTR(xvp, &vat, td->td_ucred) == 0 && 562120740Sjeff vat.va_nlink > 0) { 563175294Sattilio VOP_UNLOCK(xvp, 0); 564156560Stegge vdrop(xvp); 565120740Sjeff continue; 566120740Sjeff } 56790098Smckusick xp = VTOI(xvp); 568111239Smckusick if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { 569175294Sattilio VOP_UNLOCK(xvp, 0); 570156560Stegge vdrop(xvp); 571111239Smckusick continue; 572111239Smckusick } 57390098Smckusick /* 57490098Smckusick * If there is a fragment, clear it here. 57590098Smckusick */ 57690098Smckusick blkno = 0; 57790098Smckusick loc = howmany(xp->i_size, fs->fs_bsize) - 1; 57890098Smckusick if (loc < NDADDR) { 57990098Smckusick len = fragroundup(fs, blkoff(fs, xp->i_size)); 580142074Sdelphij if (len != 0 && len < fs->fs_bsize) { 581140706Sjeff ffs_blkfree(ump, copy_fs, vp, 582207141Sjeff DIP(xp, i_db[loc]), len, xp->i_number, 583223127Smckusick xvp->v_type, NULL); 58498542Smckusick blkno = DIP(xp, i_db[loc]); 585132775Skan DIP_SET(xp, i_db[loc], 0); 58690098Smckusick } 58790098Smckusick } 588111240Smckusick snaplistsize += 1; 58998542Smckusick if (xp->i_ump->um_fstype == UFS1) 59098542Smckusick error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, 591207141Sjeff BLK_NOCOPY, 1); 59298542Smckusick else 59398542Smckusick error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, 594207141Sjeff BLK_NOCOPY, 1); 59590098Smckusick if (blkno) 596132775Skan DIP_SET(xp, i_db[loc], blkno); 59790098Smckusick if (!error) 598140706Sjeff error = ffs_freefile(ump, copy_fs, vp, xp->i_number, 599207141Sjeff xp->i_mode, NULL); 600175294Sattilio VOP_UNLOCK(xvp, 0); 601156560Stegge vdrop(xvp); 60290098Smckusick if (error) { 60390098Smckusick free(copy_fs->fs_csp, M_UFSMNT); 604225807Smckusick free(copy_fs, M_UFSMNT); 605225807Smckusick copy_fs = NULL; 606234386Smckusick MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 60790098Smckusick goto out1; 60890098Smckusick } 60990098Smckusick } 61090098Smckusick /* 611207141Sjeff * Erase the journal file from the snapshot. 612207141Sjeff */ 613207141Sjeff if (fs->fs_flags & FS_SUJ) { 614207141Sjeff error = softdep_journal_lookup(mp, &xvp); 615207141Sjeff if (error) { 616207141Sjeff free(copy_fs->fs_csp, M_UFSMNT); 617225807Smckusick free(copy_fs, M_UFSMNT); 618225807Smckusick copy_fs = NULL; 619207141Sjeff goto out1; 620207141Sjeff } 621207141Sjeff xp = VTOI(xvp); 622207141Sjeff if (xp->i_ump->um_fstype == UFS1) 623207141Sjeff error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, 624207141Sjeff BLK_NOCOPY, 0); 625207141Sjeff else 626207141Sjeff error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, 627207141Sjeff BLK_NOCOPY, 0); 628207141Sjeff vput(xvp); 629207141Sjeff } 630207141Sjeff /* 631177778Sjeff * Acquire a lock on the snapdata structure, creating it if necessary. 632105191Smckusick */ 633177778Sjeff sn = ffs_snapdata_acquire(devvp); 634177778Sjeff /* 635177778Sjeff * Change vnode to use shared snapshot lock instead of the original 636177778Sjeff * private lock. 637177778Sjeff */ 638177778Sjeff vp->v_vnlock = &sn->sn_lock; 639175635Sattilio lockmgr(&vp->v_lock, LK_RELEASE, NULL); 640177778Sjeff xp = TAILQ_FIRST(&sn->sn_head); 641105191Smckusick /* 642111240Smckusick * If this is the first snapshot on this filesystem, then we need 643111240Smckusick * to allocate the space for the list of preallocated snapshot blocks. 644111240Smckusick * This list will be refined below, but this preliminary one will 645111240Smckusick * keep us out of deadlock until the full one is ready. 646111240Smckusick */ 647111240Smckusick if (xp == NULL) { 648184205Sdes snapblklist = malloc(snaplistsize * sizeof(daddr_t), 649111240Smckusick M_UFSMNT, M_WAITOK); 650111240Smckusick blkp = &snapblklist[1]; 651111240Smckusick *blkp++ = lblkno(fs, fs->fs_sblockloc); 652111240Smckusick blkno = fragstoblks(fs, fs->fs_csaddr); 653111240Smckusick for (cg = 0; cg < fs->fs_ncg; cg++) { 654111240Smckusick if (fragstoblks(fs, cgtod(fs, cg) > blkno)) 655111240Smckusick break; 656111240Smckusick *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 657111240Smckusick } 658111240Smckusick len = howmany(fs->fs_cssize, fs->fs_bsize); 659111240Smckusick for (loc = 0; loc < len; loc++) 660111240Smckusick *blkp++ = blkno + loc; 661111240Smckusick for (; cg < fs->fs_ncg; cg++) 662111240Smckusick *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 663111240Smckusick snapblklist[0] = blkp - snapblklist; 664111240Smckusick VI_LOCK(devvp); 665135138Sphk if (sn->sn_blklist != NULL) 666111240Smckusick panic("ffs_snapshot: non-empty list"); 667135138Sphk sn->sn_blklist = snapblklist; 668135138Sphk sn->sn_listsize = blkp - snapblklist; 669111240Smckusick VI_UNLOCK(devvp); 670111240Smckusick } 671111240Smckusick /* 67262976Smckusick * Record snapshot inode. Since this is the newest snapshot, 67362976Smckusick * it must be placed at the end of the list. 67462976Smckusick */ 675107414Smckusick VI_LOCK(devvp); 67662976Smckusick fs->fs_snapinum[snaploc] = ip->i_number; 67773942Smckusick if (ip->i_nextsnap.tqe_prev != 0) 678241011Smdf panic("ffs_snapshot: %ju already on list", 679241011Smdf (uintmax_t)ip->i_number); 680135138Sphk TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); 681107414Smckusick devvp->v_vflag |= VV_COPYONWRITE; 682107414Smckusick VI_UNLOCK(devvp); 683101308Sjeff ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp"); 68487827Smckusickout1: 685225807Smckusick KASSERT((sn != NULL && copy_fs != NULL && error == 0) || 686225807Smckusick (sn == NULL && copy_fs == NULL && error != 0), 687158632Stegge ("email phk@ and mckusick@")); 68862976Smckusick /* 68962976Smckusick * Resume operation on filesystem. 69062976Smckusick */ 691245286Skib vfs_write_resume(vp->v_mount, VR_START_WRITE | VR_NO_SUSPCLR); 69287827Smckusick if (collectsnapstats && starttime.tv_sec > 0) { 69387827Smckusick nanotime(&endtime); 69487827Smckusick timespecsub(&endtime, &starttime); 695106965Speter printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", 696106965Speter vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, 69787827Smckusick endtime.tv_nsec / 1000000, redo, fs->fs_ncg); 69887827Smckusick } 699225807Smckusick if (copy_fs == NULL) 70090098Smckusick goto out; 70190098Smckusick /* 70290098Smckusick * Copy allocation information from all the snapshots in 70390098Smckusick * this snapshot and then expunge them from its view. 70490098Smckusick */ 705135138Sphk TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) { 70690098Smckusick if (xp == ip) 70790098Smckusick break; 70898542Smckusick if (xp->i_ump->um_fstype == UFS1) 70998542Smckusick error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, 710207141Sjeff BLK_SNAP, 0); 71198542Smckusick else 71298542Smckusick error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, 713207141Sjeff BLK_SNAP, 0); 714158527Stegge if (error == 0 && xp->i_effnlink == 0) { 715158527Stegge error = ffs_freefile(ump, 716158527Stegge copy_fs, 717158527Stegge vp, 718158527Stegge xp->i_number, 719207141Sjeff xp->i_mode, NULL); 720158527Stegge } 72198542Smckusick if (error) { 72290098Smckusick fs->fs_snapinum[snaploc] = 0; 72390098Smckusick goto done; 72487827Smckusick } 72590098Smckusick } 72690098Smckusick /* 727111240Smckusick * Allocate space for the full list of preallocated snapshot blocks. 728104698Smckusick */ 729184205Sdes snapblklist = malloc(snaplistsize * sizeof(daddr_t), 730111119Simp M_UFSMNT, M_WAITOK); 731107915Smckusick ip->i_snapblklist = &snapblklist[1]; 732104698Smckusick /* 73390098Smckusick * Expunge the blocks used by the snapshots from the set of 734104698Smckusick * blocks marked as used in the snapshot bitmaps. Also, collect 735107915Smckusick * the list of allocated blocks in i_snapblklist. 73690098Smckusick */ 73798542Smckusick if (ip->i_ump->um_fstype == UFS1) 738207141Sjeff error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, 739207141Sjeff BLK_SNAP, 0); 74098542Smckusick else 741207141Sjeff error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, 742207141Sjeff BLK_SNAP, 0); 74398542Smckusick if (error) { 74490098Smckusick fs->fs_snapinum[snaploc] = 0; 745184205Sdes free(snapblklist, M_UFSMNT); 74690098Smckusick goto done; 74790098Smckusick } 748111240Smckusick if (snaplistsize < ip->i_snapblklist - snapblklist) 749111240Smckusick panic("ffs_snapshot: list too small"); 750107915Smckusick snaplistsize = ip->i_snapblklist - snapblklist; 751107848Smckusick snapblklist[0] = snaplistsize; 752107915Smckusick ip->i_snapblklist = 0; 75390098Smckusick /* 754104698Smckusick * Write out the list of allocated blocks to the end of the snapshot. 755104698Smckusick */ 756104698Smckusick auio.uio_iov = &aiov; 757104698Smckusick auio.uio_iovcnt = 1; 758107848Smckusick aiov.iov_base = (void *)snapblklist; 759107848Smckusick aiov.iov_len = snaplistsize * sizeof(daddr_t); 760201758Smbr auio.uio_resid = aiov.iov_len; 761104698Smckusick auio.uio_offset = ip->i_size; 762104698Smckusick auio.uio_segflg = UIO_SYSSPACE; 763104698Smckusick auio.uio_rw = UIO_WRITE; 764104698Smckusick auio.uio_td = td; 765104698Smckusick if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { 766104698Smckusick fs->fs_snapinum[snaploc] = 0; 767184205Sdes free(snapblklist, M_UFSMNT); 768104698Smckusick goto done; 769104698Smckusick } 770104698Smckusick /* 77190098Smckusick * Write the superblock and its summary information 77290098Smckusick * to the snapshot. 77390098Smckusick */ 77490098Smckusick blkno = fragstoblks(fs, fs->fs_csaddr); 77590098Smckusick len = howmany(fs->fs_cssize, fs->fs_bsize); 77690098Smckusick space = copy_fs->fs_csp; 77790098Smckusick for (loc = 0; loc < len; loc++) { 77890098Smckusick error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); 77990098Smckusick if (error) { 78090098Smckusick brelse(nbp); 78190098Smckusick fs->fs_snapinum[snaploc] = 0; 782184205Sdes free(snapblklist, M_UFSMNT); 78390098Smckusick goto done; 78476269Smckusick } 78590098Smckusick bcopy(space, nbp->b_data, fs->fs_bsize); 78690098Smckusick space = (char *)space + fs->fs_bsize; 78790098Smckusick bawrite(nbp); 78876269Smckusick } 789225807Smckusick error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, 790225807Smckusick KERNCRED, &nbp); 791225807Smckusick if (error) { 792225807Smckusick brelse(nbp); 793225807Smckusick } else { 794225807Smckusick loc = blkoff(fs, fs->fs_sblockloc); 795253280Skib bcopy((char *)copy_fs, &nbp->b_data[loc], (u_int)fs->fs_sbsize); 796225807Smckusick bawrite(nbp); 797225807Smckusick } 798107848Smckusick /* 799107848Smckusick * As this is the newest list, it is the most inclusive, so 800107848Smckusick * should replace the previous list. 801107848Smckusick */ 802107848Smckusick VI_LOCK(devvp); 803135138Sphk space = sn->sn_blklist; 804135138Sphk sn->sn_blklist = snapblklist; 805135138Sphk sn->sn_listsize = snaplistsize; 806122596Salc VI_UNLOCK(devvp); 807111240Smckusick if (space != NULL) 808184205Sdes free(space, M_UFSMNT); 809151180Stegge /* 810232351Smckusick * Preallocate all the direct blocks in the snapshot inode so 811232351Smckusick * that we never have to write the inode itself to commit an 812232351Smckusick * update to the contents of the snapshot. Note that once 813232351Smckusick * created, the size of the snapshot will never change, so 814232351Smckusick * there will never be a need to write the inode except to 815232351Smckusick * update the non-integrity-critical time fields and 816232351Smckusick * allocated-block count. 817151180Stegge */ 818232351Smckusick for (blockno = 0; blockno < NDADDR; blockno++) { 819232351Smckusick if (DIP(ip, i_db[blockno]) != 0) 820232351Smckusick continue; 821232351Smckusick error = UFS_BALLOC(vp, lblktosize(fs, blockno), 822232351Smckusick fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); 823232351Smckusick if (error) 824232351Smckusick break; 825232351Smckusick error = readblock(vp, bp, blockno); 826232351Smckusick bawrite(bp); 827232351Smckusick if (error != 0) 828232351Smckusick break; 829232351Smckusick } 83090098Smckusickdone: 831184205Sdes free(copy_fs->fs_csp, M_UFSMNT); 832225807Smckusick free(copy_fs, M_UFSMNT); 833225807Smckusick copy_fs = NULL; 83462976Smckusickout: 835168576Skib NDFREE(&nd, NDF_ONLY_PNBUF); 836113872Sjhb if (saved_nice > 0) { 837170307Sjeff struct proc *p; 838170307Sjeff 839170307Sjeff p = td->td_proc; 840170307Sjeff PROC_LOCK(p); 841130551Sjulian sched_nice(td->td_proc, saved_nice); 842113872Sjhb PROC_UNLOCK(td->td_proc); 843113872Sjhb } 844140706Sjeff UFS_LOCK(ump); 84587827Smckusick if (fs->fs_active != 0) { 846184205Sdes free(fs->fs_active, M_DEVBUF); 84787827Smckusick fs->fs_active = 0; 84887827Smckusick } 849140706Sjeff UFS_UNLOCK(ump); 850162647Stegge MNT_ILOCK(mp); 851162652Stegge mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); 852162647Stegge MNT_IUNLOCK(mp); 85376269Smckusick if (error) 854234605Strasz (void) ffs_truncate(vp, (off_t)0, 0, NOCRED); 855233438Smckusick (void) ffs_syncvnode(vp, MNT_WAIT, 0); 85662976Smckusick if (error) 85762976Smckusick vput(vp); 85862976Smckusick else 859175294Sattilio VOP_UNLOCK(vp, 0); 860156895Stegge vrele(nd.ni_dvp); 86162976Smckusick vn_finished_write(wrtmp); 862156560Stegge process_deferred_inactive(mp); 86362976Smckusick return (error); 86462976Smckusick} 86562976Smckusick 86662976Smckusick/* 86787827Smckusick * Copy a cylinder group map. All the unallocated blocks are marked 86887827Smckusick * BLK_NOCOPY so that the snapshot knows that it need not copy them 86992363Smckusick * if they are later written. If passno is one, then this is a first 87092363Smckusick * pass, so only setting needs to be done. If passno is 2, then this 87187827Smckusick * is a revision to a previous pass which must be undone as the 87287827Smckusick * replacement pass is done. 87387827Smckusick */ 87487827Smckusickstatic int 87587827Smckusickcgaccount(cg, vp, nbp, passno) 87687827Smckusick int cg; 87787827Smckusick struct vnode *vp; 87887827Smckusick struct buf *nbp; 87987827Smckusick int passno; 88087827Smckusick{ 88187827Smckusick struct buf *bp, *ibp; 88287827Smckusick struct inode *ip; 88387827Smckusick struct cg *cgp; 88487827Smckusick struct fs *fs; 88598542Smckusick ufs2_daddr_t base, numblks; 88698542Smckusick int error, len, loc, indiroff; 88787827Smckusick 88887827Smckusick ip = VTOI(vp); 88987827Smckusick fs = ip->i_fs; 89087827Smckusick error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), 89187827Smckusick (int)fs->fs_cgsize, KERNCRED, &bp); 89287827Smckusick if (error) { 89387827Smckusick brelse(bp); 89487827Smckusick return (error); 89587827Smckusick } 89687827Smckusick cgp = (struct cg *)bp->b_data; 89787827Smckusick if (!cg_chkmagic(cgp)) { 89887827Smckusick brelse(bp); 89987827Smckusick return (EIO); 90087827Smckusick } 901140706Sjeff UFS_LOCK(ip->i_ump); 902142879Sjeff ACTIVESET(fs, cg); 903183822Skib /* 904183822Skib * Recomputation of summary information might not have been performed 905183822Skib * at mount time. Sync up summary information for current cylinder 906183822Skib * group while data is in memory to ensure that result of background 907183822Skib * fsck is slightly more consistent. 908183822Skib */ 909183822Skib fs->fs_cs(fs, cg) = cgp->cg_cs; 910140706Sjeff UFS_UNLOCK(ip->i_ump); 91187827Smckusick bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize); 91287827Smckusick if (fs->fs_cgsize < fs->fs_bsize) 91387827Smckusick bzero(&nbp->b_data[fs->fs_cgsize], 91487827Smckusick fs->fs_bsize - fs->fs_cgsize); 915151178Stegge cgp = (struct cg *)nbp->b_data; 916151178Stegge bqrelse(bp); 91787827Smckusick if (passno == 2) 91887827Smckusick nbp->b_flags |= B_VALIDSUSPWRT; 91987827Smckusick numblks = howmany(fs->fs_size, fs->fs_frag); 92087827Smckusick len = howmany(fs->fs_fpg, fs->fs_frag); 921138634Smckusick base = cgbase(fs, cg) / fs->fs_frag; 92287827Smckusick if (base + len >= numblks) 92387827Smckusick len = numblks - base - 1; 92487827Smckusick loc = 0; 92587827Smckusick if (base < NDADDR) { 92687827Smckusick for ( ; loc < NDADDR; loc++) { 92787827Smckusick if (ffs_isblock(fs, cg_blksfree(cgp), loc)) 928132775Skan DIP_SET(ip, i_db[loc], BLK_NOCOPY); 92998542Smckusick else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY) 930132775Skan DIP_SET(ip, i_db[loc], 0); 93198542Smckusick else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY) 93287827Smckusick panic("ffs_snapshot: lost direct block"); 93387827Smckusick } 93487827Smckusick } 93587827Smckusick error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), 93698658Sdillon fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 93787827Smckusick if (error) { 93887827Smckusick return (error); 93987827Smckusick } 94087827Smckusick indiroff = (base + loc - NDADDR) % NINDIR(fs); 94187827Smckusick for ( ; loc < len; loc++, indiroff++) { 94287827Smckusick if (indiroff >= NINDIR(fs)) { 94387827Smckusick if (passno == 2) 94487827Smckusick ibp->b_flags |= B_VALIDSUSPWRT; 94587827Smckusick bawrite(ibp); 94687827Smckusick error = UFS_BALLOC(vp, 94787827Smckusick lblktosize(fs, (off_t)(base + loc)), 94898658Sdillon fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 94987827Smckusick if (error) { 95087827Smckusick return (error); 95187827Smckusick } 95287827Smckusick indiroff = 0; 95387827Smckusick } 95498542Smckusick if (ip->i_ump->um_fstype == UFS1) { 95598542Smckusick if (ffs_isblock(fs, cg_blksfree(cgp), loc)) 95698542Smckusick ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 95798542Smckusick BLK_NOCOPY; 95898542Smckusick else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data)) 95998542Smckusick [indiroff] == BLK_NOCOPY) 96098542Smckusick ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0; 96198542Smckusick else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data)) 96298542Smckusick [indiroff] == BLK_NOCOPY) 96398542Smckusick panic("ffs_snapshot: lost indirect block"); 96498542Smckusick continue; 96598542Smckusick } 96687827Smckusick if (ffs_isblock(fs, cg_blksfree(cgp), loc)) 96798542Smckusick ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; 96887827Smckusick else if (passno == 2 && 96998542Smckusick ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) 97098542Smckusick ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0; 97187827Smckusick else if (passno == 1 && 97298542Smckusick ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) 97387827Smckusick panic("ffs_snapshot: lost indirect block"); 97487827Smckusick } 97587827Smckusick if (passno == 2) 97687827Smckusick ibp->b_flags |= B_VALIDSUSPWRT; 97787827Smckusick bdwrite(ibp); 97887827Smckusick return (0); 97987827Smckusick} 98087827Smckusick 98187827Smckusick/* 98276269Smckusick * Before expunging a snapshot inode, note all the 98376269Smckusick * blocks that it claims with BLK_SNAP so that fsck will 98476269Smckusick * be able to account for those blocks properly and so 98576269Smckusick * that this snapshot knows that it need not copy them 98698542Smckusick * if the other snapshot holding them is freed. This code 98798542Smckusick * is reproduced once each for UFS1 and UFS2. 98876269Smckusick */ 98976269Smckusickstatic int 990207141Sjeffexpunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype, clearmode) 99190098Smckusick struct vnode *snapvp; 99290098Smckusick struct inode *cancelip; 99376269Smckusick struct fs *fs; 99498542Smckusick int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 99598542Smckusick struct fs *, ufs_lbn_t, int); 99690098Smckusick int expungetype; 997207141Sjeff int clearmode; 99876269Smckusick{ 99998542Smckusick int i, error, indiroff; 100098542Smckusick ufs_lbn_t lbn, rlbn; 100198542Smckusick ufs2_daddr_t len, blkno, numblks, blksperindir; 100298542Smckusick struct ufs1_dinode *dip; 100390098Smckusick struct thread *td = curthread; 100476269Smckusick struct buf *bp; 100576269Smckusick 100676269Smckusick /* 100790098Smckusick * Prepare to expunge the inode. If its inode block has not 100890098Smckusick * yet been copied, then allocate and fill the copy. 100976269Smckusick */ 101090098Smckusick lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 101190098Smckusick blkno = 0; 101290098Smckusick if (lbn < NDADDR) { 1013107558Smckusick blkno = VTOI(snapvp)->i_din1->di_db[lbn]; 101490098Smckusick } else { 1015207742Sjeff if (DOINGSOFTDEP(snapvp)) 1016207742Sjeff softdep_prealloc(snapvp, MNT_WAIT); 1017121443Sjhb td->td_pflags |= TDP_COWINPROGRESS; 1018141526Sphk error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), 101998658Sdillon fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); 1020121443Sjhb td->td_pflags &= ~TDP_COWINPROGRESS; 102190098Smckusick if (error) 102290098Smckusick return (error); 102390098Smckusick indiroff = (lbn - NDADDR) % NINDIR(fs); 102498542Smckusick blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff]; 102590098Smckusick bqrelse(bp); 102690098Smckusick } 1027107558Smckusick if (blkno != 0) { 1028107558Smckusick if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) 1029107558Smckusick return (error); 1030107558Smckusick } else { 1031141526Sphk error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), 1032107558Smckusick fs->fs_bsize, KERNCRED, 0, &bp); 1033107558Smckusick if (error) 1034107558Smckusick return (error); 1035135138Sphk if ((error = readblock(snapvp, bp, lbn)) != 0) 1036107558Smckusick return (error); 1037107558Smckusick } 103890098Smckusick /* 103990098Smckusick * Set a snapshot inode to be a zero length file, regular files 1040158527Stegge * or unlinked snapshots to be completely unallocated. 104190098Smckusick */ 104298542Smckusick dip = (struct ufs1_dinode *)bp->b_data + 104398542Smckusick ino_to_fsbo(fs, cancelip->i_number); 1044207141Sjeff if (clearmode || cancelip->i_effnlink == 0) 104590098Smckusick dip->di_mode = 0; 104676269Smckusick dip->di_size = 0; 104776269Smckusick dip->di_blocks = 0; 104876269Smckusick dip->di_flags &= ~SF_SNAPSHOT; 104998542Smckusick bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t)); 105076269Smckusick bdwrite(bp); 1051107848Smckusick /* 1052107848Smckusick * Now go through and expunge all the blocks in the file 1053107848Smckusick * using the function requested. 1054107848Smckusick */ 1055107848Smckusick numblks = howmany(cancelip->i_size, fs->fs_bsize); 1056107848Smckusick if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0], 1057107848Smckusick &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype))) 1058107848Smckusick return (error); 1059107848Smckusick if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0], 1060107848Smckusick &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype))) 1061107848Smckusick return (error); 1062107848Smckusick blksperindir = 1; 1063107848Smckusick lbn = -NDADDR; 1064107848Smckusick len = numblks - NDADDR; 1065107848Smckusick rlbn = NDADDR; 1066107848Smckusick for (i = 0; len > 0 && i < NIADDR; i++) { 1067107848Smckusick error = indiracct_ufs1(snapvp, ITOV(cancelip), i, 1068107848Smckusick cancelip->i_din1->di_ib[i], lbn, rlbn, len, 1069107848Smckusick blksperindir, fs, acctfunc, expungetype); 1070107848Smckusick if (error) 1071107848Smckusick return (error); 1072107848Smckusick blksperindir *= NINDIR(fs); 1073107848Smckusick lbn -= blksperindir + 1; 1074107848Smckusick len -= blksperindir; 1075107848Smckusick rlbn += blksperindir; 1076107848Smckusick } 107776269Smckusick return (0); 107876269Smckusick} 107976269Smckusick 108076269Smckusick/* 108162976Smckusick * Descend an indirect block chain for vnode cancelvp accounting for all 108262976Smckusick * its indirect blocks in snapvp. 108362976Smckusick */ 108462976Smckusickstatic int 108598542Smckusickindiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, 108698542Smckusick blksperindir, fs, acctfunc, expungetype) 108762976Smckusick struct vnode *snapvp; 108862976Smckusick struct vnode *cancelvp; 108962976Smckusick int level; 109098542Smckusick ufs1_daddr_t blkno; 109198542Smckusick ufs_lbn_t lbn; 109298542Smckusick ufs_lbn_t rlbn; 109398542Smckusick ufs_lbn_t remblks; 109498542Smckusick ufs_lbn_t blksperindir; 109576269Smckusick struct fs *fs; 109698542Smckusick int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 109798542Smckusick struct fs *, ufs_lbn_t, int); 109890098Smckusick int expungetype; 109962976Smckusick{ 110098542Smckusick int error, num, i; 110198542Smckusick ufs_lbn_t subblksperindir; 110262976Smckusick struct indir indirs[NIADDR + 2]; 110398542Smckusick ufs1_daddr_t last, *bap; 110462976Smckusick struct buf *bp; 110562976Smckusick 1106121158Smckusick if (blkno == 0) { 1107121158Smckusick if (expungetype == BLK_NOCOPY) 1108121158Smckusick return (0); 1109121158Smckusick panic("indiracct_ufs1: missing indir"); 1110121158Smckusick } 111162976Smckusick if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 111262976Smckusick return (error); 1113121158Smckusick if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 1114121158Smckusick panic("indiracct_ufs1: botched params"); 111562976Smckusick /* 111662976Smckusick * We have to expand bread here since it will deadlock looking 111762976Smckusick * up the block number for any blocks that are not in the cache. 111862976Smckusick */ 1119111856Sjeff bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); 112062976Smckusick bp->b_blkno = fsbtodb(fs, blkno); 112162976Smckusick if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && 1122135138Sphk (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { 112362976Smckusick brelse(bp); 112462976Smckusick return (error); 112562976Smckusick } 112662976Smckusick /* 112762976Smckusick * Account for the block pointers in this indirect block. 112862976Smckusick */ 112962976Smckusick last = howmany(remblks, blksperindir); 113062976Smckusick if (last > NINDIR(fs)) 113162976Smckusick last = NINDIR(fs); 1132184205Sdes bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); 113376269Smckusick bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 113476269Smckusick bqrelse(bp); 1135107848Smckusick error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 1136107848Smckusick level == 0 ? rlbn : -1, expungetype); 113762976Smckusick if (error || level == 0) 113862976Smckusick goto out; 113962976Smckusick /* 114062976Smckusick * Account for the block pointers in each of the indirect blocks 114162976Smckusick * in the levels below us. 114262976Smckusick */ 114362976Smckusick subblksperindir = blksperindir / NINDIR(fs); 114462976Smckusick for (lbn++, level--, i = 0; i < last; i++) { 114598542Smckusick error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn, 114690098Smckusick rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); 114762976Smckusick if (error) 114862976Smckusick goto out; 114962976Smckusick rlbn += blksperindir; 115062976Smckusick lbn -= blksperindir; 115162976Smckusick remblks -= blksperindir; 115262976Smckusick } 115362976Smckusickout: 1154184205Sdes free(bap, M_DEVBUF); 115562976Smckusick return (error); 115662976Smckusick} 115762976Smckusick 115862976Smckusick/* 115990098Smckusick * Do both snap accounting and map accounting. 116090098Smckusick */ 116190098Smckusickstatic int 116298542Smckusickfullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype) 116390098Smckusick struct vnode *vp; 116498542Smckusick ufs1_daddr_t *oldblkp, *lastblkp; 116590098Smckusick struct fs *fs; 116698542Smckusick ufs_lbn_t lblkno; 116798542Smckusick int exptype; /* BLK_SNAP or BLK_NOCOPY */ 116898542Smckusick{ 116998542Smckusick int error; 117098542Smckusick 117198542Smckusick if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 117298542Smckusick return (error); 117398542Smckusick return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 117498542Smckusick} 117598542Smckusick 117698542Smckusick/* 117798542Smckusick * Identify a set of blocks allocated in a snapshot inode. 117898542Smckusick */ 117998542Smckusickstatic int 118098542Smckusicksnapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 118198542Smckusick struct vnode *vp; 118298542Smckusick ufs1_daddr_t *oldblkp, *lastblkp; 118398542Smckusick struct fs *fs; 118498542Smckusick ufs_lbn_t lblkno; 118590098Smckusick int expungetype; /* BLK_SNAP or BLK_NOCOPY */ 118690098Smckusick{ 118798542Smckusick struct inode *ip = VTOI(vp); 118898542Smckusick ufs1_daddr_t blkno, *blkp; 118998542Smckusick ufs_lbn_t lbn; 119098542Smckusick struct buf *ibp; 119190098Smckusick int error; 119290098Smckusick 119398542Smckusick for ( ; oldblkp < lastblkp; oldblkp++) { 119498542Smckusick blkno = *oldblkp; 119598542Smckusick if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 119698542Smckusick continue; 119798542Smckusick lbn = fragstoblks(fs, blkno); 119898542Smckusick if (lbn < NDADDR) { 119998542Smckusick blkp = &ip->i_din1->di_db[lbn]; 120098542Smckusick ip->i_flag |= IN_CHANGE | IN_UPDATE; 120198542Smckusick } else { 1202141526Sphk error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn), 120398658Sdillon fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 120498542Smckusick if (error) 120598542Smckusick return (error); 120698542Smckusick blkp = &((ufs1_daddr_t *)(ibp->b_data)) 120798542Smckusick [(lbn - NDADDR) % NINDIR(fs)]; 120898542Smckusick } 120998542Smckusick /* 121098542Smckusick * If we are expunging a snapshot vnode and we 121198542Smckusick * find a block marked BLK_NOCOPY, then it is 121298542Smckusick * one that has been allocated to this snapshot after 121398542Smckusick * we took our current snapshot and can be ignored. 121498542Smckusick */ 121598542Smckusick if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { 121698542Smckusick if (lbn >= NDADDR) 121798542Smckusick brelse(ibp); 121898542Smckusick } else { 121998542Smckusick if (*blkp != 0) 1220121158Smckusick panic("snapacct_ufs1: bad block"); 122198542Smckusick *blkp = expungetype; 122298542Smckusick if (lbn >= NDADDR) 122398542Smckusick bdwrite(ibp); 122498542Smckusick } 122598542Smckusick } 122698542Smckusick return (0); 122798542Smckusick} 122898542Smckusick 122998542Smckusick/* 123098542Smckusick * Account for a set of blocks allocated in a snapshot inode. 123198542Smckusick */ 123298542Smckusickstatic int 123398542Smckusickmapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 123498542Smckusick struct vnode *vp; 123598542Smckusick ufs1_daddr_t *oldblkp, *lastblkp; 123698542Smckusick struct fs *fs; 123798542Smckusick ufs_lbn_t lblkno; 123898542Smckusick int expungetype; 123998542Smckusick{ 124098542Smckusick ufs1_daddr_t blkno; 1241104698Smckusick struct inode *ip; 124298542Smckusick ino_t inum; 1243108050Smckusick int acctit; 124498542Smckusick 1245104698Smckusick ip = VTOI(vp); 1246104698Smckusick inum = ip->i_number; 1247108050Smckusick if (lblkno == -1) 1248108050Smckusick acctit = 0; 1249108050Smckusick else 1250108050Smckusick acctit = 1; 125198542Smckusick for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { 125298542Smckusick blkno = *oldblkp; 125398542Smckusick if (blkno == 0 || blkno == BLK_NOCOPY) 125498542Smckusick continue; 1255108050Smckusick if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) 1256107915Smckusick *ip->i_snapblklist++ = lblkno; 125798542Smckusick if (blkno == BLK_SNAP) 125898542Smckusick blkno = blkstofrags(fs, lblkno); 1259223127Smckusick ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, 1260223127Smckusick vp->v_type, NULL); 126198542Smckusick } 126298542Smckusick return (0); 126398542Smckusick} 126498542Smckusick 126598542Smckusick/* 126698542Smckusick * Before expunging a snapshot inode, note all the 126798542Smckusick * blocks that it claims with BLK_SNAP so that fsck will 126898542Smckusick * be able to account for those blocks properly and so 126998542Smckusick * that this snapshot knows that it need not copy them 127098542Smckusick * if the other snapshot holding them is freed. This code 127198542Smckusick * is reproduced once each for UFS1 and UFS2. 127298542Smckusick */ 127398542Smckusickstatic int 1274207141Sjeffexpunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype, clearmode) 127598542Smckusick struct vnode *snapvp; 127698542Smckusick struct inode *cancelip; 127798542Smckusick struct fs *fs; 127898542Smckusick int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 127998542Smckusick struct fs *, ufs_lbn_t, int); 128098542Smckusick int expungetype; 1281207141Sjeff int clearmode; 128298542Smckusick{ 128398542Smckusick int i, error, indiroff; 128498542Smckusick ufs_lbn_t lbn, rlbn; 128598542Smckusick ufs2_daddr_t len, blkno, numblks, blksperindir; 128698542Smckusick struct ufs2_dinode *dip; 128798542Smckusick struct thread *td = curthread; 128898542Smckusick struct buf *bp; 128998542Smckusick 129098542Smckusick /* 129198542Smckusick * Prepare to expunge the inode. If its inode block has not 129298542Smckusick * yet been copied, then allocate and fill the copy. 129398542Smckusick */ 129498542Smckusick lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 129598542Smckusick blkno = 0; 129698542Smckusick if (lbn < NDADDR) { 1297107558Smckusick blkno = VTOI(snapvp)->i_din2->di_db[lbn]; 129898542Smckusick } else { 1299207742Sjeff if (DOINGSOFTDEP(snapvp)) 1300207742Sjeff softdep_prealloc(snapvp, MNT_WAIT); 1301121443Sjhb td->td_pflags |= TDP_COWINPROGRESS; 1302141526Sphk error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), 130398658Sdillon fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); 1304121443Sjhb td->td_pflags &= ~TDP_COWINPROGRESS; 130598542Smckusick if (error) 130698542Smckusick return (error); 130798542Smckusick indiroff = (lbn - NDADDR) % NINDIR(fs); 130898542Smckusick blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; 130998542Smckusick bqrelse(bp); 131098542Smckusick } 1311107558Smckusick if (blkno != 0) { 1312107558Smckusick if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) 1313107558Smckusick return (error); 1314107558Smckusick } else { 1315141526Sphk error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), 1316107558Smckusick fs->fs_bsize, KERNCRED, 0, &bp); 1317107558Smckusick if (error) 1318107558Smckusick return (error); 1319135138Sphk if ((error = readblock(snapvp, bp, lbn)) != 0) 1320107558Smckusick return (error); 1321107558Smckusick } 132298542Smckusick /* 132398542Smckusick * Set a snapshot inode to be a zero length file, regular files 132498542Smckusick * to be completely unallocated. 132598542Smckusick */ 132698542Smckusick dip = (struct ufs2_dinode *)bp->b_data + 132798542Smckusick ino_to_fsbo(fs, cancelip->i_number); 1328207141Sjeff if (clearmode || cancelip->i_effnlink == 0) 132998542Smckusick dip->di_mode = 0; 133098542Smckusick dip->di_size = 0; 133198542Smckusick dip->di_blocks = 0; 133298542Smckusick dip->di_flags &= ~SF_SNAPSHOT; 133398542Smckusick bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t)); 133498542Smckusick bdwrite(bp); 1335107848Smckusick /* 1336107848Smckusick * Now go through and expunge all the blocks in the file 1337107848Smckusick * using the function requested. 1338107848Smckusick */ 1339107848Smckusick numblks = howmany(cancelip->i_size, fs->fs_bsize); 1340107848Smckusick if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0], 1341107848Smckusick &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype))) 1342107848Smckusick return (error); 1343107848Smckusick if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0], 1344107848Smckusick &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype))) 1345107848Smckusick return (error); 1346107848Smckusick blksperindir = 1; 1347107848Smckusick lbn = -NDADDR; 1348107848Smckusick len = numblks - NDADDR; 1349107848Smckusick rlbn = NDADDR; 1350107848Smckusick for (i = 0; len > 0 && i < NIADDR; i++) { 1351107848Smckusick error = indiracct_ufs2(snapvp, ITOV(cancelip), i, 1352107848Smckusick cancelip->i_din2->di_ib[i], lbn, rlbn, len, 1353107848Smckusick blksperindir, fs, acctfunc, expungetype); 1354107848Smckusick if (error) 1355107848Smckusick return (error); 1356107848Smckusick blksperindir *= NINDIR(fs); 1357107848Smckusick lbn -= blksperindir + 1; 1358107848Smckusick len -= blksperindir; 1359107848Smckusick rlbn += blksperindir; 1360107848Smckusick } 136198542Smckusick return (0); 136290098Smckusick} 136390098Smckusick 136490098Smckusick/* 136598542Smckusick * Descend an indirect block chain for vnode cancelvp accounting for all 136698542Smckusick * its indirect blocks in snapvp. 136798542Smckusick */ 136898542Smckusickstatic int 136998542Smckusickindiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, 137098542Smckusick blksperindir, fs, acctfunc, expungetype) 137198542Smckusick struct vnode *snapvp; 137298542Smckusick struct vnode *cancelvp; 137398542Smckusick int level; 137498542Smckusick ufs2_daddr_t blkno; 137598542Smckusick ufs_lbn_t lbn; 137698542Smckusick ufs_lbn_t rlbn; 137798542Smckusick ufs_lbn_t remblks; 137898542Smckusick ufs_lbn_t blksperindir; 137998542Smckusick struct fs *fs; 138098542Smckusick int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 138198542Smckusick struct fs *, ufs_lbn_t, int); 138298542Smckusick int expungetype; 138398542Smckusick{ 138498542Smckusick int error, num, i; 138598542Smckusick ufs_lbn_t subblksperindir; 138698542Smckusick struct indir indirs[NIADDR + 2]; 138798542Smckusick ufs2_daddr_t last, *bap; 138898542Smckusick struct buf *bp; 138998542Smckusick 1390121158Smckusick if (blkno == 0) { 1391121158Smckusick if (expungetype == BLK_NOCOPY) 1392121158Smckusick return (0); 1393121158Smckusick panic("indiracct_ufs2: missing indir"); 1394121158Smckusick } 139598542Smckusick if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 139698542Smckusick return (error); 1397121158Smckusick if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 1398121158Smckusick panic("indiracct_ufs2: botched params"); 139998542Smckusick /* 140098542Smckusick * We have to expand bread here since it will deadlock looking 140198542Smckusick * up the block number for any blocks that are not in the cache. 140298542Smckusick */ 1403111856Sjeff bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); 140498542Smckusick bp->b_blkno = fsbtodb(fs, blkno); 140598542Smckusick if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && 1406135138Sphk (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { 140798542Smckusick brelse(bp); 140898542Smckusick return (error); 140998542Smckusick } 141098542Smckusick /* 141198542Smckusick * Account for the block pointers in this indirect block. 141298542Smckusick */ 141398542Smckusick last = howmany(remblks, blksperindir); 141498542Smckusick if (last > NINDIR(fs)) 141598542Smckusick last = NINDIR(fs); 1416184205Sdes bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); 141798542Smckusick bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 141898542Smckusick bqrelse(bp); 1419107848Smckusick error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 1420107848Smckusick level == 0 ? rlbn : -1, expungetype); 142198542Smckusick if (error || level == 0) 142298542Smckusick goto out; 142398542Smckusick /* 142498542Smckusick * Account for the block pointers in each of the indirect blocks 142598542Smckusick * in the levels below us. 142698542Smckusick */ 142798542Smckusick subblksperindir = blksperindir / NINDIR(fs); 142898542Smckusick for (lbn++, level--, i = 0; i < last; i++) { 142998542Smckusick error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn, 143098542Smckusick rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); 143198542Smckusick if (error) 143298542Smckusick goto out; 143398542Smckusick rlbn += blksperindir; 143498542Smckusick lbn -= blksperindir; 143598542Smckusick remblks -= blksperindir; 143698542Smckusick } 143798542Smckusickout: 1438184205Sdes free(bap, M_DEVBUF); 143998542Smckusick return (error); 144098542Smckusick} 144198542Smckusick 144298542Smckusick/* 144398542Smckusick * Do both snap accounting and map accounting. 144498542Smckusick */ 144598542Smckusickstatic int 144698542Smckusickfullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype) 144798542Smckusick struct vnode *vp; 144898542Smckusick ufs2_daddr_t *oldblkp, *lastblkp; 144998542Smckusick struct fs *fs; 145098542Smckusick ufs_lbn_t lblkno; 145198542Smckusick int exptype; /* BLK_SNAP or BLK_NOCOPY */ 145298542Smckusick{ 145398542Smckusick int error; 145498542Smckusick 145598542Smckusick if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 145698542Smckusick return (error); 145798542Smckusick return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 145898542Smckusick} 145998542Smckusick 146098542Smckusick/* 146187827Smckusick * Identify a set of blocks allocated in a snapshot inode. 146262976Smckusick */ 146362976Smckusickstatic int 146498542Smckusicksnapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 146562976Smckusick struct vnode *vp; 146698542Smckusick ufs2_daddr_t *oldblkp, *lastblkp; 146776269Smckusick struct fs *fs; 146898542Smckusick ufs_lbn_t lblkno; 146990098Smckusick int expungetype; /* BLK_SNAP or BLK_NOCOPY */ 147062976Smckusick{ 147162976Smckusick struct inode *ip = VTOI(vp); 147298542Smckusick ufs2_daddr_t blkno, *blkp; 147398542Smckusick ufs_lbn_t lbn; 147462976Smckusick struct buf *ibp; 147562976Smckusick int error; 147662976Smckusick 147762976Smckusick for ( ; oldblkp < lastblkp; oldblkp++) { 147862976Smckusick blkno = *oldblkp; 147962976Smckusick if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 148062976Smckusick continue; 148162976Smckusick lbn = fragstoblks(fs, blkno); 148262976Smckusick if (lbn < NDADDR) { 148398542Smckusick blkp = &ip->i_din2->di_db[lbn]; 148462976Smckusick ip->i_flag |= IN_CHANGE | IN_UPDATE; 148562976Smckusick } else { 1486141526Sphk error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn), 148798658Sdillon fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 148862976Smckusick if (error) 148962976Smckusick return (error); 149098542Smckusick blkp = &((ufs2_daddr_t *)(ibp->b_data)) 149162976Smckusick [(lbn - NDADDR) % NINDIR(fs)]; 149262976Smckusick } 149387827Smckusick /* 149490098Smckusick * If we are expunging a snapshot vnode and we 149590098Smckusick * find a block marked BLK_NOCOPY, then it is 149687827Smckusick * one that has been allocated to this snapshot after 149787827Smckusick * we took our current snapshot and can be ignored. 149887827Smckusick */ 149990098Smckusick if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { 150087827Smckusick if (lbn >= NDADDR) 150187827Smckusick brelse(ibp); 150287827Smckusick } else { 150387827Smckusick if (*blkp != 0) 1504121158Smckusick panic("snapacct_ufs2: bad block"); 150590098Smckusick *blkp = expungetype; 150687827Smckusick if (lbn >= NDADDR) 150787827Smckusick bdwrite(ibp); 150863788Smckusick } 150962976Smckusick } 151062976Smckusick return (0); 151162976Smckusick} 151262976Smckusick 151362976Smckusick/* 151476269Smckusick * Account for a set of blocks allocated in a snapshot inode. 151576269Smckusick */ 151676269Smckusickstatic int 151798542Smckusickmapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 151876269Smckusick struct vnode *vp; 151998542Smckusick ufs2_daddr_t *oldblkp, *lastblkp; 152076269Smckusick struct fs *fs; 152198542Smckusick ufs_lbn_t lblkno; 152290098Smckusick int expungetype; 152376269Smckusick{ 152498542Smckusick ufs2_daddr_t blkno; 1525104698Smckusick struct inode *ip; 152690098Smckusick ino_t inum; 1527108050Smckusick int acctit; 152876269Smckusick 1529104698Smckusick ip = VTOI(vp); 1530104698Smckusick inum = ip->i_number; 1531108050Smckusick if (lblkno == -1) 1532108050Smckusick acctit = 0; 1533108050Smckusick else 1534108050Smckusick acctit = 1; 153576269Smckusick for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { 153676269Smckusick blkno = *oldblkp; 153776269Smckusick if (blkno == 0 || blkno == BLK_NOCOPY) 153876269Smckusick continue; 1539108050Smckusick if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) 1540107915Smckusick *ip->i_snapblklist++ = lblkno; 154176269Smckusick if (blkno == BLK_SNAP) 154276269Smckusick blkno = blkstofrags(fs, lblkno); 1543223127Smckusick ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, 1544223127Smckusick vp->v_type, NULL); 154576269Smckusick } 154676269Smckusick return (0); 154776269Smckusick} 154876269Smckusick 154976269Smckusick/* 155070183Smckusick * Decrement extra reference on snapshot when last name is removed. 155170183Smckusick * It will not be freed until the last open reference goes away. 155270183Smckusick */ 155370183Smckusickvoid 155470183Smckusickffs_snapgone(ip) 155570183Smckusick struct inode *ip; 155670183Smckusick{ 155770183Smckusick struct inode *xp; 155874547Smckusick struct fs *fs; 155974547Smckusick int snaploc; 1560135138Sphk struct snapdata *sn; 1561140706Sjeff struct ufsmount *ump; 156270183Smckusick 156370183Smckusick /* 156470183Smckusick * Find snapshot in incore list. 156570183Smckusick */ 1566135138Sphk xp = NULL; 1567135138Sphk sn = ip->i_devvp->v_rdev->si_snapdata; 1568135138Sphk if (sn != NULL) 1569135138Sphk TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) 1570135138Sphk if (xp == ip) 1571135138Sphk break; 1572107848Smckusick if (xp != NULL) 1573107848Smckusick vrele(ITOV(ip)); 1574107848Smckusick else if (snapdebug) 1575241011Smdf printf("ffs_snapgone: lost snapshot vnode %ju\n", 1576241011Smdf (uintmax_t)ip->i_number); 157774547Smckusick /* 157874547Smckusick * Delete snapshot inode from superblock. Keep list dense. 157974547Smckusick */ 158074547Smckusick fs = ip->i_fs; 1581140706Sjeff ump = ip->i_ump; 1582140706Sjeff UFS_LOCK(ump); 158374547Smckusick for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 158474547Smckusick if (fs->fs_snapinum[snaploc] == ip->i_number) 158574547Smckusick break; 158674547Smckusick if (snaploc < FSMAXSNAP) { 158774547Smckusick for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { 158874547Smckusick if (fs->fs_snapinum[snaploc] == 0) 158974547Smckusick break; 159074547Smckusick fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; 159174547Smckusick } 159274547Smckusick fs->fs_snapinum[snaploc - 1] = 0; 159374547Smckusick } 1594140706Sjeff UFS_UNLOCK(ump); 159570183Smckusick} 159670183Smckusick 159770183Smckusick/* 159862976Smckusick * Prepare a snapshot file for being removed. 159962976Smckusick */ 160062976Smckusickvoid 160162976Smckusickffs_snapremove(vp) 160262976Smckusick struct vnode *vp; 160362976Smckusick{ 160473942Smckusick struct inode *ip; 160562976Smckusick struct vnode *devvp; 160662976Smckusick struct buf *ibp; 160762976Smckusick struct fs *fs; 1608158259Stegge ufs2_daddr_t numblks, blkno, dblk; 160998542Smckusick int error, loc, last; 1610135138Sphk struct snapdata *sn; 161162976Smckusick 161262976Smckusick ip = VTOI(vp); 161362976Smckusick fs = ip->i_fs; 1614107414Smckusick devvp = ip->i_devvp; 161562976Smckusick /* 161675943Smckusick * If active, delete from incore list (this snapshot may 161775943Smckusick * already have been in the process of being deleted, so 161875943Smckusick * would not have been active). 161975943Smckusick * 162062976Smckusick * Clear copy-on-write flag if last snapshot. 162162976Smckusick */ 1622158259Stegge VI_LOCK(devvp); 162375943Smckusick if (ip->i_nextsnap.tqe_prev != 0) { 1624158259Stegge sn = devvp->v_rdev->si_snapdata; 1625135138Sphk TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap); 1626107414Smckusick ip->i_nextsnap.tqe_prev = 0; 1627158259Stegge VI_UNLOCK(devvp); 1628175635Sattilio lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL); 1629158259Stegge KASSERT(vp->v_vnlock == &sn->sn_lock, 1630158259Stegge ("ffs_snapremove: lost lock mutation")); 1631105191Smckusick vp->v_vnlock = &vp->v_lock; 1632158259Stegge VI_LOCK(devvp); 1633175635Sattilio lockmgr(&sn->sn_lock, LK_RELEASE, NULL); 1634177778Sjeff try_free_snapdata(devvp); 1635158259Stegge } else 1636158259Stegge VI_UNLOCK(devvp); 163762976Smckusick /* 163862976Smckusick * Clear all BLK_NOCOPY fields. Pass any block claims to other 163962976Smckusick * snapshots that want them (see ffs_snapblkfree below). 164062976Smckusick */ 164162976Smckusick for (blkno = 1; blkno < NDADDR; blkno++) { 164298542Smckusick dblk = DIP(ip, i_db[blkno]); 1643151177Stegge if (dblk == 0) 1644151177Stegge continue; 164576356Smckusick if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 1646132775Skan DIP_SET(ip, i_db[blkno], 0); 164776356Smckusick else if ((dblk == blkstofrags(fs, blkno) && 164890098Smckusick ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, 1649223127Smckusick ip->i_number, vp->v_type, NULL))) { 1650132775Skan DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - 1651132775Skan btodb(fs->fs_bsize)); 1652132775Skan DIP_SET(ip, i_db[blkno], 0); 165376356Smckusick } 165462976Smckusick } 165576356Smckusick numblks = howmany(ip->i_size, fs->fs_bsize); 165676356Smckusick for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 165776132Sphk error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), 165898658Sdillon fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 165962976Smckusick if (error) 166062976Smckusick continue; 166198542Smckusick if (fs->fs_size - blkno > NINDIR(fs)) 166262976Smckusick last = NINDIR(fs); 166398542Smckusick else 166498542Smckusick last = fs->fs_size - blkno; 166562976Smckusick for (loc = 0; loc < last; loc++) { 166698542Smckusick if (ip->i_ump->um_fstype == UFS1) { 166798542Smckusick dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc]; 1668151177Stegge if (dblk == 0) 1669151177Stegge continue; 167098542Smckusick if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 167198542Smckusick ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; 167298542Smckusick else if ((dblk == blkstofrags(fs, blkno) && 167398542Smckusick ffs_snapblkfree(fs, ip->i_devvp, dblk, 1674223127Smckusick fs->fs_bsize, ip->i_number, vp->v_type, 1675223127Smckusick NULL))) { 167698542Smckusick ip->i_din1->di_blocks -= 167798542Smckusick btodb(fs->fs_bsize); 167898542Smckusick ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; 167998542Smckusick } 168098542Smckusick continue; 168198542Smckusick } 168298542Smckusick dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc]; 1683151177Stegge if (dblk == 0) 1684151177Stegge continue; 168576356Smckusick if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 168698542Smckusick ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; 168776356Smckusick else if ((dblk == blkstofrags(fs, blkno) && 168890098Smckusick ffs_snapblkfree(fs, ip->i_devvp, dblk, 1689223127Smckusick fs->fs_bsize, ip->i_number, vp->v_type, NULL))) { 169098542Smckusick ip->i_din2->di_blocks -= btodb(fs->fs_bsize); 169198542Smckusick ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; 169276356Smckusick } 169362976Smckusick } 169462976Smckusick bawrite(ibp); 169562976Smckusick } 169662976Smckusick /* 169762976Smckusick * Clear snapshot flag and drop reference. 169862976Smckusick */ 169963897Smckusick ip->i_flags &= ~SF_SNAPSHOT; 1700132775Skan DIP_SET(ip, i_flags, ip->i_flags); 170162976Smckusick ip->i_flag |= IN_CHANGE | IN_UPDATE; 1702207141Sjeff /* 1703207141Sjeff * The dirtied indirects must be written out before 1704207141Sjeff * softdep_setup_freeblocks() is called. Otherwise indir_trunc() 1705207141Sjeff * may find indirect pointers using the magic BLK_* values. 1706207141Sjeff */ 1707207141Sjeff if (DOINGSOFTDEP(vp)) 1708233438Smckusick ffs_syncvnode(vp, MNT_WAIT, 0); 1709158322Stegge#ifdef QUOTA 1710158322Stegge /* 1711158322Stegge * Reenable disk quotas for ex-snapshot file. 1712158322Stegge */ 1713158322Stegge if (!getinoquota(ip)) 1714158322Stegge (void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE); 1715158322Stegge#endif 171662976Smckusick} 171762976Smckusick 171862976Smckusick/* 171962976Smckusick * Notification that a block is being freed. Return zero if the free 172062976Smckusick * should be allowed to proceed. Return non-zero if the snapshot file 172162976Smckusick * wants to claim the block. The block will be claimed if it is an 172262976Smckusick * uncopied part of one of the snapshots. It will be freed if it is 172362976Smckusick * either a BLK_NOCOPY or has already been copied in all of the snapshots. 172462976Smckusick * If a fragment is being freed, then all snapshots that care about 172562976Smckusick * it must make a copy since a snapshot file can only claim full sized 172662976Smckusick * blocks. Note that if more than one snapshot file maps the block, 172762976Smckusick * we can pick one at random to claim it. Since none of the snapshots 172862976Smckusick * can change, we are assurred that they will all see the same unmodified 172962976Smckusick * image. When deleting a snapshot file (see ffs_snapremove above), we 173062976Smckusick * must push any of these claimed blocks to one of the other snapshots 173162976Smckusick * that maps it. These claimed blocks are easily identified as they will 173262976Smckusick * have a block number equal to their logical block number within the 173362976Smckusick * snapshot. A copied block can never have this property because they 173462976Smckusick * must always have been allocated from a BLK_NOCOPY location. 173562976Smckusick */ 173662976Smckusickint 1737223127Smckusickffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd) 173890098Smckusick struct fs *fs; 173990098Smckusick struct vnode *devvp; 174098542Smckusick ufs2_daddr_t bno; 174162976Smckusick long size; 174290098Smckusick ino_t inum; 1743223127Smckusick enum vtype vtype; 1744223020Smckusick struct workhead *wkhd; 174562976Smckusick{ 1746238697Skevlo struct buf *ibp, *cbp, *savedcbp = NULL; 174783366Sjulian struct thread *td = curthread; 174862976Smckusick struct inode *ip; 1749107414Smckusick struct vnode *vp = NULL; 175098542Smckusick ufs_lbn_t lbn; 175198542Smckusick ufs2_daddr_t blkno; 1752151177Stegge int indiroff = 0, error = 0, claimedblk = 0; 1753135138Sphk struct snapdata *sn; 175462976Smckusick 175562976Smckusick lbn = fragstoblks(fs, bno); 1756107414Smckusickretry: 1757107414Smckusick VI_LOCK(devvp); 1758135138Sphk sn = devvp->v_rdev->si_snapdata; 1759135312Sphk if (sn == NULL) { 1760135312Sphk VI_UNLOCK(devvp); 1761135312Sphk return (0); 1762135312Sphk } 1763175635Sattilio if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 1764175635Sattilio VI_MTX(devvp)) != 0) 1765151177Stegge goto retry; 1766135138Sphk TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { 176762976Smckusick vp = ITOV(ip); 1768207742Sjeff if (DOINGSOFTDEP(vp)) 1769207742Sjeff softdep_prealloc(vp, MNT_WAIT); 177062976Smckusick /* 177162976Smckusick * Lookup block being written. 177262976Smckusick */ 177362976Smckusick if (lbn < NDADDR) { 177498542Smckusick blkno = DIP(ip, i_db[lbn]); 177562976Smckusick } else { 1776121443Sjhb td->td_pflags |= TDP_COWINPROGRESS; 177776132Sphk error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 177898658Sdillon fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 1779121443Sjhb td->td_pflags &= ~TDP_COWINPROGRESS; 178062976Smckusick if (error) 178162976Smckusick break; 178262976Smckusick indiroff = (lbn - NDADDR) % NINDIR(fs); 178398542Smckusick if (ip->i_ump->um_fstype == UFS1) 178498542Smckusick blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; 178598542Smckusick else 178698542Smckusick blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; 178762976Smckusick } 178862976Smckusick /* 178962976Smckusick * Check to see if block needs to be copied. 179062976Smckusick */ 179198542Smckusick if (blkno == 0) { 179298542Smckusick /* 179398542Smckusick * A block that we map is being freed. If it has not 179498542Smckusick * been claimed yet, we will claim or copy it (below). 179598542Smckusick */ 179698542Smckusick claimedblk = 1; 179798542Smckusick } else if (blkno == BLK_SNAP) { 179898542Smckusick /* 179998542Smckusick * No previous snapshot claimed the block, 1800107414Smckusick * so it will be freed and become a BLK_NOCOPY 180198542Smckusick * (don't care) for us. 180298542Smckusick */ 180362976Smckusick if (claimedblk) 180462976Smckusick panic("snapblkfree: inconsistent block type"); 180562976Smckusick if (lbn < NDADDR) { 1806132775Skan DIP_SET(ip, i_db[lbn], BLK_NOCOPY); 180762976Smckusick ip->i_flag |= IN_CHANGE | IN_UPDATE; 180898542Smckusick } else if (ip->i_ump->um_fstype == UFS1) { 180998542Smckusick ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 181098542Smckusick BLK_NOCOPY; 181198542Smckusick bdwrite(ibp); 181262976Smckusick } else { 181398542Smckusick ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 181462976Smckusick BLK_NOCOPY; 181562976Smckusick bdwrite(ibp); 181662976Smckusick } 181762976Smckusick continue; 181898542Smckusick } else /* BLK_NOCOPY or default */ { 181998542Smckusick /* 182098542Smckusick * If the snapshot has already copied the block 182198542Smckusick * (default), or does not care about the block, 182298542Smckusick * it is not needed. 182398542Smckusick */ 182498542Smckusick if (lbn >= NDADDR) 182598542Smckusick bqrelse(ibp); 182698542Smckusick continue; 182762976Smckusick } 182862976Smckusick /* 182962976Smckusick * If this is a full size block, we will just grab it 183062976Smckusick * and assign it to the snapshot inode. Otherwise we 183162976Smckusick * will proceed to copy it. See explanation for this 183262976Smckusick * routine as to why only a single snapshot needs to 183362976Smckusick * claim this block. 183462976Smckusick */ 183562976Smckusick if (size == fs->fs_bsize) { 183662976Smckusick#ifdef DEBUG 183762976Smckusick if (snapdebug) 1838241011Smdf printf("%s %ju lbn %jd from inum %ju\n", 1839241011Smdf "Grabonremove: snapino", 1840241011Smdf (uintmax_t)ip->i_number, 1841241011Smdf (intmax_t)lbn, (uintmax_t)inum); 184262976Smckusick#endif 1843223020Smckusick /* 1844223020Smckusick * If journaling is tracking this write we must add 1845223020Smckusick * the work to the inode or indirect being written. 1846223020Smckusick */ 1847223020Smckusick if (wkhd != NULL) { 1848223020Smckusick if (lbn < NDADDR) 1849223020Smckusick softdep_inode_append(ip, 1850223020Smckusick curthread->td_ucred, wkhd); 1851223020Smckusick else 1852223020Smckusick softdep_buf_append(ibp, wkhd); 1853223020Smckusick } 185462976Smckusick if (lbn < NDADDR) { 1855132775Skan DIP_SET(ip, i_db[lbn], bno); 185698542Smckusick } else if (ip->i_ump->um_fstype == UFS1) { 185798542Smckusick ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno; 185898542Smckusick bdwrite(ibp); 185962976Smckusick } else { 186098542Smckusick ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno; 186162976Smckusick bdwrite(ibp); 186262976Smckusick } 1863132775Skan DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size)); 186462976Smckusick ip->i_flag |= IN_CHANGE | IN_UPDATE; 1865175635Sattilio lockmgr(vp->v_vnlock, LK_RELEASE, NULL); 186662976Smckusick return (1); 186762976Smckusick } 186862976Smckusick if (lbn >= NDADDR) 186963788Smckusick bqrelse(ibp); 187062976Smckusick /* 187162976Smckusick * Allocate the block into which to do the copy. Note that this 187262976Smckusick * allocation will never require any additional allocations for 187362976Smckusick * the snapshot inode. 187462976Smckusick */ 1875121443Sjhb td->td_pflags |= TDP_COWINPROGRESS; 187676132Sphk error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 187762976Smckusick fs->fs_bsize, KERNCRED, 0, &cbp); 1878121443Sjhb td->td_pflags &= ~TDP_COWINPROGRESS; 1879107414Smckusick if (error) 188062976Smckusick break; 188162976Smckusick#ifdef DEBUG 188262976Smckusick if (snapdebug) 1883241011Smdf printf("%s%ju lbn %jd %s %ju size %ld to blkno %jd\n", 1884241011Smdf "Copyonremove: snapino ", (uintmax_t)ip->i_number, 1885241011Smdf (intmax_t)lbn, "for inum", (uintmax_t)inum, size, 188698542Smckusick (intmax_t)cbp->b_blkno); 188762976Smckusick#endif 188862976Smckusick /* 188962976Smckusick * If we have already read the old block contents, then 189075943Smckusick * simply copy them to the new block. Note that we need 189175943Smckusick * to synchronously write snapshots that have not been 189275943Smckusick * unlinked, and hence will be visible after a crash, 1893223127Smckusick * to ensure their integrity. At a minimum we ensure the 1894223127Smckusick * integrity of the filesystem metadata, but use the 1895223127Smckusick * dopersistence sysctl-setable flag to decide on the 1896223127Smckusick * persistence needed for file content data. 189762976Smckusick */ 189862976Smckusick if (savedcbp != 0) { 189962976Smckusick bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); 190062976Smckusick bawrite(cbp); 1901223127Smckusick if ((vtype == VDIR || dopersistence) && 1902223127Smckusick ip->i_effnlink > 0) 1903233438Smckusick (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); 190462976Smckusick continue; 190562976Smckusick } 190662976Smckusick /* 190762976Smckusick * Otherwise, read the old block contents into the buffer. 190862976Smckusick */ 1909135138Sphk if ((error = readblock(vp, cbp, lbn)) != 0) { 191075943Smckusick bzero(cbp->b_data, fs->fs_bsize); 191175943Smckusick bawrite(cbp); 1912223127Smckusick if ((vtype == VDIR || dopersistence) && 1913223127Smckusick ip->i_effnlink > 0) 1914233438Smckusick (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); 191562976Smckusick break; 191675943Smckusick } 191762976Smckusick savedcbp = cbp; 191862976Smckusick } 191975943Smckusick /* 192075943Smckusick * Note that we need to synchronously write snapshots that 192175943Smckusick * have not been unlinked, and hence will be visible after 1922223127Smckusick * a crash, to ensure their integrity. At a minimum we 1923223127Smckusick * ensure the integrity of the filesystem metadata, but 1924223127Smckusick * use the dopersistence sysctl-setable flag to decide on 1925223127Smckusick * the persistence needed for file content data. 192675943Smckusick */ 192775943Smckusick if (savedcbp) { 192875943Smckusick vp = savedcbp->b_vp; 192962976Smckusick bawrite(savedcbp); 1930223268Smckusick if ((vtype == VDIR || dopersistence) && 1931223268Smckusick VTOI(vp)->i_effnlink > 0) 1932233438Smckusick (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); 193375943Smckusick } 193462976Smckusick /* 193562976Smckusick * If we have been unable to allocate a block in which to do 193662976Smckusick * the copy, then return non-zero so that the fragment will 193762976Smckusick * not be freed. Although space will be lost, the snapshot 193862976Smckusick * will stay consistent. 193962976Smckusick */ 1940223020Smckusick if (error != 0 && wkhd != NULL) 1941223020Smckusick softdep_freework(wkhd); 1942175635Sattilio lockmgr(vp->v_vnlock, LK_RELEASE, NULL); 194362976Smckusick return (error); 194462976Smckusick} 194562976Smckusick 194662976Smckusick/* 194762976Smckusick * Associate snapshot files when mounting. 194862976Smckusick */ 194962976Smckusickvoid 195062976Smckusickffs_snapshot_mount(mp) 195162976Smckusick struct mount *mp; 195262976Smckusick{ 195362976Smckusick struct ufsmount *ump = VFSTOUFS(mp); 1954107414Smckusick struct vnode *devvp = ump->um_devvp; 195562976Smckusick struct fs *fs = ump->um_fs; 195683366Sjulian struct thread *td = curthread; 1957135138Sphk struct snapdata *sn; 195862976Smckusick struct vnode *vp; 1959158636Stegge struct vnode *lastvp; 1960135303Sphk struct inode *ip; 1961104698Smckusick struct uio auio; 1962104698Smckusick struct iovec aiov; 1963107848Smckusick void *snapblklist; 1964104698Smckusick char *reason; 1965107848Smckusick daddr_t snaplistsize; 196662976Smckusick int error, snaploc, loc; 196762976Smckusick 1968104698Smckusick /* 1969141526Sphk * XXX The following needs to be set before ffs_truncate or 1970104698Smckusick * VOP_READ can be called. 1971104698Smckusick */ 1972104698Smckusick mp->mnt_stat.f_iosize = fs->fs_bsize; 1973104698Smckusick /* 1974104698Smckusick * Process each snapshot listed in the superblock. 1975104698Smckusick */ 1976107848Smckusick vp = NULL; 1977158636Stegge lastvp = NULL; 1978177778Sjeff sn = NULL; 197962976Smckusick for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { 198062976Smckusick if (fs->fs_snapinum[snaploc] == 0) 1981107848Smckusick break; 1982141526Sphk if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc], 198392462Smckusick LK_EXCLUSIVE, &vp)) != 0){ 198462976Smckusick printf("ffs_snapshot_mount: vget failed %d\n", error); 198562976Smckusick continue; 198662976Smckusick } 198762976Smckusick ip = VTOI(vp); 1988232351Smckusick if (!IS_SNAPSHOT(ip) || ip->i_size == 1989104698Smckusick lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) { 1990232351Smckusick if (!IS_SNAPSHOT(ip)) { 1991104698Smckusick reason = "non-snapshot"; 1992104698Smckusick } else { 1993104698Smckusick reason = "old format snapshot"; 1994234605Strasz (void)ffs_truncate(vp, (off_t)0, 0, NOCRED); 1995233438Smckusick (void)ffs_syncvnode(vp, MNT_WAIT, 0); 1996104698Smckusick } 1997104698Smckusick printf("ffs_snapshot_mount: %s inode %d\n", 1998104698Smckusick reason, fs->fs_snapinum[snaploc]); 199962976Smckusick vput(vp); 2000107848Smckusick vp = NULL; 200162976Smckusick for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { 200262976Smckusick if (fs->fs_snapinum[loc] == 0) 200362976Smckusick break; 200462976Smckusick fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; 200562976Smckusick } 200662976Smckusick fs->fs_snapinum[loc - 1] = 0; 200762976Smckusick snaploc--; 200862976Smckusick continue; 200962976Smckusick } 2010104698Smckusick /* 2011177778Sjeff * Acquire a lock on the snapdata structure, creating it if 2012177778Sjeff * necessary. 2013105191Smckusick */ 2014177778Sjeff sn = ffs_snapdata_acquire(devvp); 2015177778Sjeff /* 2016177778Sjeff * Change vnode to use shared snapshot lock instead of the 2017177778Sjeff * original private lock. 2018177778Sjeff */ 2019177778Sjeff vp->v_vnlock = &sn->sn_lock; 2020175635Sattilio lockmgr(&vp->v_lock, LK_RELEASE, NULL); 2021105191Smckusick /* 2022104698Smckusick * Link it onto the active snapshot list. 2023104698Smckusick */ 2024107414Smckusick VI_LOCK(devvp); 202573942Smckusick if (ip->i_nextsnap.tqe_prev != 0) 2026241011Smdf panic("ffs_snapshot_mount: %ju already on list", 2027241011Smdf (uintmax_t)ip->i_number); 202873942Smckusick else 2029135138Sphk TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); 2030101308Sjeff vp->v_vflag |= VV_SYSTEM; 2031107414Smckusick VI_UNLOCK(devvp); 2032175294Sattilio VOP_UNLOCK(vp, 0); 2033158636Stegge lastvp = vp; 203462976Smckusick } 2035158636Stegge vp = lastvp; 2036107848Smckusick /* 2037107848Smckusick * No usable snapshots found. 2038107848Smckusick */ 2039177778Sjeff if (sn == NULL || vp == NULL) 2040107848Smckusick return; 2041107848Smckusick /* 2042107848Smckusick * Allocate the space for the block hints list. We always want to 2043107848Smckusick * use the list from the newest snapshot. 2044107848Smckusick */ 2045107848Smckusick auio.uio_iov = &aiov; 2046107848Smckusick auio.uio_iovcnt = 1; 2047107848Smckusick aiov.iov_base = (void *)&snaplistsize; 2048107848Smckusick aiov.iov_len = sizeof(snaplistsize); 2049107848Smckusick auio.uio_resid = aiov.iov_len; 2050107848Smckusick auio.uio_offset = 2051107848Smckusick lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)); 2052107848Smckusick auio.uio_segflg = UIO_SYSSPACE; 2053107848Smckusick auio.uio_rw = UIO_READ; 2054107848Smckusick auio.uio_td = td; 2055175202Sattilio vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 2056107848Smckusick if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { 2057107848Smckusick printf("ffs_snapshot_mount: read_1 failed %d\n", error); 2058175294Sattilio VOP_UNLOCK(vp, 0); 2059107848Smckusick return; 2060107848Smckusick } 2061184205Sdes snapblklist = malloc(snaplistsize * sizeof(daddr_t), 2062111119Simp M_UFSMNT, M_WAITOK); 2063107848Smckusick auio.uio_iovcnt = 1; 2064107848Smckusick aiov.iov_base = snapblklist; 2065107848Smckusick aiov.iov_len = snaplistsize * sizeof (daddr_t); 2066107848Smckusick auio.uio_resid = aiov.iov_len; 2067107848Smckusick auio.uio_offset -= sizeof(snaplistsize); 2068107848Smckusick if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { 2069107848Smckusick printf("ffs_snapshot_mount: read_2 failed %d\n", error); 2070175294Sattilio VOP_UNLOCK(vp, 0); 2071184205Sdes free(snapblklist, M_UFSMNT); 2072107848Smckusick return; 2073107848Smckusick } 2074175294Sattilio VOP_UNLOCK(vp, 0); 2075107848Smckusick VI_LOCK(devvp); 2076107848Smckusick ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount"); 2077135138Sphk sn->sn_listsize = snaplistsize; 2078135138Sphk sn->sn_blklist = (daddr_t *)snapblklist; 2079107848Smckusick devvp->v_vflag |= VV_COPYONWRITE; 2080107848Smckusick VI_UNLOCK(devvp); 208162976Smckusick} 208262976Smckusick 208362976Smckusick/* 208462976Smckusick * Disassociate snapshot files when unmounting. 208562976Smckusick */ 208662976Smckusickvoid 208762976Smckusickffs_snapshot_unmount(mp) 208862976Smckusick struct mount *mp; 208962976Smckusick{ 2090107414Smckusick struct vnode *devvp = VFSTOUFS(mp)->um_devvp; 2091135138Sphk struct snapdata *sn; 209262976Smckusick struct inode *xp; 2093105191Smckusick struct vnode *vp; 209462976Smckusick 2095158259Stegge VI_LOCK(devvp); 2096135138Sphk sn = devvp->v_rdev->si_snapdata; 2097158259Stegge while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) { 2098105191Smckusick vp = ITOV(xp); 2099135138Sphk TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap); 210073942Smckusick xp->i_nextsnap.tqe_prev = 0; 2101175635Sattilio lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE, 2102175635Sattilio VI_MTX(devvp)); 2103177778Sjeff lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL); 2104158259Stegge KASSERT(vp->v_vnlock == &sn->sn_lock, 2105158259Stegge ("ffs_snapshot_unmount: lost lock mutation")); 2106158259Stegge vp->v_vnlock = &vp->v_lock; 2107175635Sattilio lockmgr(&vp->v_lock, LK_RELEASE, NULL); 2108175635Sattilio lockmgr(&sn->sn_lock, LK_RELEASE, NULL); 2109158259Stegge if (xp->i_effnlink > 0) 2110105191Smckusick vrele(vp); 2111158259Stegge VI_LOCK(devvp); 2112158259Stegge sn = devvp->v_rdev->si_snapdata; 211362976Smckusick } 2114177778Sjeff try_free_snapdata(devvp); 2115107414Smckusick ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount"); 211662976Smckusick} 211762976Smckusick 211862976Smckusick/* 2119166193Skib * Check the buffer block to be belong to device buffer that shall be 2120166193Skib * locked after snaplk. devvp shall be locked on entry, and will be 2121166193Skib * leaved locked upon exit. 2122166193Skib */ 2123166193Skibstatic int 2124166193Skibffs_bp_snapblk(devvp, bp) 2125166193Skib struct vnode *devvp; 2126166193Skib struct buf *bp; 2127166193Skib{ 2128166193Skib struct snapdata *sn; 2129166193Skib struct fs *fs; 2130166193Skib ufs2_daddr_t lbn, *snapblklist; 2131166193Skib int lower, upper, mid; 2132166193Skib 2133166193Skib ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk"); 2134166193Skib KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp)); 2135166193Skib sn = devvp->v_rdev->si_snapdata; 2136166193Skib if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL) 2137166193Skib return (0); 2138166193Skib fs = TAILQ_FIRST(&sn->sn_head)->i_fs; 2139166193Skib lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); 2140166193Skib snapblklist = sn->sn_blklist; 2141166193Skib upper = sn->sn_listsize - 1; 2142166193Skib lower = 1; 2143166193Skib while (lower <= upper) { 2144166193Skib mid = (lower + upper) / 2; 2145166193Skib if (snapblklist[mid] == lbn) 2146166193Skib break; 2147166193Skib if (snapblklist[mid] < lbn) 2148166193Skib lower = mid + 1; 2149166193Skib else 2150166193Skib upper = mid - 1; 2151166193Skib } 2152166193Skib if (lower <= upper) 2153166193Skib return (1); 2154166193Skib return (0); 2155166193Skib} 2156166193Skib 2157166193Skibvoid 2158166193Skibffs_bdflush(bo, bp) 2159166193Skib struct bufobj *bo; 2160166193Skib struct buf *bp; 2161166193Skib{ 2162166193Skib struct thread *td; 2163166193Skib struct vnode *vp, *devvp; 2164166193Skib struct buf *nbp; 2165166193Skib int bp_bdskip; 2166166193Skib 2167166193Skib if (bo->bo_dirty.bv_cnt <= dirtybufthresh) 2168166193Skib return; 2169166193Skib 2170166193Skib td = curthread; 2171166193Skib vp = bp->b_vp; 2172166193Skib devvp = bo->__bo_vnode; 2173166193Skib KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp)); 2174166193Skib 2175166193Skib VI_LOCK(devvp); 2176166193Skib bp_bdskip = ffs_bp_snapblk(devvp, bp); 2177166193Skib if (bp_bdskip) 2178166193Skib bdwriteskip++; 2179166193Skib VI_UNLOCK(devvp); 2180166193Skib if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) { 2181166193Skib (void) VOP_FSYNC(vp, MNT_NOWAIT, td); 2182166193Skib altbufferflushes++; 2183166193Skib } else { 2184166193Skib BO_LOCK(bo); 2185166193Skib /* 2186166193Skib * Try to find a buffer to flush. 2187166193Skib */ 2188166193Skib TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { 2189166193Skib if ((nbp->b_vflags & BV_BKGRDINPROG) || 2190166193Skib BUF_LOCK(nbp, 2191166193Skib LK_EXCLUSIVE | LK_NOWAIT, NULL)) 2192166193Skib continue; 2193166193Skib if (bp == nbp) 2194166193Skib panic("bdwrite: found ourselves"); 2195166193Skib BO_UNLOCK(bo); 2196166193Skib /* 2197166193Skib * Don't countdeps with the bo lock 2198166193Skib * held. 2199166193Skib */ 2200166193Skib if (buf_countdeps(nbp, 0)) { 2201166193Skib BO_LOCK(bo); 2202166193Skib BUF_UNLOCK(nbp); 2203166193Skib continue; 2204166193Skib } 2205166193Skib if (bp_bdskip) { 2206166193Skib VI_LOCK(devvp); 2207166193Skib if (!ffs_bp_snapblk(vp, nbp)) { 2208251171Sjeff VI_UNLOCK(devvp); 2209251171Sjeff BO_LOCK(bo); 2210166193Skib BUF_UNLOCK(nbp); 2211166193Skib continue; 2212166193Skib } 2213166193Skib VI_UNLOCK(devvp); 2214166193Skib } 2215166193Skib if (nbp->b_flags & B_CLUSTEROK) { 2216166193Skib vfs_bio_awrite(nbp); 2217166193Skib } else { 2218166193Skib bremfree(nbp); 2219166193Skib bawrite(nbp); 2220166193Skib } 2221166193Skib dirtybufferflushes++; 2222166193Skib break; 2223166193Skib } 2224166193Skib if (nbp == NULL) 2225166193Skib BO_UNLOCK(bo); 2226166193Skib } 2227166193Skib} 2228166193Skib 2229166193Skib/* 223062976Smckusick * Check for need to copy block that is about to be written, 223162976Smckusick * copying the block if necessary. 223262976Smckusick */ 2233136963Sphkint 223473942Smckusickffs_copyonwrite(devvp, bp) 223573942Smckusick struct vnode *devvp; 223673942Smckusick struct buf *bp; 223762976Smckusick{ 2238135138Sphk struct snapdata *sn; 2239238697Skevlo struct buf *ibp, *cbp, *savedcbp = NULL; 224083366Sjulian struct thread *td = curthread; 224173942Smckusick struct fs *fs; 224262976Smckusick struct inode *ip; 2243238697Skevlo struct vnode *vp = NULL; 2244107848Smckusick ufs2_daddr_t lbn, blkno, *snapblklist; 2245151177Stegge int lower, upper, mid, indiroff, error = 0; 2246150760Struckman int launched_async_io, prev_norunningbuf; 2247158260Stegge long saved_runningbufspace; 224862976Smckusick 2249232351Smckusick if (devvp != bp->b_vp && IS_SNAPSHOT(VTOI(bp->b_vp))) 2250151179Stegge return (0); /* Update on a snapshot file */ 2251121443Sjhb if (td->td_pflags & TDP_COWINPROGRESS) 225262976Smckusick panic("ffs_copyonwrite: recursive call"); 2253107848Smckusick /* 2254107848Smckusick * First check to see if it is in the preallocated list. 2255107848Smckusick * By doing this check we avoid several potential deadlocks. 2256107848Smckusick */ 2257107414Smckusick VI_LOCK(devvp); 2258135138Sphk sn = devvp->v_rdev->si_snapdata; 2259151177Stegge if (sn == NULL || 2260168353Sdelphij TAILQ_EMPTY(&sn->sn_head)) { 2261151177Stegge VI_UNLOCK(devvp); 2262151177Stegge return (0); /* No snapshot */ 2263151177Stegge } 2264135138Sphk ip = TAILQ_FIRST(&sn->sn_head); 2265105191Smckusick fs = ip->i_fs; 2266105191Smckusick lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); 2267135138Sphk snapblklist = sn->sn_blklist; 2268135138Sphk upper = sn->sn_listsize - 1; 2269107848Smckusick lower = 1; 2270107848Smckusick while (lower <= upper) { 2271107848Smckusick mid = (lower + upper) / 2; 2272107848Smckusick if (snapblklist[mid] == lbn) 2273107848Smckusick break; 2274107848Smckusick if (snapblklist[mid] < lbn) 2275107848Smckusick lower = mid + 1; 2276107848Smckusick else 2277107848Smckusick upper = mid - 1; 2278107848Smckusick } 2279107848Smckusick if (lower <= upper) { 2280107848Smckusick VI_UNLOCK(devvp); 2281107848Smckusick return (0); 2282107848Smckusick } 2283150760Struckman launched_async_io = 0; 2284150760Struckman prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF; 2285107848Smckusick /* 2286150741Struckman * Since I/O on bp isn't yet in progress and it may be blocked 2287150741Struckman * for a long time waiting on snaplk, back it out of 2288150741Struckman * runningbufspace, possibly waking other threads waiting for space. 2289150741Struckman */ 2290158260Stegge saved_runningbufspace = bp->b_runningbufspace; 2291158260Stegge if (saved_runningbufspace != 0) 2292158260Stegge runningbufwakeup(bp); 2293150741Struckman /* 2294107848Smckusick * Not in the precomputed list, so check the snapshots. 2295107848Smckusick */ 2296175635Sattilio while (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 2297175635Sattilio VI_MTX(devvp)) != 0) { 2298151177Stegge VI_LOCK(devvp); 2299151177Stegge sn = devvp->v_rdev->si_snapdata; 2300151177Stegge if (sn == NULL || 2301168353Sdelphij TAILQ_EMPTY(&sn->sn_head)) { 2302151177Stegge VI_UNLOCK(devvp); 2303158260Stegge if (saved_runningbufspace != 0) { 2304158260Stegge bp->b_runningbufspace = saved_runningbufspace; 2305189595Sjhb atomic_add_long(&runningbufspace, 2306151177Stegge bp->b_runningbufspace); 2307158260Stegge } 2308151177Stegge return (0); /* Snapshot gone */ 2309151177Stegge } 2310151177Stegge } 2311135138Sphk TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { 231262976Smckusick vp = ITOV(ip); 2313207742Sjeff if (DOINGSOFTDEP(vp)) 2314207742Sjeff softdep_prealloc(vp, MNT_WAIT); 231562976Smckusick /* 231662976Smckusick * We ensure that everything of our own that needs to be 231762976Smckusick * copied will be done at the time that ffs_snapshot is 231862976Smckusick * called. Thus we can skip the check here which can 231976132Sphk * deadlock in doing the lookup in UFS_BALLOC. 232062976Smckusick */ 232162976Smckusick if (bp->b_vp == vp) 232262976Smckusick continue; 232362976Smckusick /* 2324105670Smckusick * Check to see if block needs to be copied. We do not have 2325105670Smckusick * to hold the snapshot lock while doing this lookup as it 2326105670Smckusick * will never require any additional allocations for the 2327105670Smckusick * snapshot inode. 232862976Smckusick */ 232962976Smckusick if (lbn < NDADDR) { 233098542Smckusick blkno = DIP(ip, i_db[lbn]); 233162976Smckusick } else { 2332150741Struckman td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; 233376132Sphk error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 2334105191Smckusick fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 2335121443Sjhb td->td_pflags &= ~TDP_COWINPROGRESS; 2336105191Smckusick if (error) 2337105191Smckusick break; 233862976Smckusick indiroff = (lbn - NDADDR) % NINDIR(fs); 233998542Smckusick if (ip->i_ump->um_fstype == UFS1) 234098542Smckusick blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; 234198542Smckusick else 234298542Smckusick blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; 234363788Smckusick bqrelse(ibp); 234462976Smckusick } 2345173464Sobrien#ifdef INVARIANTS 234662976Smckusick if (blkno == BLK_SNAP && bp->b_lblkno >= 0) 234762976Smckusick panic("ffs_copyonwrite: bad copy block"); 234862976Smckusick#endif 2349105191Smckusick if (blkno != 0) 235062976Smckusick continue; 235162976Smckusick /* 2352105670Smckusick * Allocate the block into which to do the copy. Since 2353105670Smckusick * multiple processes may all try to copy the same block, 2354105670Smckusick * we have to recheck our need to do a copy if we sleep 2355105670Smckusick * waiting for the lock. 2356105670Smckusick * 2357105670Smckusick * Because all snapshots on a filesystem share a single 2358105670Smckusick * lock, we ensure that we will never be in competition 2359105670Smckusick * with another process to allocate a block. 236062976Smckusick */ 2361150741Struckman td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; 236276132Sphk error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 2363105191Smckusick fs->fs_bsize, KERNCRED, 0, &cbp); 2364121443Sjhb td->td_pflags &= ~TDP_COWINPROGRESS; 2365105191Smckusick if (error) 2366105191Smckusick break; 236762976Smckusick#ifdef DEBUG 236862976Smckusick if (snapdebug) { 2369241011Smdf printf("Copyonwrite: snapino %ju lbn %jd for ", 2370241011Smdf (uintmax_t)ip->i_number, (intmax_t)lbn); 237173942Smckusick if (bp->b_vp == devvp) 237262976Smckusick printf("fs metadata"); 237362976Smckusick else 2374241011Smdf printf("inum %ju", 2375241011Smdf (uintmax_t)VTOI(bp->b_vp)->i_number); 237698687Smux printf(" lblkno %jd to blkno %jd\n", 237798542Smckusick (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno); 237862976Smckusick } 237962976Smckusick#endif 238062976Smckusick /* 238162976Smckusick * If we have already read the old block contents, then 238275943Smckusick * simply copy them to the new block. Note that we need 238375943Smckusick * to synchronously write snapshots that have not been 238475943Smckusick * unlinked, and hence will be visible after a crash, 2385223127Smckusick * to ensure their integrity. At a minimum we ensure the 2386223127Smckusick * integrity of the filesystem metadata, but use the 2387223127Smckusick * dopersistence sysctl-setable flag to decide on the 2388223127Smckusick * persistence needed for file content data. 238962976Smckusick */ 239062976Smckusick if (savedcbp != 0) { 239162976Smckusick bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); 239262976Smckusick bawrite(cbp); 2393223127Smckusick if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || 2394223127Smckusick dopersistence) && ip->i_effnlink > 0) 2395233438Smckusick (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); 2396150760Struckman else 2397150760Struckman launched_async_io = 1; 239862976Smckusick continue; 239962976Smckusick } 240062976Smckusick /* 240162976Smckusick * Otherwise, read the old block contents into the buffer. 240262976Smckusick */ 2403135138Sphk if ((error = readblock(vp, cbp, lbn)) != 0) { 240475943Smckusick bzero(cbp->b_data, fs->fs_bsize); 240575943Smckusick bawrite(cbp); 2406223127Smckusick if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || 2407223127Smckusick dopersistence) && ip->i_effnlink > 0) 2408233438Smckusick (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); 2409150760Struckman else 2410150760Struckman launched_async_io = 1; 241162976Smckusick break; 241275943Smckusick } 241362976Smckusick savedcbp = cbp; 241462976Smckusick } 241575943Smckusick /* 241675943Smckusick * Note that we need to synchronously write snapshots that 241775943Smckusick * have not been unlinked, and hence will be visible after 2418223127Smckusick * a crash, to ensure their integrity. At a minimum we 2419223127Smckusick * ensure the integrity of the filesystem metadata, but 2420223127Smckusick * use the dopersistence sysctl-setable flag to decide on 2421223127Smckusick * the persistence needed for file content data. 242275943Smckusick */ 242375943Smckusick if (savedcbp) { 242475943Smckusick vp = savedcbp->b_vp; 242562976Smckusick bawrite(savedcbp); 2426223127Smckusick if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || 2427223127Smckusick dopersistence) && VTOI(vp)->i_effnlink > 0) 2428233438Smckusick (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); 2429150760Struckman else 2430150760Struckman launched_async_io = 1; 243175943Smckusick } 2432175635Sattilio lockmgr(vp->v_vnlock, LK_RELEASE, NULL); 2433151177Stegge td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) | 2434151177Stegge prev_norunningbuf; 2435150791Struckman if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0) 2436150760Struckman waitrunningbufspace(); 2437150741Struckman /* 2438150741Struckman * I/O on bp will now be started, so count it in runningbufspace. 2439150741Struckman */ 2440158260Stegge if (saved_runningbufspace != 0) { 2441158260Stegge bp->b_runningbufspace = saved_runningbufspace; 2442189595Sjhb atomic_add_long(&runningbufspace, bp->b_runningbufspace); 2443158260Stegge } 244462976Smckusick return (error); 244562976Smckusick} 244662976Smckusick 244762976Smckusick/* 2448223020Smckusick * sync snapshots to force freework records waiting on snapshots to claim 2449223020Smckusick * blocks to free. 2450223020Smckusick */ 2451223020Smckusickvoid 2452223020Smckusickffs_sync_snap(mp, waitfor) 2453223020Smckusick struct mount *mp; 2454223020Smckusick int waitfor; 2455223020Smckusick{ 2456223020Smckusick struct snapdata *sn; 2457223020Smckusick struct vnode *devvp; 2458223020Smckusick struct vnode *vp; 2459223020Smckusick struct inode *ip; 2460223020Smckusick 2461223020Smckusick devvp = VFSTOUFS(mp)->um_devvp; 2462223020Smckusick if ((devvp->v_vflag & VV_COPYONWRITE) == 0) 2463223020Smckusick return; 2464223020Smckusick for (;;) { 2465223020Smckusick VI_LOCK(devvp); 2466223020Smckusick sn = devvp->v_rdev->si_snapdata; 2467223020Smckusick if (sn == NULL) { 2468223020Smckusick VI_UNLOCK(devvp); 2469223020Smckusick return; 2470223020Smckusick } 2471223020Smckusick if (lockmgr(&sn->sn_lock, 2472223020Smckusick LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 2473223020Smckusick VI_MTX(devvp)) == 0) 2474223020Smckusick break; 2475223020Smckusick } 2476223020Smckusick TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { 2477223020Smckusick vp = ITOV(ip); 2478233438Smckusick ffs_syncvnode(vp, waitfor, NO_INO_UPDT); 2479223020Smckusick } 2480223020Smckusick lockmgr(&sn->sn_lock, LK_RELEASE, NULL); 2481223020Smckusick} 2482223020Smckusick 2483223020Smckusick/* 248462976Smckusick * Read the specified block into the given buffer. 248562976Smckusick * Much of this boiler-plate comes from bwrite(). 248662976Smckusick */ 248762976Smckusickstatic int 2488135138Sphkreadblock(vp, bp, lbn) 2489135138Sphk struct vnode *vp; 249062976Smckusick struct buf *bp; 249198542Smckusick ufs2_daddr_t lbn; 249262976Smckusick{ 2493135138Sphk struct inode *ip = VTOI(vp); 2494137035Sphk struct bio *bip; 249562976Smckusick 2496137035Sphk bip = g_alloc_bio(); 2497137035Sphk bip->bio_cmd = BIO_READ; 2498137035Sphk bip->bio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn))); 2499137035Sphk bip->bio_data = bp->b_data; 2500137035Sphk bip->bio_length = bp->b_bcount; 2501158308Spjd bip->bio_done = NULL; 2502137035Sphk 2503137035Sphk g_io_request(bip, ip->i_devvp->v_bufobj.bo_private); 2504158308Spjd bp->b_error = biowait(bip, "snaprdb"); 2505137035Sphk g_destroy_bio(bip); 2506137035Sphk return (bp->b_error); 250762976Smckusick} 2508154065Simp 2509183073Skib#endif 2510183073Skib 2511156560Stegge/* 2512156560Stegge * Process file deletes that were deferred by ufs_inactive() due to 2513163194Skib * the file system being suspended. Transfer IN_LAZYACCESS into 2514163194Skib * IN_MODIFIED for vnodes that were accessed during suspension. 2515156560Stegge */ 2516183073Skibvoid 2517156560Steggeprocess_deferred_inactive(struct mount *mp) 2518156560Stegge{ 2519156560Stegge struct vnode *vp, *mvp; 2520163194Skib struct inode *ip; 2521156560Stegge struct thread *td; 2522156560Stegge int error; 2523156560Stegge 2524156560Stegge td = curthread; 2525156560Stegge (void) vn_start_secondary_write(NULL, &mp, V_WAIT); 2526156560Stegge loop: 2527234386Smckusick MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2528163194Skib /* 2529163194Skib * IN_LAZYACCESS is checked here without holding any 2530163194Skib * vnode lock, but this flag is set only while holding 2531163194Skib * vnode interlock. 2532163194Skib */ 2533234386Smckusick if (vp->v_type == VNON || 2534163194Skib ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 && 2535234386Smckusick ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0))) { 2536156560Stegge VI_UNLOCK(vp); 2537156560Stegge continue; 2538156560Stegge } 2539156560Stegge vholdl(vp); 2540175202Sattilio error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); 2541156560Stegge if (error != 0) { 2542156560Stegge vdrop(vp); 2543156560Stegge if (error == ENOENT) 2544156560Stegge continue; /* vnode recycled */ 2545234386Smckusick MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 2546156560Stegge goto loop; 2547156560Stegge } 2548163194Skib ip = VTOI(vp); 2549163194Skib if ((ip->i_flag & IN_LAZYACCESS) != 0) { 2550163194Skib ip->i_flag &= ~IN_LAZYACCESS; 2551163194Skib ip->i_flag |= IN_MODIFIED; 2552163194Skib } 2553156560Stegge VI_LOCK(vp); 2554163194Skib if ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0) { 2555156560Stegge VI_UNLOCK(vp); 2556175294Sattilio VOP_UNLOCK(vp, 0); 2557156560Stegge vdrop(vp); 2558156560Stegge continue; 2559156560Stegge } 2560234158Smckusick vinactive(vp, td); 2561156560Stegge VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, 2562156560Stegge ("process_deferred_inactive: got VI_OWEINACT")); 2563156560Stegge VI_UNLOCK(vp); 2564175294Sattilio VOP_UNLOCK(vp, 0); 2565156560Stegge vdrop(vp); 2566156560Stegge } 2567156560Stegge vn_finished_secondary_write(mp); 2568156560Stegge} 2569158259Stegge 2570183073Skib#ifndef NO_FFS_SNAPSHOT 2571183073Skib 2572177778Sjeffstatic struct snapdata * 2573177778Sjeffffs_snapdata_alloc(void) 2574177778Sjeff{ 2575177778Sjeff struct snapdata *sn; 2576177778Sjeff 2577177778Sjeff /* 2578177778Sjeff * Fetch a snapdata from the free list if there is one available. 2579177778Sjeff */ 2580177778Sjeff mtx_lock(&snapfree_lock); 2581177778Sjeff sn = LIST_FIRST(&snapfree); 2582177778Sjeff if (sn != NULL) 2583177778Sjeff LIST_REMOVE(sn, sn_link); 2584177778Sjeff mtx_unlock(&snapfree_lock); 2585177778Sjeff if (sn != NULL) 2586177778Sjeff return (sn); 2587177778Sjeff /* 2588177778Sjeff * If there were no free snapdatas allocate one. 2589177778Sjeff */ 2590177778Sjeff sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO); 2591177778Sjeff TAILQ_INIT(&sn->sn_head); 2592177778Sjeff lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT, 2593177778Sjeff LK_CANRECURSE | LK_NOSHARE); 2594177778Sjeff return (sn); 2595177778Sjeff} 2596177778Sjeff 2597177778Sjeff/* 2598177778Sjeff * The snapdata is never freed because we can not be certain that 2599177778Sjeff * there are no threads sleeping on the snap lock. Persisting 2600177778Sjeff * them permanently avoids costly synchronization in ffs_lock(). 2601177778Sjeff */ 2602177778Sjeffstatic void 2603177778Sjeffffs_snapdata_free(struct snapdata *sn) 2604177778Sjeff{ 2605177778Sjeff mtx_lock(&snapfree_lock); 2606177778Sjeff LIST_INSERT_HEAD(&snapfree, sn, sn_link); 2607177778Sjeff mtx_unlock(&snapfree_lock); 2608177778Sjeff} 2609177778Sjeff 2610158259Stegge/* Try to free snapdata associated with devvp */ 2611158259Steggestatic void 2612177778Sjefftry_free_snapdata(struct vnode *devvp) 2613158259Stegge{ 2614158259Stegge struct snapdata *sn; 2615158259Stegge ufs2_daddr_t *snapblklist; 2616158259Stegge 2617177778Sjeff ASSERT_VI_LOCKED(devvp, "try_free_snapdata"); 2618158259Stegge sn = devvp->v_rdev->si_snapdata; 2619158259Stegge 2620158259Stegge if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL || 2621158259Stegge (devvp->v_vflag & VV_COPYONWRITE) == 0) { 2622158259Stegge VI_UNLOCK(devvp); 2623158259Stegge return; 2624158259Stegge } 2625158259Stegge 2626158259Stegge devvp->v_rdev->si_snapdata = NULL; 2627158259Stegge devvp->v_vflag &= ~VV_COPYONWRITE; 2628177778Sjeff lockmgr(&sn->sn_lock, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp)); 2629158259Stegge snapblklist = sn->sn_blklist; 2630158259Stegge sn->sn_blklist = NULL; 2631158259Stegge sn->sn_listsize = 0; 2632175635Sattilio lockmgr(&sn->sn_lock, LK_RELEASE, NULL); 2633158259Stegge if (snapblklist != NULL) 2634184205Sdes free(snapblklist, M_UFSMNT); 2635177778Sjeff ffs_snapdata_free(sn); 2636158259Stegge} 2637177778Sjeff 2638177778Sjeffstatic struct snapdata * 2639177778Sjeffffs_snapdata_acquire(struct vnode *devvp) 2640177778Sjeff{ 2641177778Sjeff struct snapdata *nsn; 2642177778Sjeff struct snapdata *sn; 2643177778Sjeff 2644177778Sjeff /* 2645177778Sjeff * Allocate a free snapdata. This is done before acquiring the 2646177778Sjeff * devvp lock to avoid allocation while the devvp interlock is 2647177778Sjeff * held. 2648177778Sjeff */ 2649177778Sjeff nsn = ffs_snapdata_alloc(); 2650177778Sjeff /* 2651177778Sjeff * If there snapshots already exist on this filesystem grab a 2652177778Sjeff * reference to the shared lock. Otherwise this is the first 2653177778Sjeff * snapshot on this filesystem and we need to use our 2654177778Sjeff * pre-allocated snapdata. 2655177778Sjeff */ 2656177778Sjeff VI_LOCK(devvp); 2657177778Sjeff if (devvp->v_rdev->si_snapdata == NULL) { 2658177778Sjeff devvp->v_rdev->si_snapdata = nsn; 2659177778Sjeff nsn = NULL; 2660177778Sjeff } 2661177778Sjeff sn = devvp->v_rdev->si_snapdata; 2662177778Sjeff /* 2663177778Sjeff * Acquire the snapshot lock. 2664177778Sjeff */ 2665177778Sjeff lockmgr(&sn->sn_lock, 2666177778Sjeff LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, VI_MTX(devvp)); 2667177778Sjeff /* 2668177778Sjeff * Free any unused snapdata. 2669177778Sjeff */ 2670177778Sjeff if (nsn != NULL) 2671177778Sjeff ffs_snapdata_free(nsn); 2672177778Sjeff 2673177778Sjeff return (sn); 2674177778Sjeff} 2675177778Sjeff 2676154065Simp#endif 2677