ffs_snapshot.c revision 63788
162976Smckusick/* 262976Smckusick * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 362976Smckusick * 462976Smckusick * Further information about snapshots can be obtained from: 562976Smckusick * 662976Smckusick * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 762976Smckusick * 1614 Oxford Street mckusick@mckusick.com 862976Smckusick * Berkeley, CA 94709-1608 +1-510-843-9542 962976Smckusick * USA 1062976Smckusick * 1162976Smckusick * Redistribution and use in source and binary forms, with or without 1262976Smckusick * modification, are permitted provided that the following conditions 1362976Smckusick * are met: 1462976Smckusick * 1562976Smckusick * 1. Redistributions of source code must retain the above copyright 1662976Smckusick * notice, this list of conditions and the following disclaimer. 1762976Smckusick * 2. Redistributions in binary form must reproduce the above copyright 1862976Smckusick * notice, this list of conditions and the following disclaimer in the 1962976Smckusick * documentation and/or other materials provided with the distribution. 2062976Smckusick * 2162976Smckusick * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 2262976Smckusick * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 2362976Smckusick * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 2462976Smckusick * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 2562976Smckusick * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 2662976Smckusick * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2762976Smckusick * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2862976Smckusick * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2962976Smckusick * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 3062976Smckusick * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 3162976Smckusick * SUCH DAMAGE. 3262976Smckusick * 3363788Smckusick * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 3462976Smckusick * $FreeBSD: head/sys/ufs/ffs/ffs_snapshot.c 63788 2000-07-24 05:28:33Z mckusick $ 3562976Smckusick */ 3662976Smckusick 3762976Smckusick#include <sys/param.h> 3862976Smckusick#include <sys/systm.h> 3962976Smckusick#include <sys/bio.h> 4062976Smckusick#include <sys/buf.h> 4162976Smckusick#include <sys/proc.h> 4262976Smckusick#include <sys/namei.h> 4362976Smckusick#include <sys/stat.h> 4462976Smckusick#include <sys/malloc.h> 4562976Smckusick#include <sys/mount.h> 4662976Smckusick#include <sys/resource.h> 4762976Smckusick#include <sys/resourcevar.h> 4862976Smckusick#include <sys/vnode.h> 4962976Smckusick 5062976Smckusick#include <ufs/ufs/extattr.h> 5162976Smckusick#include <ufs/ufs/quota.h> 5262976Smckusick#include <ufs/ufs/ufsmount.h> 5362976Smckusick#include <ufs/ufs/inode.h> 5462976Smckusick#include <ufs/ufs/ufs_extern.h> 5562976Smckusick 5662976Smckusick#include <ufs/ffs/fs.h> 5762976Smckusick#include <ufs/ffs/ffs_extern.h> 5862976Smckusick 5962976Smckusick#define KERNCRED proc0.p_ucred 6062976Smckusick#define CURPROC curproc 6162976Smckusick#define DEBUG 6262976Smckusick 6362976Smckusickstatic int indiracct __P((struct vnode *, struct vnode *, int, ufs_daddr_t, 6462976Smckusick int, int, int, int)); 6562976Smckusickstatic int snapacct __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *)); 6662976Smckusickstatic int readblock __P((struct buf *, daddr_t)); 6762976Smckusick 6862976Smckusick#ifdef DEBUG 6962976Smckusick#include <sys/sysctl.h> 7062976Smckusickint snapdebug = 0; 7162976SmckusickSYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); 7262976Smckusick#endif /* DEBUG */ 7362976Smckusick 7462976Smckusick/* 7562976Smckusick * Create a snapshot file and initialize it for the filesystem. 7662976Smckusick */ 7762976Smckusickint 7862976Smckusickffs_snapshot(mp, snapfile) 7962976Smckusick struct mount *mp; 8062976Smckusick char *snapfile; 8162976Smckusick{ 8262976Smckusick ufs_daddr_t rlbn; 8362976Smckusick ufs_daddr_t lbn, blkno, copyblkno, inoblks[FSMAXSNAP]; 8462976Smckusick int error, cg, snaploc, indiroff, numblks; 8562976Smckusick int i, size, base, len, loc, inoblkcnt; 8662976Smckusick int blksperindir, flag = mp->mnt_flag; 8762976Smckusick struct fs *fs = VFSTOUFS(mp)->um_fs; 8862976Smckusick struct proc *p = CURPROC; 8962976Smckusick struct inode *devip, *ip, *xp; 9062976Smckusick struct buf *bp, *nbp, *ibp; 9162976Smckusick struct vnode *vp, *devvp; 9262976Smckusick struct nameidata nd; 9362976Smckusick struct mount *wrtmp; 9462976Smckusick struct dinode *dip; 9562976Smckusick struct vattr vat; 9662976Smckusick struct cg *cgp; 9762976Smckusick 9862976Smckusick /* 9962976Smckusick * Need to serialize access to snapshot code per filesystem. 10062976Smckusick */ 10162976Smckusick /* 10262976Smckusick * Assign a snapshot slot in the superblock. 10362976Smckusick */ 10462976Smckusick for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 10562976Smckusick if (fs->fs_snapinum[snaploc] == 0) 10662976Smckusick break; 10762976Smckusick if (snaploc == FSMAXSNAP) 10862976Smckusick return (ENOSPC); 10962976Smckusick /* 11062976Smckusick * Create the snapshot file. 11162976Smckusick */ 11262976Smckusickrestart: 11362976Smckusick NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, p); 11462976Smckusick if ((error = namei(&nd)) != 0) 11562976Smckusick return (error); 11662976Smckusick if (nd.ni_vp != NULL) { 11762976Smckusick vput(nd.ni_vp); 11862976Smckusick error = EEXIST; 11962976Smckusick } 12062976Smckusick if (nd.ni_dvp->v_mount != mp) 12162976Smckusick error = EXDEV; 12262976Smckusick if (error) { 12362976Smckusick NDFREE(&nd, NDF_ONLY_PNBUF); 12462976Smckusick if (nd.ni_dvp == nd.ni_vp) 12562976Smckusick vrele(nd.ni_dvp); 12662976Smckusick else 12762976Smckusick vput(nd.ni_dvp); 12862976Smckusick return (error); 12962976Smckusick } 13062976Smckusick VATTR_NULL(&vat); 13162976Smckusick vat.va_type = VREG; 13262976Smckusick vat.va_mode = S_IRUSR; 13362976Smckusick vat.va_vaflags |= VA_EXCLUSIVE; 13462976Smckusick if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) 13562976Smckusick wrtmp = NULL; 13662976Smckusick if (wrtmp != mp) 13762976Smckusick panic("ffs_snapshot: mount mismatch"); 13862985Smckusick if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { 13962976Smckusick NDFREE(&nd, NDF_ONLY_PNBUF); 14062976Smckusick vput(nd.ni_dvp); 14162985Smckusick if ((error = vn_start_write(NULL, &wrtmp, 14262985Smckusick V_XSLEEP | PCATCH)) != 0) 14362976Smckusick return (error); 14462976Smckusick goto restart; 14562976Smckusick } 14662976Smckusick VOP_LEASE(nd.ni_dvp, p, KERNCRED, LEASE_WRITE); 14762976Smckusick error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); 14862976Smckusick vput(nd.ni_dvp); 14962976Smckusick if (error) { 15062976Smckusick NDFREE(&nd, NDF_ONLY_PNBUF); 15162976Smckusick vn_finished_write(wrtmp); 15262976Smckusick return (error); 15362976Smckusick } 15462976Smckusick vp = nd.ni_vp; 15562976Smckusick ip = VTOI(vp); 15662976Smckusick devvp = ip->i_devvp; 15762976Smckusick devip = VTOI(devvp); 15862976Smckusick /* 15962976Smckusick * Allocate and copy the last block contents so as to be able 16062976Smckusick * to set size to that of the filesystem. 16162976Smckusick */ 16262976Smckusick numblks = howmany(fs->fs_size, fs->fs_frag); 16362976Smckusick error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), 16462976Smckusick fs->fs_bsize, KERNCRED, B_CLRBUF, &bp); 16562976Smckusick if (error) 16662976Smckusick goto out; 16762976Smckusick ip->i_size = lblktosize(fs, (off_t)numblks); 16862976Smckusick ip->i_flag |= IN_CHANGE | IN_UPDATE; 16962976Smckusick if ((error = readblock(bp, numblks - 1)) != 0) 17062976Smckusick goto out; 17162976Smckusick bawrite(bp); 17262976Smckusick /* 17362976Smckusick * Preallocate critical data structures so that we can copy 17462976Smckusick * them in without further allocation after we suspend all 17562976Smckusick * operations on the filesystem. We would like to just release 17662976Smckusick * the allocated buffers without writing them since they will 17762976Smckusick * be filled in below once we are ready to go, but this upsets 17862976Smckusick * the soft update code, so we go ahead and write the new buffers. 17962976Smckusick * 18062976Smckusick * Allocate all indirect blocks. Also allocate shadow copies 18162976Smckusick * for each of the indirect blocks. 18262976Smckusick */ 18362976Smckusick for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 18462976Smckusick error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), 18562976Smckusick fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp); 18662976Smckusick if (error) 18762976Smckusick goto out; 18862976Smckusick copyblkno = fragstoblks(fs, dbtofsb(fs, ibp->b_blkno)); 18962976Smckusick bdwrite(ibp); 19062976Smckusick error = VOP_BALLOC(vp, lblktosize(fs, (off_t)copyblkno), 19162976Smckusick fs->fs_bsize, p->p_ucred, 0, &nbp); 19262976Smckusick if (error) 19362976Smckusick goto out; 19462976Smckusick bawrite(nbp); 19562976Smckusick } 19662976Smckusick /* 19762976Smckusick * Allocate shadow blocks to copy all of the other snapshot inodes 19862976Smckusick * so that we will be able to expunge them from this snapshot. 19962976Smckusick */ 20062976Smckusick for (loc = 0, inoblkcnt = 0; loc < snaploc; loc++) { 20162976Smckusick blkno = fragstoblks(fs, ino_to_fsba(fs, fs->fs_snapinum[loc])); 20262976Smckusick for (i = 0; i < inoblkcnt; i++) 20362976Smckusick if (inoblks[i] == blkno) 20462976Smckusick break; 20562976Smckusick if (i == inoblkcnt) { 20662976Smckusick inoblks[inoblkcnt++] = blkno; 20762976Smckusick error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), 20862976Smckusick fs->fs_bsize, KERNCRED, 0, &nbp); 20962976Smckusick if (error) 21062976Smckusick goto out; 21162976Smckusick bawrite(nbp); 21262976Smckusick } 21362976Smckusick } 21462976Smckusick /* 21562976Smckusick * Allocate all cylinder group blocks. 21662976Smckusick */ 21762976Smckusick for (cg = 0; cg < fs->fs_ncg; cg++) { 21862976Smckusick error = VOP_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift, 21962976Smckusick fs->fs_bsize, KERNCRED, 0, &nbp); 22062976Smckusick if (error) 22162976Smckusick goto out; 22262976Smckusick bawrite(nbp); 22362976Smckusick } 22462976Smckusick /* 22562976Smckusick * Allocate copies for the superblock and its summary information. 22662976Smckusick */ 22762976Smckusick error = VOP_BALLOC(vp, (off_t)(SBOFF), fs->fs_bsize, KERNCRED, 22862976Smckusick 0, &nbp); 22962976Smckusick if (error) 23062976Smckusick goto out; 23162976Smckusick bawrite(nbp); 23262976Smckusick blkno = fragstoblks(fs, fs->fs_csaddr); 23362976Smckusick len = howmany(fs->fs_cssize, fs->fs_bsize); 23462976Smckusick for (loc = 0; loc < len; loc++) { 23562976Smckusick error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), 23662976Smckusick fs->fs_bsize, KERNCRED, 0, &nbp); 23762976Smckusick if (error) 23862976Smckusick goto out; 23962976Smckusick bawrite(nbp); 24062976Smckusick } 24162976Smckusick /* 24262976Smckusick * Change inode to snapshot type file. 24362976Smckusick */ 24462976Smckusick ip->i_flags |= SF_IMMUTABLE | SF_SNAPSHOT; 24562976Smckusick ip->i_flag |= IN_CHANGE | IN_UPDATE; 24662976Smckusick /* 24762976Smckusick * Ensure that the snapshot is completely on disk. 24862976Smckusick */ 24962976Smckusick if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p)) != 0) 25062976Smckusick goto out; 25162976Smckusick /* 25262976Smckusick * All allocations are done, so we can now snapshot the system. 25362976Smckusick * 25462976Smckusick * Suspend operation on filesystem. 25562976Smckusick */ 25662976Smckusick for (;;) { 25762976Smckusick vn_finished_write(wrtmp); 25862976Smckusick vfs_write_suspend(vp->v_mount); 25962976Smckusick if (mp->mnt_kern_flag & MNTK_SUSPENDED) 26062976Smckusick break; 26162985Smckusick vn_start_write(NULL, &wrtmp, V_WAIT); 26262976Smckusick } 26362976Smckusick /* 26462976Smckusick * First, copy all the cylinder group maps. All the unallocated 26562976Smckusick * blocks are marked BLK_NOCOPY so that the snapshot knows that 26662976Smckusick * it need not copy them if they are later written. 26762976Smckusick */ 26862976Smckusick len = howmany(fs->fs_fpg, fs->fs_frag); 26962976Smckusick for (cg = 0; cg < fs->fs_ncg; cg++) { 27062976Smckusick error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), 27162976Smckusick (int)fs->fs_cgsize, KERNCRED, &bp); 27262976Smckusick if (error) { 27362976Smckusick brelse(bp); 27462976Smckusick goto out1; 27562976Smckusick } 27662976Smckusick cgp = (struct cg *)bp->b_data; 27762976Smckusick if (!cg_chkmagic(cgp)) { 27862976Smckusick brelse(bp); 27962976Smckusick error = EIO; 28062976Smckusick goto out1; 28162976Smckusick } 28262976Smckusick error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize, 28362976Smckusick KERNCRED, &nbp); 28462976Smckusick if (error) { 28562976Smckusick brelse(bp); 28662976Smckusick brelse(nbp); 28762976Smckusick goto out1; 28862976Smckusick } 28962976Smckusick bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize); 29062976Smckusick if (fs->fs_cgsize < fs->fs_bsize) 29162976Smckusick bzero(&nbp->b_data[fs->fs_cgsize], 29262976Smckusick fs->fs_bsize - fs->fs_cgsize); 29363788Smckusick nbp->b_flags |= B_VALIDSUSPWRT; 29462976Smckusick bawrite(nbp); 29562976Smckusick base = cg * fs->fs_fpg / fs->fs_frag; 29662976Smckusick if (base + len > numblks) 29762976Smckusick len = numblks - base; 29862976Smckusick loc = 0; 29962976Smckusick if (base < NDADDR) { 30062976Smckusick for ( ; loc < NDADDR; loc++) { 30162976Smckusick if (!ffs_isblock(fs, cg_blksfree(cgp), loc)) 30262976Smckusick continue; 30362976Smckusick ip->i_db[loc] = BLK_NOCOPY; 30462976Smckusick } 30562976Smckusick } 30662976Smckusick error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), 30762976Smckusick fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 30862976Smckusick if (error) { 30962976Smckusick brelse(bp); 31062976Smckusick goto out1; 31162976Smckusick } 31262976Smckusick indiroff = (base + loc - NDADDR) % NINDIR(fs); 31362976Smckusick for ( ; loc < len; loc++, indiroff++) { 31462976Smckusick if (indiroff >= NINDIR(fs)) { 31563788Smckusick ibp->b_flags |= B_VALIDSUSPWRT; 31662976Smckusick bawrite(ibp); 31762976Smckusick error = VOP_BALLOC(vp, 31862976Smckusick lblktosize(fs, (off_t)(base + loc)), 31962976Smckusick fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 32062976Smckusick if (error) { 32162976Smckusick brelse(bp); 32262976Smckusick goto out1; 32362976Smckusick } 32462976Smckusick indiroff = 0; 32562976Smckusick } 32662976Smckusick if (!ffs_isblock(fs, cg_blksfree(cgp), loc)) 32762976Smckusick continue; 32862976Smckusick ((ufs_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; 32962976Smckusick } 33063788Smckusick bqrelse(bp); 33163788Smckusick ibp->b_flags |= B_VALIDSUSPWRT; 33262976Smckusick bdwrite(ibp); 33362976Smckusick } 33462976Smckusick /* 33562976Smckusick * Snapshot the superblock and its summary information. 33662976Smckusick */ 33762976Smckusick error = VOP_BALLOC(vp, (off_t)(SBOFF), fs->fs_bsize, KERNCRED, 33862976Smckusick 0, &nbp); 33962976Smckusick if (error) 34062976Smckusick goto out1; 34162976Smckusick bcopy(fs, nbp->b_data, fs->fs_sbsize); 34262976Smckusick ((struct fs *)(nbp->b_data))->fs_clean = 1; 34362976Smckusick if (fs->fs_sbsize < fs->fs_bsize) 34462976Smckusick bzero(&nbp->b_data[fs->fs_sbsize], 34562976Smckusick fs->fs_bsize - fs->fs_sbsize); 34663788Smckusick nbp->b_flags |= B_VALIDSUSPWRT; 34762976Smckusick bawrite(nbp); 34862976Smckusick blkno = fragstoblks(fs, fs->fs_csaddr); 34962976Smckusick len = howmany(fs->fs_cssize, fs->fs_bsize) - 1; 35062976Smckusick size = fs->fs_bsize; 35162976Smckusick for (loc = 0; loc <= len; loc++) { 35262976Smckusick error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), 35362976Smckusick fs->fs_bsize, KERNCRED, 0, &nbp); 35462976Smckusick if (error) 35562976Smckusick goto out1; 35662976Smckusick if (loc == len) { 35762976Smckusick readblock(nbp, blkno + loc); 35862976Smckusick size = fs->fs_cssize % fs->fs_bsize; 35962976Smckusick } 36062976Smckusick bcopy(fs->fs_csp[loc], nbp->b_data, size); 36163788Smckusick nbp->b_flags |= B_VALIDSUSPWRT; 36262976Smckusick bawrite(nbp); 36362976Smckusick } 36462976Smckusick /* 36562976Smckusick * Copy the shadow blocks for the snapshot inodes so that 36662976Smckusick * the copies can can be expunged. 36762976Smckusick */ 36862976Smckusick for (loc = 0; loc < inoblkcnt; loc++) { 36962976Smckusick error = VOP_BALLOC(vp, lblktosize(fs, (off_t)inoblks[loc]), 37062976Smckusick fs->fs_bsize, KERNCRED, 0, &nbp); 37162976Smckusick if (error) 37262976Smckusick goto out1; 37362976Smckusick readblock(nbp, inoblks[loc]); 37463788Smckusick nbp->b_flags |= B_VALIDSUSPWRT; 37562976Smckusick bdwrite(nbp); 37662976Smckusick } 37762976Smckusick /* 37862976Smckusick * Copy allocation information from other snapshots and then 37962976Smckusick * expunge them from the view of the current snapshot. 38062976Smckusick */ 38162976Smckusick for (xp = devip->i_copyonwrite; xp; xp = xp->i_copyonwrite) { 38262976Smckusick /* 38362976Smckusick * Before expunging a snapshot inode, note all the 38462976Smckusick * blocks that it claims with BLK_SNAP so that fsck will 38562976Smckusick * be able to account for those blocks properly and so 38662976Smckusick * that this snapshot knows that it need not copy them 38762976Smckusick * if the other snapshot holding them is freed. 38862976Smckusick */ 38962976Smckusick if ((error = snapacct(vp, &xp->i_db[0], &xp->i_ib[NIADDR])) !=0) 39062976Smckusick goto out1; 39162976Smckusick blksperindir = 1; 39262976Smckusick lbn = -NDADDR; 39362976Smckusick len = numblks - NDADDR; 39462976Smckusick rlbn = NDADDR; 39562976Smckusick for (i = 0; len > 0 && i < NIADDR; i++) { 39662976Smckusick error = indiracct(vp, ITOV(xp), i, xp->i_ib[i], lbn, 39762976Smckusick rlbn, len, blksperindir); 39862976Smckusick if (error) 39962976Smckusick goto out1; 40062976Smckusick blksperindir *= NINDIR(fs); 40162976Smckusick lbn -= blksperindir + 1; 40262976Smckusick len -= blksperindir; 40362976Smckusick rlbn += blksperindir; 40462976Smckusick } 40562976Smckusick /* 40662976Smckusick * Set copied snapshot inode to be a zero length file. 40762976Smckusick */ 40862976Smckusick blkno = fragstoblks(fs, ino_to_fsba(fs, xp->i_number)); 40962976Smckusick error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), 41062976Smckusick fs->fs_bsize, KERNCRED, 0, &nbp); 41162976Smckusick if (error) 41262976Smckusick goto out1; 41362976Smckusick dip = (struct dinode *)nbp->b_data + 41462976Smckusick ino_to_fsbo(fs, xp->i_number); 41562976Smckusick dip->di_size = 0; 41662976Smckusick dip->di_blocks = 0; 41762976Smckusick dip->di_flags &= ~(SF_IMMUTABLE | SF_SNAPSHOT); 41862976Smckusick bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs_daddr_t)); 41963788Smckusick nbp->b_flags |= B_VALIDSUSPWRT; 42062976Smckusick bdwrite(nbp); 42162976Smckusick } 42262976Smckusick /* 42362976Smckusick * Copy all indirect blocks to their shadows (allocated above) 42462976Smckusick * to avoid deadlock in ffs_copyonwrite. 42562976Smckusick */ 42662976Smckusick for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 42762976Smckusick error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), 42862976Smckusick fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp); 42962976Smckusick if (error) 43062976Smckusick goto out1; 43162976Smckusick copyblkno = fragstoblks(fs, dbtofsb(fs, ibp->b_blkno)); 43263788Smckusick bqrelse(ibp); 43362976Smckusick error = VOP_BALLOC(vp, lblktosize(fs, (off_t)copyblkno), 43462976Smckusick fs->fs_bsize, p->p_ucred, 0, &nbp); 43562976Smckusick if (error) 43662976Smckusick goto out1; 43762976Smckusick error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), 43862976Smckusick fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp); 43962976Smckusick if (error) { 44062976Smckusick brelse(nbp); 44162976Smckusick goto out1; 44262976Smckusick } 44362976Smckusick bcopy(ibp->b_data, nbp->b_data, fs->fs_bsize); 44463788Smckusick bqrelse(ibp); 44563788Smckusick nbp->b_flags |= B_VALIDSUSPWRT; 44662976Smckusick bawrite(nbp); 44762976Smckusick } 44862976Smckusick /* 44962976Smckusick * Record snapshot inode. Since this is the newest snapshot, 45062976Smckusick * it must be placed at the end of the list. 45162976Smckusick */ 45262976Smckusick fs->fs_snapinum[snaploc] = ip->i_number; 45362976Smckusick if (ip->i_copyonwrite != 0) 45462976Smckusick panic("ffs_snapshot: %d already on list", ip->i_number); 45562976Smckusick if (devip->i_copyonwrite == 0) { 45662976Smckusick devvp->v_flag |= VCOPYONWRITE; 45762976Smckusick devip->i_copyonwrite = ip; 45862976Smckusick } else { 45962976Smckusick for (xp = devip->i_copyonwrite; xp->i_copyonwrite != 0; ) 46062976Smckusick xp = xp->i_copyonwrite; 46162976Smckusick xp->i_copyonwrite = ip; 46262976Smckusick } 46362976Smckusick vp->v_flag |= VSYSTEM; 46462976Smckusick /* 46562976Smckusick * Resume operation on filesystem. 46662976Smckusick */ 46762976Smckusickout1: 46862976Smckusick vfs_write_resume(vp->v_mount); 46962985Smckusick vn_start_write(NULL, &wrtmp, V_WAIT); 47062976Smckusickout: 47162976Smckusick mp->mnt_flag = flag; 47262976Smckusick (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p); 47362976Smckusick if (error) 47462976Smckusick vput(vp); 47562976Smckusick else 47662976Smckusick VOP_UNLOCK(vp, 0, p); 47762976Smckusick vn_finished_write(wrtmp); 47862976Smckusick return (error); 47962976Smckusick} 48062976Smckusick 48162976Smckusick/* 48262976Smckusick * Descend an indirect block chain for vnode cancelvp accounting for all 48362976Smckusick * its indirect blocks in snapvp. 48462976Smckusick */ 48562976Smckusickstatic int 48662976Smckusickindiracct(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir) 48762976Smckusick struct vnode *snapvp; 48862976Smckusick struct vnode *cancelvp; 48962976Smckusick int level; 49062976Smckusick ufs_daddr_t blkno; 49162976Smckusick int lbn; 49262976Smckusick int rlbn; 49362976Smckusick int remblks; 49462976Smckusick int blksperindir; 49562976Smckusick{ 49662976Smckusick int subblksperindir, error, last, num, i; 49762976Smckusick struct indir indirs[NIADDR + 2]; 49862976Smckusick ufs_daddr_t *bap; 49962976Smckusick struct buf *bp; 50062976Smckusick struct fs *fs; 50162976Smckusick 50262976Smckusick if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 50362976Smckusick return (error); 50462976Smckusick if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2) 50562976Smckusick panic("indiracct: botched params"); 50662976Smckusick /* 50762976Smckusick * We have to expand bread here since it will deadlock looking 50862976Smckusick * up the block number for any blocks that are not in the cache. 50962976Smckusick */ 51062976Smckusick fs = VTOI(cancelvp)->i_fs; 51162976Smckusick bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0); 51262976Smckusick bp->b_blkno = fsbtodb(fs, blkno); 51362976Smckusick if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && 51462976Smckusick (error = readblock(bp, fragstoblks(fs, blkno)))) { 51562976Smckusick brelse(bp); 51662976Smckusick return (error); 51762976Smckusick } 51862976Smckusick /* 51962976Smckusick * Account for the block pointers in this indirect block. 52062976Smckusick */ 52162976Smckusick last = howmany(remblks, blksperindir); 52262976Smckusick if (last > NINDIR(fs)) 52362976Smckusick last = NINDIR(fs); 52462976Smckusick if (snapvp != cancelvp) { 52562976Smckusick bap = (ufs_daddr_t *)bp->b_data; 52662976Smckusick } else { 52762976Smckusick MALLOC(bap, ufs_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); 52862976Smckusick bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 52963788Smckusick bqrelse(bp); 53062976Smckusick } 53162976Smckusick error = snapacct(snapvp, &bap[0], &bap[last]); 53262976Smckusick if (error || level == 0) 53362976Smckusick goto out; 53462976Smckusick /* 53562976Smckusick * Account for the block pointers in each of the indirect blocks 53662976Smckusick * in the levels below us. 53762976Smckusick */ 53862976Smckusick subblksperindir = blksperindir / NINDIR(fs); 53962976Smckusick for (lbn++, level--, i = 0; i < last; i++) { 54062976Smckusick error = indiracct(snapvp, cancelvp, level, bap[i], lbn, 54162976Smckusick rlbn, remblks, subblksperindir); 54262976Smckusick if (error) 54362976Smckusick goto out; 54462976Smckusick rlbn += blksperindir; 54562976Smckusick lbn -= blksperindir; 54662976Smckusick remblks -= blksperindir; 54762976Smckusick } 54862976Smckusickout: 54962976Smckusick if (snapvp != cancelvp) 55063788Smckusick bqrelse(bp); 55162976Smckusick else 55262976Smckusick FREE(bap, M_DEVBUF); 55362976Smckusick return (error); 55462976Smckusick} 55562976Smckusick 55662976Smckusick/* 55762976Smckusick * Account for a set of blocks allocated in a snapshot inode. 55862976Smckusick */ 55962976Smckusickstatic int 56062976Smckusicksnapacct(vp, oldblkp, lastblkp) 56162976Smckusick struct vnode *vp; 56262976Smckusick ufs_daddr_t *oldblkp, *lastblkp; 56362976Smckusick{ 56462976Smckusick struct inode *ip = VTOI(vp); 56562976Smckusick struct fs *fs = ip->i_fs; 56662976Smckusick ufs_daddr_t lbn, blkno, *blkp; 56762976Smckusick struct buf *ibp; 56862976Smckusick int error; 56962976Smckusick 57062976Smckusick for ( ; oldblkp < lastblkp; oldblkp++) { 57162976Smckusick blkno = *oldblkp; 57262976Smckusick if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 57362976Smckusick continue; 57462976Smckusick lbn = fragstoblks(fs, blkno); 57562976Smckusick if (lbn < NDADDR) { 57662976Smckusick blkp = &ip->i_db[lbn]; 57762976Smckusick ip->i_flag |= IN_CHANGE | IN_UPDATE; 57862976Smckusick } else { 57962976Smckusick error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), 58062976Smckusick fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 58162976Smckusick if (error) 58262976Smckusick return (error); 58362976Smckusick blkp = &((ufs_daddr_t *)(ibp->b_data)) 58462976Smckusick [(lbn - NDADDR) % NINDIR(fs)]; 58562976Smckusick } 58662976Smckusick if (*blkp != 0) 58762976Smckusick panic("snapacct: bad block"); 58862976Smckusick *blkp = BLK_SNAP; 58963788Smckusick if (lbn >= NDADDR) { 59063788Smckusick ibp->b_flags |= B_VALIDSUSPWRT; 59162976Smckusick bdwrite(ibp); 59263788Smckusick } 59362976Smckusick } 59462976Smckusick return (0); 59562976Smckusick} 59662976Smckusick 59762976Smckusick/* 59862976Smckusick * Prepare a snapshot file for being removed. 59962976Smckusick */ 60062976Smckusickvoid 60162976Smckusickffs_snapremove(vp) 60262976Smckusick struct vnode *vp; 60362976Smckusick{ 60462976Smckusick struct inode *ip, *xp; 60562976Smckusick struct vnode *devvp; 60662976Smckusick struct buf *ibp; 60762976Smckusick struct fs *fs; 60862976Smckusick ufs_daddr_t blkno, dblk; 60962976Smckusick int error, snaploc, loc, last; 61062976Smckusick 61162976Smckusick ip = VTOI(vp); 61262976Smckusick fs = ip->i_fs; 61362976Smckusick /* 61462976Smckusick * Delete snapshot inode from superblock. Keep list dense. 61562976Smckusick */ 61662976Smckusick for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 61762976Smckusick if (fs->fs_snapinum[snaploc] == ip->i_number) 61862976Smckusick break; 61962976Smckusick if (snaploc < FSMAXSNAP) { 62062976Smckusick for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { 62162976Smckusick if (fs->fs_snapinum[snaploc] == 0) 62262976Smckusick break; 62362976Smckusick fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; 62462976Smckusick } 62562976Smckusick fs->fs_snapinum[snaploc - 1] = 0; 62662976Smckusick } 62762976Smckusick /* 62862976Smckusick * Delete from incore list. 62962976Smckusick * Clear copy-on-write flag if last snapshot. 63062976Smckusick */ 63162976Smckusick devvp = ip->i_devvp; 63262976Smckusick for (xp = VTOI(devvp); xp; xp = xp->i_copyonwrite) { 63362976Smckusick if (xp->i_copyonwrite != ip) 63462976Smckusick continue; 63562976Smckusick xp->i_copyonwrite = ip->i_copyonwrite; 63662976Smckusick ip->i_copyonwrite = 0; 63762976Smckusick break; 63862976Smckusick } 63962976Smckusick if (xp == 0) { 64062976Smckusick printf("ffs_snapremove: lost snapshot vnode %d\n", 64162976Smckusick ip->i_number); 64262976Smckusick vref(vp); 64362976Smckusick } 64462976Smckusick if (VTOI(devvp)->i_copyonwrite == 0) 64562976Smckusick devvp->v_flag &= ~VCOPYONWRITE; 64662976Smckusick /* 64762976Smckusick * Clear all BLK_NOCOPY fields. Pass any block claims to other 64862976Smckusick * snapshots that want them (see ffs_snapblkfree below). 64962976Smckusick */ 65062976Smckusick for (blkno = 1; blkno < NDADDR; blkno++) { 65162976Smckusick dblk = ip->i_db[blkno]; 65262976Smckusick if (dblk == BLK_NOCOPY || dblk == BLK_SNAP || 65362976Smckusick (dblk == blkstofrags(fs, blkno) && 65462976Smckusick ffs_snapblkfree(ip, dblk, fs->fs_bsize))) 65562976Smckusick ip->i_db[blkno] = 0; 65662976Smckusick } 65762976Smckusick for (blkno = NDADDR; blkno < fs->fs_size; blkno += NINDIR(fs)) { 65862976Smckusick error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), 65962976Smckusick fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 66062976Smckusick if (error) 66162976Smckusick continue; 66262976Smckusick if ((last = fs->fs_size - blkno) > NINDIR(fs)) 66362976Smckusick last = NINDIR(fs); 66462976Smckusick for (loc = 0; loc < last; loc++) { 66562976Smckusick dblk = ((ufs_daddr_t *)(ibp->b_data))[loc]; 66662976Smckusick if (dblk == BLK_NOCOPY || dblk == BLK_SNAP || 66762976Smckusick (dblk == blkstofrags(fs, blkno) && 66862976Smckusick ffs_snapblkfree(ip, dblk, fs->fs_bsize))) 66962976Smckusick ((ufs_daddr_t *)(ibp->b_data))[loc] = 0; 67062976Smckusick } 67162976Smckusick bawrite(ibp); 67262976Smckusick } 67362976Smckusick /* 67462976Smckusick * Clear snapshot flag and drop reference. 67562976Smckusick */ 67662976Smckusick ip->i_flags &= ~(SF_IMMUTABLE | SF_SNAPSHOT); 67762976Smckusick ip->i_flag |= IN_CHANGE | IN_UPDATE; 67862976Smckusick vrele(vp); 67962976Smckusick} 68062976Smckusick 68162976Smckusick/* 68262976Smckusick * Notification that a block is being freed. Return zero if the free 68362976Smckusick * should be allowed to proceed. Return non-zero if the snapshot file 68462976Smckusick * wants to claim the block. The block will be claimed if it is an 68562976Smckusick * uncopied part of one of the snapshots. It will be freed if it is 68662976Smckusick * either a BLK_NOCOPY or has already been copied in all of the snapshots. 68762976Smckusick * If a fragment is being freed, then all snapshots that care about 68862976Smckusick * it must make a copy since a snapshot file can only claim full sized 68962976Smckusick * blocks. Note that if more than one snapshot file maps the block, 69062976Smckusick * we can pick one at random to claim it. Since none of the snapshots 69162976Smckusick * can change, we are assurred that they will all see the same unmodified 69262976Smckusick * image. When deleting a snapshot file (see ffs_snapremove above), we 69362976Smckusick * must push any of these claimed blocks to one of the other snapshots 69462976Smckusick * that maps it. These claimed blocks are easily identified as they will 69562976Smckusick * have a block number equal to their logical block number within the 69662976Smckusick * snapshot. A copied block can never have this property because they 69762976Smckusick * must always have been allocated from a BLK_NOCOPY location. 69862976Smckusick */ 69962976Smckusickint 70062976Smckusickffs_snapblkfree(freeip, bno, size) 70162976Smckusick struct inode *freeip; 70262976Smckusick ufs_daddr_t bno; 70362976Smckusick long size; 70462976Smckusick{ 70562976Smckusick struct buf *ibp, *cbp, *savedcbp = 0; 70662976Smckusick struct fs *fs = freeip->i_fs; 70762976Smckusick struct proc *p = CURPROC; 70862976Smckusick struct inode *ip; 70962976Smckusick struct vnode *vp; 71062976Smckusick ufs_daddr_t lbn, blkno; 71162976Smckusick int indiroff = 0, error = 0, claimedblk = 0; 71262976Smckusick 71362976Smckusick lbn = fragstoblks(fs, bno); 71462976Smckusick for (ip = VTOI(freeip->i_devvp)->i_copyonwrite; ip; 71562976Smckusick ip = ip->i_copyonwrite) { 71662976Smckusick vp = ITOV(ip); 71762976Smckusick /* 71862976Smckusick * Lookup block being written. 71962976Smckusick */ 72062976Smckusick if (lbn < NDADDR) { 72162976Smckusick blkno = ip->i_db[lbn]; 72262976Smckusick } else { 72362976Smckusick vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); 72462976Smckusick p->p_flag |= P_COWINPROGRESS; 72562976Smckusick error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), 72662976Smckusick fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 72762976Smckusick p->p_flag &= ~P_COWINPROGRESS; 72862976Smckusick VOP_UNLOCK(vp, 0, p); 72962976Smckusick if (error) 73062976Smckusick break; 73162976Smckusick indiroff = (lbn - NDADDR) % NINDIR(fs); 73262976Smckusick blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff]; 73362976Smckusick } 73462976Smckusick /* 73562976Smckusick * Check to see if block needs to be copied. 73662976Smckusick */ 73762976Smckusick switch (blkno) { 73862976Smckusick /* 73962976Smckusick * If the snapshot has already copied the block (default), 74062976Smckusick * or does not care about the block, it is not needed. 74162976Smckusick */ 74262976Smckusick default: 74362976Smckusick case BLK_NOCOPY: 74462976Smckusick if (lbn >= NDADDR) 74563788Smckusick bqrelse(ibp); 74662976Smckusick continue; 74762976Smckusick /* 74862976Smckusick * No previous snapshot claimed the block, so it will be 74962976Smckusick * freed and become a BLK_NOCOPY (don't care) for us. 75062976Smckusick */ 75162976Smckusick case BLK_SNAP: 75262976Smckusick if (claimedblk) 75362976Smckusick panic("snapblkfree: inconsistent block type"); 75462976Smckusick vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); 75562976Smckusick if (lbn < NDADDR) { 75662976Smckusick ip->i_db[lbn] = BLK_NOCOPY; 75762976Smckusick ip->i_flag |= IN_CHANGE | IN_UPDATE; 75862976Smckusick } else { 75962976Smckusick ((ufs_daddr_t *)(ibp->b_data))[indiroff] = 76062976Smckusick BLK_NOCOPY; 76162976Smckusick bdwrite(ibp); 76262976Smckusick } 76362976Smckusick VOP_UNLOCK(vp, 0, p); 76462976Smckusick continue; 76562976Smckusick /* 76662976Smckusick * A block that we map is being freed. If it has not been 76762976Smckusick * claimed yet, we will claim or copy it (below). 76862976Smckusick */ 76962976Smckusick case 0: 77062976Smckusick claimedblk = 1; 77162976Smckusick break; 77262976Smckusick } 77362976Smckusick /* 77462976Smckusick * If this is a full size block, we will just grab it 77562976Smckusick * and assign it to the snapshot inode. Otherwise we 77662976Smckusick * will proceed to copy it. See explanation for this 77762976Smckusick * routine as to why only a single snapshot needs to 77862976Smckusick * claim this block. 77962976Smckusick */ 78062976Smckusick if (size == fs->fs_bsize) { 78162976Smckusick#ifdef DEBUG 78262976Smckusick if (snapdebug) 78362976Smckusick printf("%s %d lbn %d from inum %d\n", 78462976Smckusick "Grabonremove: snapino", ip->i_number, lbn, 78562976Smckusick freeip->i_number); 78662976Smckusick#endif 78762976Smckusick vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); 78862976Smckusick if (lbn < NDADDR) { 78962976Smckusick ip->i_db[lbn] = bno; 79062976Smckusick } else { 79162976Smckusick ((ufs_daddr_t *)(ibp->b_data))[indiroff] = bno; 79262976Smckusick bdwrite(ibp); 79362976Smckusick } 79462976Smckusick ip->i_blocks += btodb(size); 79562976Smckusick ip->i_flag |= IN_CHANGE | IN_UPDATE; 79662976Smckusick VOP_UNLOCK(vp, 0, p); 79762976Smckusick return (1); 79862976Smckusick } 79962976Smckusick if (lbn >= NDADDR) 80063788Smckusick bqrelse(ibp); 80162976Smckusick /* 80262976Smckusick * Allocate the block into which to do the copy. Note that this 80362976Smckusick * allocation will never require any additional allocations for 80462976Smckusick * the snapshot inode. 80562976Smckusick */ 80662976Smckusick vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); 80762976Smckusick p->p_flag |= P_COWINPROGRESS; 80862976Smckusick error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), 80962976Smckusick fs->fs_bsize, KERNCRED, 0, &cbp); 81062976Smckusick p->p_flag &= ~P_COWINPROGRESS; 81162976Smckusick VOP_UNLOCK(vp, 0, p); 81262976Smckusick if (error) 81362976Smckusick break; 81462976Smckusick#ifdef DEBUG 81562976Smckusick if (snapdebug) 81662976Smckusick printf("%s%d lbn %d for inum %d size %ld to blkno %d\n", 81762976Smckusick "Copyonremove: snapino ", ip->i_number, lbn, 81862976Smckusick freeip->i_number, size, cbp->b_blkno); 81962976Smckusick#endif 82062976Smckusick /* 82162976Smckusick * If we have already read the old block contents, then 82262976Smckusick * simply copy them to the new block. 82362976Smckusick */ 82462976Smckusick if (savedcbp != 0) { 82562976Smckusick bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); 82662976Smckusick bawrite(cbp); 82762976Smckusick continue; 82862976Smckusick } 82962976Smckusick /* 83062976Smckusick * Otherwise, read the old block contents into the buffer. 83162976Smckusick */ 83262976Smckusick if ((error = readblock(cbp, lbn)) != 0) 83362976Smckusick break; 83462976Smckusick savedcbp = cbp; 83562976Smckusick } 83662976Smckusick if (savedcbp) 83762976Smckusick bawrite(savedcbp); 83862976Smckusick /* 83962976Smckusick * If we have been unable to allocate a block in which to do 84062976Smckusick * the copy, then return non-zero so that the fragment will 84162976Smckusick * not be freed. Although space will be lost, the snapshot 84262976Smckusick * will stay consistent. 84362976Smckusick */ 84462976Smckusick return (error); 84562976Smckusick} 84662976Smckusick 84762976Smckusick/* 84862976Smckusick * Associate snapshot files when mounting. 84962976Smckusick */ 85062976Smckusickvoid 85162976Smckusickffs_snapshot_mount(mp) 85262976Smckusick struct mount *mp; 85362976Smckusick{ 85462976Smckusick struct ufsmount *ump = VFSTOUFS(mp); 85562976Smckusick struct fs *fs = ump->um_fs; 85662976Smckusick struct proc *p = CURPROC; 85762976Smckusick struct inode *ip, **listtailp; 85862976Smckusick struct vnode *vp; 85962976Smckusick int error, snaploc, loc; 86062976Smckusick 86162976Smckusick listtailp = &VTOI(ump->um_devvp)->i_copyonwrite; 86262976Smckusick for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { 86362976Smckusick if (fs->fs_snapinum[snaploc] == 0) 86462976Smckusick return; 86562976Smckusick if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], &vp)) != 0){ 86662976Smckusick printf("ffs_snapshot_mount: vget failed %d\n", error); 86762976Smckusick continue; 86862976Smckusick } 86962976Smckusick ip = VTOI(vp); 87062976Smckusick if ((ip->i_flags & SF_SNAPSHOT) == 0) { 87162976Smckusick printf("ffs_snapshot_mount: non-snapshot inode %d\n", 87262976Smckusick fs->fs_snapinum[snaploc]); 87362976Smckusick vput(vp); 87462976Smckusick for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { 87562976Smckusick if (fs->fs_snapinum[loc] == 0) 87662976Smckusick break; 87762976Smckusick fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; 87862976Smckusick } 87962976Smckusick fs->fs_snapinum[loc - 1] = 0; 88062976Smckusick snaploc--; 88162976Smckusick continue; 88262976Smckusick } 88362976Smckusick if (ip->i_copyonwrite != 0) 88462976Smckusick panic("ffs_snapshot_mount: %d already on list", 88562976Smckusick ip->i_number); 88662976Smckusick *listtailp = ip; 88762976Smckusick listtailp = &ip->i_copyonwrite; 88862976Smckusick vp->v_flag |= VSYSTEM; 88962976Smckusick VOP_UNLOCK(vp, 0, p); 89062976Smckusick ump->um_devvp->v_flag |= VCOPYONWRITE; 89162976Smckusick } 89262976Smckusick} 89362976Smckusick 89462976Smckusick/* 89562976Smckusick * Disassociate snapshot files when unmounting. 89662976Smckusick */ 89762976Smckusickvoid 89862976Smckusickffs_snapshot_unmount(mp) 89962976Smckusick struct mount *mp; 90062976Smckusick{ 90162976Smckusick struct ufsmount *ump = VFSTOUFS(mp); 90262976Smckusick struct inode *devip = VTOI(ump->um_devvp); 90362976Smckusick struct inode *xp; 90462976Smckusick 90562976Smckusick while ((xp = devip->i_copyonwrite) != 0) { 90662976Smckusick devip->i_copyonwrite = xp->i_copyonwrite; 90762976Smckusick xp->i_copyonwrite = 0; 90862976Smckusick vrele(ITOV(xp)); 90962976Smckusick } 91062976Smckusick ump->um_devvp->v_flag &= ~VCOPYONWRITE; 91162976Smckusick} 91262976Smckusick 91362976Smckusick/* 91462976Smckusick * Check for need to copy block that is about to be written, 91562976Smckusick * copying the block if necessary. 91662976Smckusick */ 91762976Smckusickint 91862976Smckusickffs_copyonwrite(ap) 91962976Smckusick struct vop_copyonwrite_args /* { 92062976Smckusick struct vnode *a_vp; 92162976Smckusick struct buf *a_bp; 92262976Smckusick } */ *ap; 92362976Smckusick{ 92462976Smckusick struct buf *ibp, *cbp, *savedcbp = 0, *bp = ap->a_bp; 92562976Smckusick struct fs *fs = VTOI(bp->b_vp)->i_fs; 92662976Smckusick struct proc *p = CURPROC; 92762976Smckusick struct inode *ip; 92862976Smckusick struct vnode *vp; 92962976Smckusick ufs_daddr_t lbn, blkno; 93062976Smckusick int indiroff, error = 0; 93162976Smckusick 93262976Smckusick lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); 93362976Smckusick if (p->p_flag & P_COWINPROGRESS) 93462976Smckusick panic("ffs_copyonwrite: recursive call"); 93562976Smckusick for (ip = VTOI(ap->a_vp)->i_copyonwrite; ip; ip = ip->i_copyonwrite) { 93662976Smckusick vp = ITOV(ip); 93762976Smckusick /* 93862976Smckusick * We ensure that everything of our own that needs to be 93962976Smckusick * copied will be done at the time that ffs_snapshot is 94062976Smckusick * called. Thus we can skip the check here which can 94162976Smckusick * deadlock in doing the lookup in VOP_BALLOC. 94262976Smckusick */ 94362976Smckusick if (bp->b_vp == vp) 94462976Smckusick continue; 94562976Smckusick /* 94663788Smckusick * Check to see if block needs to be copied. We have to 94763788Smckusick * be able to do the VOP_BALLOC without blocking, otherwise 94863788Smckusick * we may get in a deadlock with another process also 94963788Smckusick * trying to allocate. If we find outselves unable to 95063788Smckusick * get the buffer lock, we unlock the snapshot vnode, 95163788Smckusick * sleep briefly, and try again. 95262976Smckusick */ 95363788Smckusickretry: 95463788Smckusick vn_lock(vp, LK_SHARED | LK_RETRY, p); 95562976Smckusick if (lbn < NDADDR) { 95662976Smckusick blkno = ip->i_db[lbn]; 95762976Smckusick } else { 95862976Smckusick p->p_flag |= P_COWINPROGRESS; 95962976Smckusick error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), 96063788Smckusick fs->fs_bsize, KERNCRED, B_METAONLY | B_NOWAIT, &ibp); 96162976Smckusick p->p_flag &= ~P_COWINPROGRESS; 96263788Smckusick if (error) { 96363788Smckusick VOP_UNLOCK(vp, 0, p); 96463788Smckusick if (error != EWOULDBLOCK) 96563788Smckusick break; 96663788Smckusick tsleep(vp, p->p_usrpri, "nap", 1); 96763788Smckusick goto retry; 96863788Smckusick } 96962976Smckusick indiroff = (lbn - NDADDR) % NINDIR(fs); 97062976Smckusick blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff]; 97163788Smckusick bqrelse(ibp); 97262976Smckusick } 97362976Smckusick#ifdef DIAGNOSTIC 97462976Smckusick if (blkno == BLK_SNAP && bp->b_lblkno >= 0) 97562976Smckusick panic("ffs_copyonwrite: bad copy block"); 97662976Smckusick#endif 97763788Smckusick if (blkno != 0) { 97863788Smckusick VOP_UNLOCK(vp, 0, p); 97962976Smckusick continue; 98063788Smckusick } 98162976Smckusick /* 98262976Smckusick * Allocate the block into which to do the copy. Note that this 98362976Smckusick * allocation will never require any additional allocations for 98462976Smckusick * the snapshot inode. 98562976Smckusick */ 98662976Smckusick p->p_flag |= P_COWINPROGRESS; 98762976Smckusick error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), 98863788Smckusick fs->fs_bsize, KERNCRED, B_NOWAIT, &cbp); 98962976Smckusick p->p_flag &= ~P_COWINPROGRESS; 99062976Smckusick VOP_UNLOCK(vp, 0, p); 99163788Smckusick if (error) { 99263788Smckusick if (error != EWOULDBLOCK) 99363788Smckusick break; 99463788Smckusick tsleep(vp, p->p_usrpri, "nap", 1); 99563788Smckusick goto retry; 99663788Smckusick } 99762976Smckusick#ifdef DEBUG 99862976Smckusick if (snapdebug) { 99962976Smckusick printf("Copyonwrite: snapino %d lbn %d for ", 100062976Smckusick ip->i_number, lbn); 100162976Smckusick if (bp->b_vp == ap->a_vp) 100262976Smckusick printf("fs metadata"); 100362976Smckusick else 100462976Smckusick printf("inum %d", VTOI(bp->b_vp)->i_number); 100562976Smckusick printf(" lblkno %d to blkno %d\n", bp->b_lblkno, 100662976Smckusick cbp->b_blkno); 100762976Smckusick } 100862976Smckusick#endif 100962976Smckusick /* 101062976Smckusick * If we have already read the old block contents, then 101162976Smckusick * simply copy them to the new block. 101262976Smckusick */ 101362976Smckusick if (savedcbp != 0) { 101462976Smckusick bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); 101562976Smckusick bawrite(cbp); 101662976Smckusick continue; 101762976Smckusick } 101862976Smckusick /* 101962976Smckusick * Otherwise, read the old block contents into the buffer. 102062976Smckusick */ 102162976Smckusick if ((error = readblock(cbp, lbn)) != 0) 102262976Smckusick break; 102362976Smckusick savedcbp = cbp; 102462976Smckusick } 102562976Smckusick if (savedcbp) 102662976Smckusick bawrite(savedcbp); 102762976Smckusick return (error); 102862976Smckusick} 102962976Smckusick 103062976Smckusick/* 103162976Smckusick * Read the specified block into the given buffer. 103262976Smckusick * Much of this boiler-plate comes from bwrite(). 103362976Smckusick */ 103462976Smckusickstatic int 103562976Smckusickreadblock(bp, lbn) 103662976Smckusick struct buf *bp; 103762976Smckusick daddr_t lbn; 103862976Smckusick{ 103962976Smckusick struct uio auio; 104062976Smckusick struct iovec aiov; 104162976Smckusick struct proc *p = CURPROC; 104262976Smckusick struct inode *ip = VTOI(bp->b_vp); 104362976Smckusick 104462976Smckusick aiov.iov_base = bp->b_data; 104562976Smckusick aiov.iov_len = bp->b_bcount; 104662976Smckusick auio.uio_iov = &aiov; 104762976Smckusick auio.uio_iovcnt = 1; 104862976Smckusick auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn))); 104962976Smckusick auio.uio_resid = bp->b_bcount; 105062976Smckusick auio.uio_rw = UIO_READ; 105162976Smckusick auio.uio_segflg = UIO_SYSSPACE; 105262976Smckusick auio.uio_procp = p; 105362976Smckusick return (physio(ip->i_devvp->v_rdev, &auio, 0)); 105462976Smckusick} 1055