ffs_snapshot.c revision 1.17
1/* 2 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 3 * 4 * Further information about snapshots can be obtained from: 5 * 6 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 7 * 1614 Oxford Street mckusick@mckusick.com 8 * Berkeley, CA 94709-1608 +1-510-843-9542 9 * USA 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 21 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 22 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 34 * 35 * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp 36 */ 37 38#include <sys/cdefs.h> 39__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.17 2005/05/29 21:25:24 christos Exp $"); 40 41#if defined(_KERNEL_OPT) 42#include "opt_ffs.h" 43#endif 44 45#include <sys/param.h> 46#include <sys/kernel.h> 47#include <sys/systm.h> 48#include <sys/conf.h> 49#include <sys/buf.h> 50#include <sys/proc.h> 51#include <sys/namei.h> 52#include <sys/sched.h> 53#include <sys/stat.h> 54#include <sys/malloc.h> 55#include <sys/mount.h> 56#include <sys/resource.h> 57#include <sys/resourcevar.h> 58#include <sys/vnode.h> 59 60#include <miscfs/specfs/specdev.h> 61 62#include <ufs/ufs/quota.h> 63#include <ufs/ufs/ufsmount.h> 64#include <ufs/ufs/inode.h> 65#include <ufs/ufs/ufs_extern.h> 66#include <ufs/ufs/ufs_bswap.h> 67 68#include <ufs/ffs/fs.h> 69#include <ufs/ffs/ffs_extern.h> 70 71/* FreeBSD -> NetBSD conversion */ 72#define KERNCRED proc0.p_ucred 73#define ufs1_daddr_t int32_t 74#define ufs2_daddr_t int64_t 75#define ufs_lbn_t daddr_t 76#define VI_MTX(v) (&(v)->v_interlock) 77#define VI_LOCK(v) simple_lock(&(v)->v_interlock) 78#define VI_UNLOCK(v) simple_unlock(&(v)->v_interlock) 79#define MNT_ILOCK(v) simple_lock(&mntvnode_slock) 80#define MNT_IUNLOCK(v) simple_unlock(&mntvnode_slock) 81 82#if !defined(FFS_NO_SNAPSHOT) 83static int cgaccount(int, struct vnode *, caddr_t, int); 84static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, 85 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 86 ufs_lbn_t, int), int); 87static int indiracct_ufs1(struct vnode *, struct vnode *, int, 88 ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 89 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 90 ufs_lbn_t, int), int); 91static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 92 struct fs *, ufs_lbn_t, int); 93static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 94 struct fs *, ufs_lbn_t, int); 95static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 96 struct fs *, ufs_lbn_t, int); 97static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, 98 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 99 ufs_lbn_t, int), int); 100static int indiracct_ufs2(struct vnode *, struct vnode *, int, 101 ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 102 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 103 ufs_lbn_t, int), int); 104static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 105 struct fs *, ufs_lbn_t, int); 106static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 107 struct fs *, ufs_lbn_t, int); 108static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 109 struct fs *, ufs_lbn_t, int); 110#endif /* !defined(FFS_NO_SNAPSHOT) */ 111 112static int ffs_copyonwrite(void *, struct buf *); 113static int readfsblk(struct vnode *, caddr_t, ufs2_daddr_t); 114static int __unused readvnblk(struct vnode *, caddr_t, ufs2_daddr_t); 115static int writevnblk(struct vnode *, caddr_t, ufs2_daddr_t); 116static inline int cow_enter(void); 117static inline void cow_leave(int); 118static inline ufs2_daddr_t db_get(struct inode *, int); 119static inline void db_assign(struct inode *, int, ufs2_daddr_t); 120static inline ufs2_daddr_t idb_get(struct inode *, caddr_t, int); 121static inline void idb_assign(struct inode *, caddr_t, int, ufs2_daddr_t); 122 123#ifdef DEBUG 124static int snapdebug = 0; 125#endif 126 127/* 128 * Create a snapshot file and initialize it for the filesystem. 129 * Vnode is locked on entry and return. 130 */ 131int 132ffs_snapshot(mp, vp, ctime) 133 struct mount *mp; 134 struct vnode *vp; 135 struct timespec *ctime; 136{ 137#if defined(FFS_NO_SNAPSHOT) 138 return EOPNOTSUPP; 139} 140#else /* defined(FFS_NO_SNAPSHOT) */ 141 ufs2_daddr_t numblks, blkno, *blkp, snaplistsize = 0, *snapblklist; 142 int error, ns, cg, snaploc; 143 int i, s, size, len, loc; 144 int flag = mp->mnt_flag; 145 struct timeval starttime; 146#ifdef DEBUG 147 struct timeval endtime; 148#endif 149 struct timespec ts; 150 long redo = 0; 151 int32_t *lp; 152 void *space; 153 caddr_t sbbuf = NULL; 154 struct ufsmount *ump = VFSTOUFS(mp); 155 struct fs *copy_fs = NULL, *fs = ump->um_fs; 156 struct proc *p = curproc; 157 struct inode *ip, *xp; 158 struct buf *bp, *ibp, *nbp; 159 struct vattr vat; 160 struct vnode *xvp, *nvp, *devvp; 161 162 ns = UFS_FSNEEDSWAP(fs); 163 /* 164 * Need to serialize access to snapshot code per filesystem. 165 */ 166 /* 167 * If the vnode already is a snapshot, return. 168 */ 169 if (VTOI(vp)->i_flags & SF_SNAPSHOT) { 170 if (ctime) { 171 ctime->tv_sec = DIP(VTOI(vp), mtime); 172 ctime->tv_nsec = DIP(VTOI(vp), mtimensec); 173 } 174 return 0; 175 } 176 /* 177 * Check mount, exclusive reference and owner. 178 */ 179 if (vp->v_mount != mp) 180 return EXDEV; 181 if (vp->v_usecount != 1 || vp->v_writecount != 0) 182 return EBUSY; 183 if (suser(p->p_ucred, &p->p_acflag) != 0 && 184 VTOI(vp)->i_uid != p->p_ucred->cr_uid) 185 return EACCES; 186 187 if (vp->v_size != 0) { 188 error = VOP_TRUNCATE(vp, 0, 0, NOCRED, p); 189 if (error) 190 return error; 191 } 192 /* 193 * Assign a snapshot slot in the superblock. 194 */ 195 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 196 if (fs->fs_snapinum[snaploc] == 0) 197 break; 198 if (snaploc == FSMAXSNAP) 199 return (ENOSPC); 200 ip = VTOI(vp); 201 devvp = ip->i_devvp; 202 /* 203 * Write an empty list of preallocated blocks to the end of 204 * the snapshot to set size to at least that of the filesystem. 205 */ 206 numblks = howmany(fs->fs_size, fs->fs_frag); 207 blkno = 1; 208 blkno = ufs_rw64(blkno, ns); 209 error = vn_rdwr(UIO_WRITE, vp, 210 (caddr_t)&blkno, sizeof(blkno), lblktosize(fs, (off_t)numblks), 211 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, p->p_ucred, NULL, NULL); 212 if (error) 213 goto out; 214 /* 215 * Preallocate critical data structures so that we can copy 216 * them in without further allocation after we suspend all 217 * operations on the filesystem. We would like to just release 218 * the allocated buffers without writing them since they will 219 * be filled in below once we are ready to go, but this upsets 220 * the soft update code, so we go ahead and write the new buffers. 221 * 222 * Allocate all indirect blocks and mark all of them as not 223 * needing to be copied. 224 */ 225 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 226 error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), 227 fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp); 228 if (error) 229 goto out; 230 bawrite(ibp); 231 } 232 /* 233 * Allocate copies for the superblock and its summary information. 234 */ 235 error = VOP_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 236 0, &nbp); 237 if (error) 238 goto out; 239 bawrite(nbp); 240 blkno = fragstoblks(fs, fs->fs_csaddr); 241 len = howmany(fs->fs_cssize, fs->fs_bsize); 242 for (loc = 0; loc < len; loc++) { 243 error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), 244 fs->fs_bsize, KERNCRED, 0, &nbp); 245 if (error) 246 goto out; 247 bawrite(nbp); 248 } 249 /* 250 * Copy all the cylinder group maps. Although the 251 * filesystem is still active, we hope that only a few 252 * cylinder groups will change between now and when we 253 * suspend operations. Thus, we will be able to quickly 254 * touch up the few cylinder groups that changed during 255 * the suspension period. 256 */ 257 len = howmany(fs->fs_ncg, NBBY); 258 MALLOC(fs->fs_active, u_char *, len, M_DEVBUF, M_WAITOK | M_ZERO); 259 for (cg = 0; cg < fs->fs_ncg; cg++) { 260 if ((error = VOP_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 261 fs->fs_bsize, KERNCRED, 0, &nbp)) != 0) 262 goto out; 263 error = cgaccount(cg, vp, nbp->b_data, 1); 264 bawrite(nbp); 265 if (error) 266 goto out; 267 } 268 /* 269 * Change inode to snapshot type file. 270 */ 271 ip->i_flags |= SF_SNAPSHOT; 272 DIP_ASSIGN(ip, flags, ip->i_flags); 273 ip->i_flag |= IN_CHANGE | IN_UPDATE; 274 /* 275 * Ensure that the snapshot is completely on disk. 276 * Since we have marked it as a snapshot it is safe to 277 * unlock it as no process will be allowed to write to it. 278 */ 279 if ((error = VOP_FSYNC(vp, KERNCRED, FSYNC_WAIT, 0, 0, p)) != 0) 280 goto out; 281 VOP_UNLOCK(vp, 0); 282 /* 283 * All allocations are done, so we can now snapshot the system. 284 * 285 * Suspend operation on filesystem. 286 */ 287 if ((error = vfs_write_suspend(vp->v_mount, PUSER|PCATCH, 0)) != 0) { 288 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 289 goto out; 290 } 291 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 292 microtime(&starttime); 293 /* 294 * First, copy all the cylinder group maps that have changed. 295 */ 296 for (cg = 0; cg < fs->fs_ncg; cg++) { 297 if (ACTIVECG_ISSET(fs, cg)) 298 continue; 299 redo++; 300 if ((error = VOP_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 301 fs->fs_bsize, KERNCRED, 0, &nbp)) != 0) 302 goto out1; 303 error = cgaccount(cg, vp, nbp->b_data, 2); 304 bawrite(nbp); 305 if (error) 306 goto out1; 307 } 308 /* 309 * Grab a copy of the superblock and its summary information. 310 * We delay writing it until the suspension is released below. 311 */ 312 sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 313 loc = blkoff(fs, fs->fs_sblockloc); 314 if (loc > 0) 315 bzero(&sbbuf[0], loc); 316 copy_fs = (struct fs *)(sbbuf + loc); 317 bcopy(fs, copy_fs, fs->fs_sbsize); 318 size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; 319 if (fs->fs_sbsize < size) 320 bzero(&sbbuf[loc + fs->fs_sbsize], size - fs->fs_sbsize); 321 size = blkroundup(fs, fs->fs_cssize); 322 if (fs->fs_contigsumsize > 0) 323 size += fs->fs_ncg * sizeof(int32_t); 324 space = malloc((u_long)size, M_UFSMNT, M_WAITOK); 325 copy_fs->fs_csp = space; 326 bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); 327 space = (char *)space + fs->fs_cssize; 328 loc = howmany(fs->fs_cssize, fs->fs_fsize); 329 i = fs->fs_frag - loc % fs->fs_frag; 330 len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; 331 if (len > 0) { 332 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), 333 len, KERNCRED, &bp)) != 0) { 334 brelse(bp); 335 free(copy_fs->fs_csp, M_UFSMNT); 336 goto out1; 337 } 338 bcopy(bp->b_data, space, (u_int)len); 339 space = (char *)space + len; 340 bp->b_flags |= B_INVAL | B_NOCACHE; 341 brelse(bp); 342 } 343 if (fs->fs_contigsumsize > 0) { 344 copy_fs->fs_maxcluster = lp = space; 345 for (i = 0; i < fs->fs_ncg; i++) 346 *lp++ = fs->fs_contigsumsize; 347 } 348 /* 349 * We must check for active files that have been unlinked 350 * (e.g., with a zero link count). We have to expunge all 351 * trace of these files from the snapshot so that they are 352 * not reclaimed prematurely by fsck or unnecessarily dumped. 353 * We turn off the MNTK_SUSPENDED flag to avoid a panic from 354 * spec_strategy about writing on a suspended filesystem. 355 * Note that we skip unlinked snapshot files as they will 356 * be handled separately below. 357 * 358 * We also calculate the needed size for the snapshot list. 359 */ 360 snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + 361 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; 362 MNT_ILOCK(mp); 363loop: 364 for (xvp = LIST_FIRST(&mp->mnt_vnodelist); xvp; xvp = nvp) { 365 /* 366 * Make sure this vnode wasn't reclaimed in getnewvnode(). 367 * Start over if it has (it won't be on the list anymore). 368 */ 369 if (xvp->v_mount != mp) 370 goto loop; 371 nvp = LIST_NEXT(xvp, v_mntvnodes); 372 VI_LOCK(xvp); 373 MNT_IUNLOCK(mp); 374 if ((xvp->v_flag & VXLOCK) || 375 xvp->v_usecount == 0 || xvp->v_type == VNON || 376 (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { 377 VI_UNLOCK(xvp); 378 MNT_ILOCK(mp); 379 continue; 380 } 381 if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) { 382 MNT_ILOCK(mp); 383 goto loop; 384 } 385#ifdef DEBUG 386 if (snapdebug) 387 vprint("ffs_snapshot: busy vnode", xvp); 388#endif 389 if (VOP_GETATTR(xvp, &vat, p->p_ucred, p) == 0 && 390 vat.va_nlink > 0) { 391 VOP_UNLOCK(xvp, 0); 392 MNT_ILOCK(mp); 393 continue; 394 } 395 xp = VTOI(xvp); 396 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { 397 VOP_UNLOCK(xvp, 0); 398 MNT_ILOCK(mp); 399 continue; 400 } 401 /* 402 * If there is a fragment, clear it here. 403 */ 404 blkno = 0; 405 loc = howmany(xp->i_size, fs->fs_bsize) - 1; 406 if (loc < NDADDR) { 407 len = fragroundup(fs, blkoff(fs, xp->i_size)); 408 if (len > 0 && len < fs->fs_bsize) { 409 ffs_blkfree(copy_fs, vp, db_get(xp, loc), 410 len, xp->i_number); 411 blkno = db_get(xp, loc); 412 db_assign(xp, loc, 0); 413 } 414 } 415 snaplistsize += 1; 416 if (xp->i_ump->um_fstype == UFS1) 417 error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, 418 BLK_NOCOPY); 419 else 420 error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, 421 BLK_NOCOPY); 422 if (blkno) 423 db_assign(xp, loc, blkno); 424 if (!error) 425 error = ffs_freefile(copy_fs, vp, xp->i_number, 426 xp->i_mode); 427 VOP_UNLOCK(xvp, 0); 428 if (error) { 429 free(copy_fs->fs_csp, M_UFSMNT); 430 goto out1; 431 } 432 MNT_ILOCK(mp); 433 } 434 MNT_IUNLOCK(mp); 435 /* 436 * If there already exist snapshots on this filesystem, grab a 437 * reference to their shared lock. If this is the first snapshot 438 * on this filesystem, we need to allocate a lock for the snapshots 439 * to share. In either case, acquire the snapshot lock and give 440 * up our original private lock. 441 */ 442 VI_LOCK(devvp); 443 if ((xp = TAILQ_FIRST(&ump->um_snapshots)) != NULL) { 444 struct lock *lkp; 445 446 lkp = ITOV(xp)->v_vnlock; 447 VI_UNLOCK(devvp); 448 VI_LOCK(vp); 449 vp->v_vnlock = lkp; 450 } else { 451 struct lock *lkp; 452 453 VI_UNLOCK(devvp); 454 MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT, 455 M_WAITOK); 456 lockinit(lkp, PVFS, "snaplk", 0, LK_CANRECURSE); 457 VI_LOCK(vp); 458 vp->v_vnlock = lkp; 459 } 460 vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY); 461 transferlockers(&vp->v_lock, vp->v_vnlock); 462 lockmgr(&vp->v_lock, LK_RELEASE, NULL); 463 /* 464 * If this is the first snapshot on this filesystem, then we need 465 * to allocate the space for the list of preallocated snapshot blocks. 466 * This list will be refined below, but this preliminary one will 467 * keep us out of deadlock until the full one is ready. 468 */ 469 if (xp == NULL) { 470 MALLOC(snapblklist, ufs2_daddr_t *, 471 snaplistsize * sizeof(ufs2_daddr_t), M_UFSMNT, M_WAITOK); 472 blkp = &snapblklist[1]; 473 *blkp++ = lblkno(fs, fs->fs_sblockloc); 474 blkno = fragstoblks(fs, fs->fs_csaddr); 475 for (cg = 0; cg < fs->fs_ncg; cg++) { 476 if (fragstoblks(fs, cgtod(fs, cg)) > blkno) 477 break; 478 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 479 } 480 len = howmany(fs->fs_cssize, fs->fs_bsize); 481 for (loc = 0; loc < len; loc++) 482 *blkp++ = blkno + loc; 483 for (; cg < fs->fs_ncg; cg++) 484 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 485 snapblklist[0] = blkp - snapblklist; 486 VI_LOCK(devvp); 487 if (ump->um_snapblklist != NULL) 488 panic("ffs_snapshot: non-empty list"); 489 ump->um_snapblklist = snapblklist; 490 VI_UNLOCK(devvp); 491 } 492 /* 493 * Record snapshot inode. Since this is the newest snapshot, 494 * it must be placed at the end of the list. 495 */ 496 VI_LOCK(devvp); 497 fs->fs_snapinum[snaploc] = ip->i_number; 498 if (ip->i_nextsnap.tqe_prev != 0) 499 panic("ffs_snapshot: %d already on list", ip->i_number); 500 TAILQ_INSERT_TAIL(&ump->um_snapshots, ip, i_nextsnap); 501 VI_UNLOCK(devvp); 502 if (xp == NULL) 503 vn_cow_establish(devvp, ffs_copyonwrite, devvp); 504 vp->v_flag |= VSYSTEM; 505out1: 506 /* 507 * Resume operation on filesystem. 508 */ 509 vfs_write_resume(vp->v_mount); 510 /* 511 * Set the mtime to the time the snapshot has been taken. 512 */ 513 TIMEVAL_TO_TIMESPEC(&starttime, &ts); 514 if (ctime) 515 *ctime = ts; 516 DIP_ASSIGN(ip, mtime, ts.tv_sec); 517 DIP_ASSIGN(ip, mtimensec, ts.tv_nsec); 518 ip->i_flag |= IN_CHANGE | IN_UPDATE; 519 520#ifdef DEBUG 521 if (starttime.tv_sec > 0) { 522 microtime(&endtime); 523 timersub(&endtime, &starttime, &endtime); 524 printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", 525 vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, 526 endtime.tv_usec / 1000, redo, fs->fs_ncg); 527 } 528#endif 529 if (error) 530 goto out; 531 /* 532 * Copy allocation information from all the snapshots in 533 * this snapshot and then expunge them from its view. 534 */ 535 TAILQ_FOREACH(xp, &ump->um_snapshots, i_nextsnap) { 536 if (xp == ip) 537 break; 538 if (xp->i_ump->um_fstype == UFS1) 539 error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, 540 BLK_SNAP); 541 else 542 error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, 543 BLK_SNAP); 544 if (error) { 545 fs->fs_snapinum[snaploc] = 0; 546 goto done; 547 } 548 } 549 /* 550 * Allocate space for the full list of preallocated snapshot blocks. 551 */ 552 MALLOC(snapblklist, ufs2_daddr_t *, snaplistsize * sizeof(ufs2_daddr_t), 553 M_UFSMNT, M_WAITOK); 554 ip->i_snapblklist = &snapblklist[1]; 555 /* 556 * Expunge the blocks used by the snapshots from the set of 557 * blocks marked as used in the snapshot bitmaps. Also, collect 558 * the list of allocated blocks in i_snapblklist. 559 */ 560 if (ip->i_ump->um_fstype == UFS1) 561 error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP); 562 else 563 error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP); 564 if (error) { 565 fs->fs_snapinum[snaploc] = 0; 566 FREE(snapblklist, M_UFSMNT); 567 goto done; 568 } 569 if (snaplistsize < ip->i_snapblklist - snapblklist) 570 panic("ffs_snapshot: list too small"); 571 snaplistsize = ip->i_snapblklist - snapblklist; 572 snapblklist[0] = snaplistsize; 573 ip->i_snapblklist = &snapblklist[0]; 574 /* 575 * Write out the list of allocated blocks to the end of the snapshot. 576 */ 577 for (i = 0; i < snaplistsize; i++) 578 snapblklist[i] = ufs_rw64(snapblklist[i], ns); 579 error = vn_rdwr(UIO_WRITE, vp, (caddr_t)snapblklist, 580 snaplistsize*sizeof(ufs2_daddr_t), lblktosize(fs, (off_t)numblks), 581 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, p->p_ucred, NULL, NULL); 582 for (i = 0; i < snaplistsize; i++) 583 snapblklist[i] = ufs_rw64(snapblklist[i], ns); 584 if (error) { 585 fs->fs_snapinum[snaploc] = 0; 586 FREE(snapblklist, M_UFSMNT); 587 goto done; 588 } 589 /* 590 * Write the superblock and its summary information 591 * to the snapshot. 592 */ 593 blkno = fragstoblks(fs, fs->fs_csaddr); 594 len = howmany(fs->fs_cssize, fs->fs_bsize); 595 space = copy_fs->fs_csp; 596#ifdef FFS_EI 597 if (ns) { 598 ffs_sb_swap(copy_fs, copy_fs); 599 ffs_csum_swap(space, space, fs->fs_cssize); 600 } 601#endif 602 for (loc = 0; loc < len; loc++) { 603 error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); 604 if (error) { 605 brelse(nbp); 606 fs->fs_snapinum[snaploc] = 0; 607 FREE(snapblklist, M_UFSMNT); 608 goto done; 609 } 610 bcopy(space, nbp->b_data, fs->fs_bsize); 611 space = (char *)space + fs->fs_bsize; 612 bawrite(nbp); 613 } 614 /* 615 * As this is the newest list, it is the most inclusive, so 616 * should replace the previous list. If this is the first snapshot 617 * free the preliminary list. 618 */ 619 VI_LOCK(devvp); 620 space = ump->um_snapblklist; 621 ump->um_snapblklist = snapblklist; 622 VI_UNLOCK(devvp); 623 if (TAILQ_FIRST(&ump->um_snapshots) == ip) 624 FREE(space, M_UFSMNT); 625done: 626 free(copy_fs->fs_csp, M_UFSMNT); 627 if (!error) { 628 error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, 629 KERNCRED, &nbp); 630 if (error) { 631 brelse(nbp); 632 fs->fs_snapinum[snaploc] = 0; 633 } 634 bcopy(sbbuf, nbp->b_data, fs->fs_bsize); 635 bawrite(nbp); 636 } 637out: 638 /* 639 * Invalidate and free all pages on the snapshot vnode. 640 * All metadata has been written through the buffer cache. 641 * Clean all dirty buffers now to avoid UBC inconsistencies. 642 */ 643 if (!error) { 644 simple_lock(&vp->v_interlock); 645 error = VOP_PUTPAGES(vp, 0, 0, 646 PGO_ALLPAGES|PGO_CLEANIT|PGO_SYNCIO|PGO_FREE); 647 } 648 if (!error) { 649 s = splbio(); 650 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 651 nbp = LIST_NEXT(bp, b_vnbufs); 652 simple_lock(&bp->b_interlock); 653 splx(s); 654 if ((bp->b_flags & (B_DELWRI|B_BUSY)) != B_DELWRI) 655 panic("ffs_snapshot: not dirty or busy, bp %p", 656 bp); 657 bp->b_flags |= B_BUSY|B_VFLUSH; 658 if (LIST_FIRST(&bp->b_dep) == NULL) 659 bp->b_flags |= B_NOCACHE; 660 simple_unlock(&bp->b_interlock); 661 bwrite(bp); 662 s = splbio(); 663 } 664 simple_lock(&global_v_numoutput_slock); 665 while (vp->v_numoutput) { 666 vp->v_flag |= VBWAIT; 667 ltsleep((caddr_t)&vp->v_numoutput, PRIBIO+1, 668 "snapflushbuf", 0, &global_v_numoutput_slock); 669 } 670 simple_unlock(&global_v_numoutput_slock); 671 splx(s); 672 } 673 if (sbbuf) 674 free(sbbuf, M_UFSMNT); 675 if (fs->fs_active != 0) { 676 FREE(fs->fs_active, M_DEVBUF); 677 fs->fs_active = 0; 678 } 679 mp->mnt_flag = flag; 680 if (error) 681 (void) VOP_TRUNCATE(vp, (off_t)0, 0, NOCRED, p); 682 else 683 vref(vp); 684 return (error); 685} 686 687/* 688 * Copy a cylinder group map. All the unallocated blocks are marked 689 * BLK_NOCOPY so that the snapshot knows that it need not copy them 690 * if they are later written. If passno is one, then this is a first 691 * pass, so only setting needs to be done. If passno is 2, then this 692 * is a revision to a previous pass which must be undone as the 693 * replacement pass is done. 694 */ 695static int 696cgaccount(cg, vp, data, passno) 697 int cg; 698 struct vnode *vp; 699 caddr_t data; 700 int passno; 701{ 702 struct buf *bp, *ibp; 703 struct inode *ip; 704 struct cg *cgp; 705 struct fs *fs; 706 ufs2_daddr_t base, numblks; 707 int error, len, loc, ns, indiroff; 708 709 ip = VTOI(vp); 710 fs = ip->i_fs; 711 ns = UFS_FSNEEDSWAP(fs); 712 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), 713 (int)fs->fs_cgsize, KERNCRED, &bp); 714 if (error) { 715 brelse(bp); 716 return (error); 717 } 718 cgp = (struct cg *)bp->b_data; 719 if (!cg_chkmagic(cgp, ns)) { 720 brelse(bp); 721 return (EIO); 722 } 723 ACTIVECG_SET(fs, cg); 724 725 bcopy(bp->b_data, data, fs->fs_cgsize); 726 brelse(bp); 727 if (fs->fs_cgsize < fs->fs_bsize) 728 bzero(&data[fs->fs_cgsize], 729 fs->fs_bsize - fs->fs_cgsize); 730 numblks = howmany(fs->fs_size, fs->fs_frag); 731 len = howmany(fs->fs_fpg, fs->fs_frag); 732 base = cg * fs->fs_fpg / fs->fs_frag; 733 if (base + len >= numblks) 734 len = numblks - base - 1; 735 loc = 0; 736 if (base < NDADDR) { 737 for ( ; loc < NDADDR; loc++) { 738 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 739 db_assign(ip, loc, BLK_NOCOPY); 740 else if (db_get(ip, loc) == BLK_NOCOPY) { 741 if (passno == 2) 742 db_assign(ip, loc, 0); 743 else if (passno == 1) 744 panic("ffs_snapshot: lost direct block"); 745 } 746 } 747 } 748 if ((error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), 749 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0) 750 return (error); 751 indiroff = (base + loc - NDADDR) % NINDIR(fs); 752 for ( ; loc < len; loc++, indiroff++) { 753 if (indiroff >= NINDIR(fs)) { 754 bawrite(ibp); 755 if ((error = VOP_BALLOC(vp, 756 lblktosize(fs, (off_t)(base + loc)), 757 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0) 758 return (error); 759 indiroff = 0; 760 } 761 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 762 idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY); 763 else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) { 764 if (passno == 2) 765 idb_assign(ip, ibp->b_data, indiroff, 0); 766 else if (passno == 1) 767 panic("ffs_snapshot: lost indirect block"); 768 } 769 } 770 bdwrite(ibp); 771 return (0); 772} 773 774/* 775 * Before expunging a snapshot inode, note all the 776 * blocks that it claims with BLK_SNAP so that fsck will 777 * be able to account for those blocks properly and so 778 * that this snapshot knows that it need not copy them 779 * if the other snapshot holding them is freed. This code 780 * is reproduced once each for UFS1 and UFS2. 781 */ 782static int 783expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype) 784 struct vnode *snapvp; 785 struct inode *cancelip; 786 struct fs *fs; 787 int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 788 struct fs *, ufs_lbn_t, int); 789 int expungetype; 790{ 791 int i, s, error, ns, indiroff; 792 ufs_lbn_t lbn, rlbn; 793 ufs2_daddr_t len, blkno, numblks, blksperindir; 794 struct ufs1_dinode *dip; 795 struct buf *bp; 796 caddr_t bf; 797 798 ns = UFS_FSNEEDSWAP(fs); 799 /* 800 * Prepare to expunge the inode. If its inode block has not 801 * yet been copied, then allocate and fill the copy. 802 */ 803 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 804 blkno = 0; 805 if (lbn < NDADDR) { 806 blkno = db_get(VTOI(snapvp), lbn); 807 } else { 808 s = cow_enter(); 809 error = VOP_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 810 fs->fs_bsize, KERNCRED, B_METAONLY, &bp); 811 cow_leave(s); 812 if (error) 813 return (error); 814 indiroff = (lbn - NDADDR) % NINDIR(fs); 815 blkno = idb_get(VTOI(snapvp), bp->b_data, indiroff); 816 brelse(bp); 817 } 818 bf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 819 if (blkno != 0) 820 error = readvnblk(snapvp, bf, lbn); 821 else 822 error = readfsblk(snapvp, bf, lbn); 823 if (error) { 824 free(bf, M_UFSMNT); 825 return error; 826 } 827 /* 828 * Set a snapshot inode to be a zero length file, regular files 829 * to be completely unallocated. 830 */ 831 dip = (struct ufs1_dinode *)bf + ino_to_fsbo(fs, cancelip->i_number); 832 if (expungetype == BLK_NOCOPY) 833 dip->di_mode = 0; 834 dip->di_size = 0; 835 dip->di_blocks = 0; 836 dip->di_flags = 837 ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns); 838 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t)); 839 error = writevnblk(snapvp, bf, lbn); 840 free(bf, M_UFSMNT); 841 if (error) 842 return error; 843 /* 844 * Now go through and expunge all the blocks in the file 845 * using the function requested. 846 */ 847 numblks = howmany(cancelip->i_size, fs->fs_bsize); 848 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_db[0], 849 &cancelip->i_ffs1_db[NDADDR], fs, 0, expungetype))) 850 return (error); 851 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_ib[0], 852 &cancelip->i_ffs1_ib[NIADDR], fs, -1, expungetype))) 853 return (error); 854 blksperindir = 1; 855 lbn = -NDADDR; 856 len = numblks - NDADDR; 857 rlbn = NDADDR; 858 for (i = 0; len > 0 && i < NIADDR; i++) { 859 error = indiracct_ufs1(snapvp, ITOV(cancelip), i, 860 ufs_rw32(cancelip->i_ffs1_ib[i], ns), lbn, rlbn, len, 861 blksperindir, fs, acctfunc, expungetype); 862 if (error) 863 return (error); 864 blksperindir *= NINDIR(fs); 865 lbn -= blksperindir + 1; 866 len -= blksperindir; 867 rlbn += blksperindir; 868 } 869 return (0); 870} 871 872/* 873 * Descend an indirect block chain for vnode cancelvp accounting for all 874 * its indirect blocks in snapvp. 875 */ 876static int 877indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, 878 blksperindir, fs, acctfunc, expungetype) 879 struct vnode *snapvp; 880 struct vnode *cancelvp; 881 int level; 882 ufs1_daddr_t blkno; 883 ufs_lbn_t lbn; 884 ufs_lbn_t rlbn; 885 ufs_lbn_t remblks; 886 ufs_lbn_t blksperindir; 887 struct fs *fs; 888 int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 889 struct fs *, ufs_lbn_t, int); 890 int expungetype; 891{ 892 int error, ns, num, i; 893 ufs_lbn_t subblksperindir; 894 struct indir indirs[NIADDR + 2]; 895 ufs1_daddr_t last, *bap; 896 struct buf *bp; 897 898 ns = UFS_FSNEEDSWAP(fs); 899 900 if (blkno == 0) { 901 if (expungetype == BLK_NOCOPY) 902 return (0); 903 panic("indiracct_ufs1: missing indir"); 904 } 905 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 906 return (error); 907 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 908 panic("indiracct_ufs1: botched params"); 909 /* 910 * We have to expand bread here since it will deadlock looking 911 * up the block number for any blocks that are not in the cache. 912 */ 913 bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0); 914 bp->b_blkno = fsbtodb(fs, blkno); 915 if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && 916 (error = readfsblk(bp->b_vp, bp->b_data, fragstoblks(fs, blkno)))) { 917 brelse(bp); 918 return (error); 919 } 920 /* 921 * Account for the block pointers in this indirect block. 922 */ 923 last = howmany(remblks, blksperindir); 924 if (last > NINDIR(fs)) 925 last = NINDIR(fs); 926 MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); 927 bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 928 brelse(bp); 929 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 930 level == 0 ? rlbn : -1, expungetype); 931 if (error || level == 0) 932 goto out; 933 /* 934 * Account for the block pointers in each of the indirect blocks 935 * in the levels below us. 936 */ 937 subblksperindir = blksperindir / NINDIR(fs); 938 for (lbn++, level--, i = 0; i < last; i++) { 939 error = indiracct_ufs1(snapvp, cancelvp, level, 940 ufs_rw32(bap[i], ns), lbn, rlbn, remblks, subblksperindir, 941 fs, acctfunc, expungetype); 942 if (error) 943 goto out; 944 rlbn += blksperindir; 945 lbn -= blksperindir; 946 remblks -= blksperindir; 947 } 948out: 949 FREE(bap, M_DEVBUF); 950 return (error); 951} 952 953/* 954 * Do both snap accounting and map accounting. 955 */ 956static int 957fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype) 958 struct vnode *vp; 959 ufs1_daddr_t *oldblkp, *lastblkp; 960 struct fs *fs; 961 ufs_lbn_t lblkno; 962 int exptype; /* BLK_SNAP or BLK_NOCOPY */ 963{ 964 int error; 965 966 if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 967 return (error); 968 return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 969} 970 971/* 972 * Identify a set of blocks allocated in a snapshot inode. 973 */ 974static int 975snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 976 struct vnode *vp; 977 ufs1_daddr_t *oldblkp, *lastblkp; 978 struct fs *fs; 979 ufs_lbn_t lblkno; 980 int expungetype; /* BLK_SNAP or BLK_NOCOPY */ 981{ 982 struct inode *ip = VTOI(vp); 983 ufs1_daddr_t blkno, *blkp; 984 ufs_lbn_t lbn; 985 struct buf *ibp; 986 int error, ns; 987 988 ns = UFS_FSNEEDSWAP(fs); 989 990 for ( ; oldblkp < lastblkp; oldblkp++) { 991 blkno = ufs_rw32(*oldblkp, ns); 992 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 993 continue; 994 lbn = fragstoblks(fs, blkno); 995 if (lbn < NDADDR) { 996 blkp = &ip->i_ffs1_db[lbn]; 997 ip->i_flag |= IN_CHANGE | IN_UPDATE; 998 } else { 999 error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), 1000 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 1001 if (error) 1002 return (error); 1003 blkp = &((ufs1_daddr_t *)(ibp->b_data)) 1004 [(lbn - NDADDR) % NINDIR(fs)]; 1005 } 1006 /* 1007 * If we are expunging a snapshot vnode and we 1008 * find a block marked BLK_NOCOPY, then it is 1009 * one that has been allocated to this snapshot after 1010 * we took our current snapshot and can be ignored. 1011 */ 1012 blkno = ufs_rw32(*blkp, ns); 1013 if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) { 1014 if (lbn >= NDADDR) 1015 brelse(ibp); 1016 } else { 1017 if (blkno != 0) 1018 panic("snapacct_ufs1: bad block"); 1019 *blkp = ufs_rw32(expungetype, ns); 1020 if (lbn >= NDADDR) 1021 bdwrite(ibp); 1022 } 1023 } 1024 return (0); 1025} 1026 1027/* 1028 * Account for a set of blocks allocated in a snapshot inode. 1029 */ 1030static int 1031mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 1032 struct vnode *vp; 1033 ufs1_daddr_t *oldblkp, *lastblkp; 1034 struct fs *fs; 1035 ufs_lbn_t lblkno; 1036 int expungetype; 1037{ 1038 ufs1_daddr_t blkno; 1039 struct inode *ip; 1040 ino_t inum; 1041 int acctit, ns; 1042 1043 ns = UFS_FSNEEDSWAP(fs); 1044 ip = VTOI(vp); 1045 inum = ip->i_number; 1046 if (lblkno == -1) 1047 acctit = 0; 1048 else 1049 acctit = 1; 1050 for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { 1051 blkno = ufs_rw32(*oldblkp, ns); 1052 if (blkno == 0 || blkno == BLK_NOCOPY) 1053 continue; 1054 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) 1055 *ip->i_snapblklist++ = lblkno; 1056 if (blkno == BLK_SNAP) 1057 blkno = blkstofrags(fs, lblkno); 1058 ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum); 1059 } 1060 return (0); 1061} 1062 1063/* 1064 * Before expunging a snapshot inode, note all the 1065 * blocks that it claims with BLK_SNAP so that fsck will 1066 * be able to account for those blocks properly and so 1067 * that this snapshot knows that it need not copy them 1068 * if the other snapshot holding them is freed. This code 1069 * is reproduced once each for UFS1 and UFS2. 1070 */ 1071static int 1072expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype) 1073 struct vnode *snapvp; 1074 struct inode *cancelip; 1075 struct fs *fs; 1076 int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 1077 struct fs *, ufs_lbn_t, int); 1078 int expungetype; 1079{ 1080 int i, s, error, ns, indiroff; 1081 ufs_lbn_t lbn, rlbn; 1082 ufs2_daddr_t len, blkno, numblks, blksperindir; 1083 struct ufs2_dinode *dip; 1084 struct buf *bp; 1085 caddr_t bf; 1086 1087 ns = UFS_FSNEEDSWAP(fs); 1088 /* 1089 * Prepare to expunge the inode. If its inode block has not 1090 * yet been copied, then allocate and fill the copy. 1091 */ 1092 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 1093 blkno = 0; 1094 if (lbn < NDADDR) { 1095 blkno = db_get(VTOI(snapvp), lbn); 1096 } else { 1097 s = cow_enter(); 1098 error = VOP_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 1099 fs->fs_bsize, KERNCRED, B_METAONLY, &bp); 1100 cow_leave(s); 1101 if (error) 1102 return (error); 1103 indiroff = (lbn - NDADDR) % NINDIR(fs); 1104 blkno = idb_get(VTOI(snapvp), bp->b_data, indiroff); 1105 brelse(bp); 1106 } 1107 bf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 1108 if (blkno != 0) 1109 error = readvnblk(snapvp, bf, lbn); 1110 else 1111 error = readfsblk(snapvp, bf, lbn); 1112 if (error) { 1113 free(bf, M_UFSMNT); 1114 return error; 1115 } 1116 /* 1117 * Set a snapshot inode to be a zero length file, regular files 1118 * to be completely unallocated. 1119 */ 1120 dip = (struct ufs2_dinode *)bf + ino_to_fsbo(fs, cancelip->i_number); 1121 if (expungetype == BLK_NOCOPY) 1122 dip->di_mode = 0; 1123 dip->di_size = 0; 1124 dip->di_blocks = 0; 1125 dip->di_flags = 1126 ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns); 1127 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t)); 1128 error = writevnblk(snapvp, bf, lbn); 1129 free(bf, M_UFSMNT); 1130 if (error) 1131 return error; 1132 /* 1133 * Now go through and expunge all the blocks in the file 1134 * using the function requested. 1135 */ 1136 numblks = howmany(cancelip->i_size, fs->fs_bsize); 1137 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_db[0], 1138 &cancelip->i_ffs2_db[NDADDR], fs, 0, expungetype))) 1139 return (error); 1140 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_ib[0], 1141 &cancelip->i_ffs2_ib[NIADDR], fs, -1, expungetype))) 1142 return (error); 1143 blksperindir = 1; 1144 lbn = -NDADDR; 1145 len = numblks - NDADDR; 1146 rlbn = NDADDR; 1147 for (i = 0; len > 0 && i < NIADDR; i++) { 1148 error = indiracct_ufs2(snapvp, ITOV(cancelip), i, 1149 ufs_rw64(cancelip->i_ffs2_ib[i], ns), lbn, rlbn, len, 1150 blksperindir, fs, acctfunc, expungetype); 1151 if (error) 1152 return (error); 1153 blksperindir *= NINDIR(fs); 1154 lbn -= blksperindir + 1; 1155 len -= blksperindir; 1156 rlbn += blksperindir; 1157 } 1158 return (0); 1159} 1160 1161/* 1162 * Descend an indirect block chain for vnode cancelvp accounting for all 1163 * its indirect blocks in snapvp. 1164 */ 1165static int 1166indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, 1167 blksperindir, fs, acctfunc, expungetype) 1168 struct vnode *snapvp; 1169 struct vnode *cancelvp; 1170 int level; 1171 ufs2_daddr_t blkno; 1172 ufs_lbn_t lbn; 1173 ufs_lbn_t rlbn; 1174 ufs_lbn_t remblks; 1175 ufs_lbn_t blksperindir; 1176 struct fs *fs; 1177 int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 1178 struct fs *, ufs_lbn_t, int); 1179 int expungetype; 1180{ 1181 int error, ns, num, i; 1182 ufs_lbn_t subblksperindir; 1183 struct indir indirs[NIADDR + 2]; 1184 ufs2_daddr_t last, *bap; 1185 struct buf *bp; 1186 1187 ns = UFS_FSNEEDSWAP(fs); 1188 1189 if (blkno == 0) { 1190 if (expungetype == BLK_NOCOPY) 1191 return (0); 1192 panic("indiracct_ufs2: missing indir"); 1193 } 1194 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 1195 return (error); 1196 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 1197 panic("indiracct_ufs2: botched params"); 1198 /* 1199 * We have to expand bread here since it will deadlock looking 1200 * up the block number for any blocks that are not in the cache. 1201 */ 1202 bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0); 1203 bp->b_blkno = fsbtodb(fs, blkno); 1204 if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && 1205 (error = readfsblk(bp->b_vp, bp->b_data, fragstoblks(fs, blkno)))) { 1206 brelse(bp); 1207 return (error); 1208 } 1209 /* 1210 * Account for the block pointers in this indirect block. 1211 */ 1212 last = howmany(remblks, blksperindir); 1213 if (last > NINDIR(fs)) 1214 last = NINDIR(fs); 1215 MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); 1216 bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 1217 brelse(bp); 1218 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 1219 level == 0 ? rlbn : -1, expungetype); 1220 if (error || level == 0) 1221 goto out; 1222 /* 1223 * Account for the block pointers in each of the indirect blocks 1224 * in the levels below us. 1225 */ 1226 subblksperindir = blksperindir / NINDIR(fs); 1227 for (lbn++, level--, i = 0; i < last; i++) { 1228 error = indiracct_ufs2(snapvp, cancelvp, level, 1229 ufs_rw64(bap[i], ns), lbn, rlbn, remblks, subblksperindir, 1230 fs, acctfunc, expungetype); 1231 if (error) 1232 goto out; 1233 rlbn += blksperindir; 1234 lbn -= blksperindir; 1235 remblks -= blksperindir; 1236 } 1237out: 1238 FREE(bap, M_DEVBUF); 1239 return (error); 1240} 1241 1242/* 1243 * Do both snap accounting and map accounting. 1244 */ 1245static int 1246fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype) 1247 struct vnode *vp; 1248 ufs2_daddr_t *oldblkp, *lastblkp; 1249 struct fs *fs; 1250 ufs_lbn_t lblkno; 1251 int exptype; /* BLK_SNAP or BLK_NOCOPY */ 1252{ 1253 int error; 1254 1255 if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 1256 return (error); 1257 return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 1258} 1259 1260/* 1261 * Identify a set of blocks allocated in a snapshot inode. 1262 */ 1263static int 1264snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 1265 struct vnode *vp; 1266 ufs2_daddr_t *oldblkp, *lastblkp; 1267 struct fs *fs; 1268 ufs_lbn_t lblkno; 1269 int expungetype; /* BLK_SNAP or BLK_NOCOPY */ 1270{ 1271 struct inode *ip = VTOI(vp); 1272 ufs2_daddr_t blkno, *blkp; 1273 ufs_lbn_t lbn; 1274 struct buf *ibp; 1275 int error, ns; 1276 1277 ns = UFS_FSNEEDSWAP(fs); 1278 1279 for ( ; oldblkp < lastblkp; oldblkp++) { 1280 blkno = ufs_rw64(*oldblkp, ns); 1281 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 1282 continue; 1283 lbn = fragstoblks(fs, blkno); 1284 if (lbn < NDADDR) { 1285 blkp = &ip->i_ffs2_db[lbn]; 1286 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1287 } else { 1288 error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), 1289 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 1290 if (error) 1291 return (error); 1292 blkp = &((ufs2_daddr_t *)(ibp->b_data)) 1293 [(lbn - NDADDR) % NINDIR(fs)]; 1294 } 1295 /* 1296 * If we are expunging a snapshot vnode and we 1297 * find a block marked BLK_NOCOPY, then it is 1298 * one that has been allocated to this snapshot after 1299 * we took our current snapshot and can be ignored. 1300 */ 1301 blkno = ufs_rw64(*blkp, ns); 1302 if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) { 1303 if (lbn >= NDADDR) 1304 brelse(ibp); 1305 } else { 1306 if (blkno != 0) 1307 panic("snapacct_ufs2: bad block"); 1308 *blkp = ufs_rw64(expungetype, ns); 1309 if (lbn >= NDADDR) 1310 bdwrite(ibp); 1311 } 1312 } 1313 return (0); 1314} 1315 1316/* 1317 * Account for a set of blocks allocated in a snapshot inode. 1318 */ 1319static int 1320mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 1321 struct vnode *vp; 1322 ufs2_daddr_t *oldblkp, *lastblkp; 1323 struct fs *fs; 1324 ufs_lbn_t lblkno; 1325 int expungetype; 1326{ 1327 ufs2_daddr_t blkno; 1328 struct inode *ip; 1329 ino_t inum; 1330 int acctit, ns; 1331 1332 ns = UFS_FSNEEDSWAP(fs); 1333 ip = VTOI(vp); 1334 inum = ip->i_number; 1335 if (lblkno == -1) 1336 acctit = 0; 1337 else 1338 acctit = 1; 1339 for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { 1340 blkno = ufs_rw64(*oldblkp, ns); 1341 if (blkno == 0 || blkno == BLK_NOCOPY) 1342 continue; 1343 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) 1344 *ip->i_snapblklist++ = lblkno; 1345 if (blkno == BLK_SNAP) 1346 blkno = blkstofrags(fs, lblkno); 1347 ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum); 1348 } 1349 return (0); 1350} 1351#endif /* defined(FFS_NO_SNAPSHOT) */ 1352 1353/* 1354 * Decrement extra reference on snapshot when last name is removed. 1355 * It will not be freed until the last open reference goes away. 1356 */ 1357void 1358ffs_snapgone(ip) 1359 struct inode *ip; 1360{ 1361 struct ufsmount *ump = VFSTOUFS(ip->i_devvp->v_specmountpoint); 1362 struct inode *xp; 1363 struct fs *fs; 1364 int snaploc; 1365 1366 /* 1367 * Find snapshot in incore list. 1368 */ 1369 TAILQ_FOREACH(xp, &ump->um_snapshots, i_nextsnap) 1370 if (xp == ip) 1371 break; 1372 if (xp != NULL) 1373 vrele(ITOV(ip)); 1374#ifdef DEBUG 1375 else if (snapdebug) 1376 printf("ffs_snapgone: lost snapshot vnode %d\n", 1377 ip->i_number); 1378#endif 1379 /* 1380 * Delete snapshot inode from superblock. Keep list dense. 1381 */ 1382 fs = ip->i_fs; 1383 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 1384 if (fs->fs_snapinum[snaploc] == ip->i_number) 1385 break; 1386 if (snaploc < FSMAXSNAP) { 1387 for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { 1388 if (fs->fs_snapinum[snaploc] == 0) 1389 break; 1390 fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; 1391 } 1392 fs->fs_snapinum[snaploc - 1] = 0; 1393 } 1394} 1395 1396/* 1397 * Prepare a snapshot file for being removed. 1398 */ 1399void 1400ffs_snapremove(vp) 1401 struct vnode *vp; 1402{ 1403 struct inode *ip = VTOI(vp), *xp; 1404 struct vnode *devvp = ip->i_devvp; 1405 struct fs *fs = ip->i_fs; 1406 struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint); 1407 struct lock *lkp; 1408 struct buf *ibp; 1409 ufs2_daddr_t numblks, blkno, dblk; 1410 int error, ns, loc, last; 1411 1412 ns = UFS_FSNEEDSWAP(fs); 1413 /* 1414 * If active, delete from incore list (this snapshot may 1415 * already have been in the process of being deleted, so 1416 * would not have been active). 1417 * 1418 * Clear copy-on-write flag if last snapshot. 1419 */ 1420 if (ip->i_nextsnap.tqe_prev != 0) { 1421 VI_LOCK(devvp); 1422 lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE, 1423 VI_MTX(devvp)); 1424 VI_LOCK(devvp); 1425 TAILQ_REMOVE(&ump->um_snapshots, ip, i_nextsnap); 1426 ip->i_nextsnap.tqe_prev = 0; 1427 lkp = vp->v_vnlock; 1428 vp->v_vnlock = &vp->v_lock; 1429 lockmgr(lkp, LK_RELEASE, NULL); 1430 if (TAILQ_FIRST(&ump->um_snapshots) != 0) { 1431 /* Roll back the list of preallocated blocks. */ 1432 xp = TAILQ_LAST(&ump->um_snapshots, inodelst); 1433 ump->um_snapblklist = xp->i_snapblklist; 1434 VI_UNLOCK(devvp); 1435 } else { 1436 ump->um_snapblklist = 0; 1437 lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp)); 1438 lockmgr(lkp, LK_RELEASE, NULL); 1439 vn_cow_disestablish(devvp, ffs_copyonwrite, devvp); 1440 FREE(lkp, M_UFSMNT); 1441 } 1442 FREE(ip->i_snapblklist, M_UFSMNT); 1443 ip->i_snapblklist = NULL; 1444 } 1445 /* 1446 * Clear all BLK_NOCOPY fields. Pass any block claims to other 1447 * snapshots that want them (see ffs_snapblkfree below). 1448 */ 1449 for (blkno = 1; blkno < NDADDR; blkno++) { 1450 dblk = db_get(ip, blkno); 1451 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 1452 db_assign(ip, blkno, 0); 1453 else if ((dblk == blkstofrags(fs, blkno) && 1454 ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, 1455 ip->i_number))) { 1456 DIP_ADD(ip, blocks, -btodb(fs->fs_bsize)); 1457 db_assign(ip, blkno, 0); 1458 } 1459 } 1460 numblks = howmany(ip->i_size, fs->fs_bsize); 1461 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 1462 error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), 1463 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 1464 if (error) 1465 continue; 1466 if (fs->fs_size - blkno > NINDIR(fs)) 1467 last = NINDIR(fs); 1468 else 1469 last = fs->fs_size - blkno; 1470 for (loc = 0; loc < last; loc++) { 1471 dblk = idb_get(ip, ibp->b_data, loc); 1472 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 1473 idb_assign(ip, ibp->b_data, loc, 0); 1474 else if (dblk == blkstofrags(fs, blkno) && 1475 ffs_snapblkfree(fs, ip->i_devvp, dblk, 1476 fs->fs_bsize, ip->i_number)) { 1477 DIP_ADD(ip, blocks, -btodb(fs->fs_bsize)); 1478 idb_assign(ip, ibp->b_data, loc, 0); 1479 } 1480 } 1481 bawrite(ibp); 1482 } 1483 /* 1484 * Clear snapshot flag and drop reference. 1485 */ 1486 ip->i_flags &= ~SF_SNAPSHOT; 1487 DIP_ASSIGN(ip, flags, ip->i_flags); 1488 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1489} 1490 1491/* 1492 * Notification that a block is being freed. Return zero if the free 1493 * should be allowed to proceed. Return non-zero if the snapshot file 1494 * wants to claim the block. The block will be claimed if it is an 1495 * uncopied part of one of the snapshots. It will be freed if it is 1496 * either a BLK_NOCOPY or has already been copied in all of the snapshots. 1497 * If a fragment is being freed, then all snapshots that care about 1498 * it must make a copy since a snapshot file can only claim full sized 1499 * blocks. Note that if more than one snapshot file maps the block, 1500 * we can pick one at random to claim it. Since none of the snapshots 1501 * can change, we are assurred that they will all see the same unmodified 1502 * image. When deleting a snapshot file (see ffs_snapremove above), we 1503 * must push any of these claimed blocks to one of the other snapshots 1504 * that maps it. These claimed blocks are easily identified as they will 1505 * have a block number equal to their logical block number within the 1506 * snapshot. A copied block can never have this property because they 1507 * must always have been allocated from a BLK_NOCOPY location. 1508 */ 1509int 1510ffs_snapblkfree(fs, devvp, bno, size, inum) 1511 struct fs *fs; 1512 struct vnode *devvp; 1513 ufs2_daddr_t bno; 1514 long size; 1515 ino_t inum; 1516{ 1517 struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint); 1518 struct buf *ibp; 1519 struct inode *ip; 1520 struct vnode *vp = NULL, *saved_vp = NULL; 1521 caddr_t saved_data = NULL; 1522 ufs_lbn_t lbn; 1523 ufs2_daddr_t blkno; 1524 int s, indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0; 1525 1526 lbn = fragstoblks(fs, bno); 1527retry: 1528 VI_LOCK(devvp); 1529 TAILQ_FOREACH(ip, &ump->um_snapshots, i_nextsnap) { 1530 vp = ITOV(ip); 1531 /* 1532 * Lookup block being written. 1533 */ 1534 if (lbn < NDADDR) { 1535 blkno = db_get(ip, lbn); 1536 } else { 1537 if (snapshot_locked == 0 && 1538 lockmgr(vp->v_vnlock, 1539 LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 1540 VI_MTX(devvp)) != 0) 1541 goto retry; 1542 snapshot_locked = 1; 1543 s = cow_enter(); 1544 error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), 1545 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 1546 cow_leave(s); 1547 if (error) 1548 break; 1549 indiroff = (lbn - NDADDR) % NINDIR(fs); 1550 blkno = idb_get(ip, ibp->b_data, indiroff); 1551 } 1552 /* 1553 * Check to see if block needs to be copied. 1554 */ 1555 if (blkno == 0) { 1556 /* 1557 * A block that we map is being freed. If it has not 1558 * been claimed yet, we will claim or copy it (below). 1559 */ 1560 claimedblk = 1; 1561 } else if (blkno == BLK_SNAP) { 1562 /* 1563 * No previous snapshot claimed the block, 1564 * so it will be freed and become a BLK_NOCOPY 1565 * (don't care) for us. 1566 */ 1567 if (claimedblk) 1568 panic("snapblkfree: inconsistent block type"); 1569 if (snapshot_locked == 0 && 1570 lockmgr(vp->v_vnlock, 1571 LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT, 1572 VI_MTX(devvp)) != 0) { 1573 if (lbn >= NDADDR) 1574 brelse(ibp); 1575 vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL); 1576 goto retry; 1577 } 1578 snapshot_locked = 1; 1579 if (lbn < NDADDR) { 1580 db_assign(ip, lbn, BLK_NOCOPY); 1581 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1582 } else { 1583 idb_assign(ip, ibp->b_data, indiroff, 1584 BLK_NOCOPY); 1585 bwrite(ibp); 1586 } 1587 continue; 1588 } else /* BLK_NOCOPY or default */ { 1589 /* 1590 * If the snapshot has already copied the block 1591 * (default), or does not care about the block, 1592 * it is not needed. 1593 */ 1594 if (lbn >= NDADDR) 1595 brelse(ibp); 1596 continue; 1597 } 1598 /* 1599 * If this is a full size block, we will just grab it 1600 * and assign it to the snapshot inode. Otherwise we 1601 * will proceed to copy it. See explanation for this 1602 * routine as to why only a single snapshot needs to 1603 * claim this block. 1604 */ 1605 if (snapshot_locked == 0 && 1606 lockmgr(vp->v_vnlock, 1607 LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT, 1608 VI_MTX(devvp)) != 0) { 1609 if (lbn >= NDADDR) 1610 brelse(ibp); 1611 vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL); 1612 goto retry; 1613 } 1614 snapshot_locked = 1; 1615 if (size == fs->fs_bsize) { 1616#ifdef DEBUG 1617 if (snapdebug) 1618 printf("%s %d lbn %" PRId64 " from inum %d\n", 1619 "Grabonremove: snapino", ip->i_number, 1620 lbn, inum); 1621#endif 1622 if (lbn < NDADDR) { 1623 db_assign(ip, lbn, bno); 1624 } else { 1625 idb_assign(ip, ibp->b_data, indiroff, bno); 1626 bwrite(ibp); 1627 } 1628 DIP_ADD(ip, blocks, btodb(size)); 1629 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1630 VOP_UNLOCK(vp, 0); 1631 return (1); 1632 } 1633 if (lbn >= NDADDR) 1634 brelse(ibp); 1635#ifdef DEBUG 1636 if (snapdebug) 1637 printf("%s%d lbn %" PRId64 " %s %d size %ld\n", 1638 "Copyonremove: snapino ", ip->i_number, 1639 lbn, "for inum", inum, size); 1640#endif 1641 /* 1642 * If we have already read the old block contents, then 1643 * simply copy them to the new block. Note that we need 1644 * to synchronously write snapshots that have not been 1645 * unlinked, and hence will be visible after a crash, 1646 * to ensure their integrity. 1647 */ 1648 if (saved_data) { 1649 error = writevnblk(vp, saved_data, lbn); 1650 if (error) 1651 break; 1652 continue; 1653 } 1654 /* 1655 * Otherwise, read the old block contents into the buffer. 1656 */ 1657 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 1658 saved_vp = vp; 1659 if ((error = readfsblk(vp, saved_data, lbn)) != 0) { 1660 free(saved_data, M_UFSMNT); 1661 saved_data = NULL; 1662 break; 1663 } 1664 } 1665 /* 1666 * Note that we need to synchronously write snapshots that 1667 * have not been unlinked, and hence will be visible after 1668 * a crash, to ensure their integrity. 1669 */ 1670 if (saved_data) { 1671 error = writevnblk(saved_vp, saved_data, lbn); 1672 free(saved_data, M_UFSMNT); 1673 } 1674 /* 1675 * If we have been unable to allocate a block in which to do 1676 * the copy, then return non-zero so that the fragment will 1677 * not be freed. Although space will be lost, the snapshot 1678 * will stay consistent. 1679 */ 1680 if (snapshot_locked) 1681 VOP_UNLOCK(vp, 0); 1682 else 1683 VI_UNLOCK(devvp); 1684 return (error); 1685} 1686 1687/* 1688 * Associate snapshot files when mounting. 1689 */ 1690void 1691ffs_snapshot_mount(mp) 1692 struct mount *mp; 1693{ 1694 struct ufsmount *ump = VFSTOUFS(mp); 1695 struct vnode *devvp = ump->um_devvp; 1696 struct fs *fs = ump->um_fs; 1697 struct proc *p = curproc; 1698 struct vnode *vp; 1699 struct inode *ip, *xp; 1700 ufs2_daddr_t snaplistsize, *snapblklist; 1701 int i, error, ns, snaploc, loc; 1702 1703 ns = UFS_FSNEEDSWAP(fs); 1704 /* 1705 * XXX The following needs to be set before VOP_TRUNCATE or 1706 * VOP_READ can be called. 1707 */ 1708 mp->mnt_stat.f_iosize = fs->fs_bsize; 1709 /* 1710 * Process each snapshot listed in the superblock. 1711 */ 1712 vp = NULL; 1713 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { 1714 if (fs->fs_snapinum[snaploc] == 0) 1715 break; 1716 if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], 1717 &vp)) != 0) { 1718 printf("ffs_snapshot_mount: vget failed %d\n", error); 1719 continue; 1720 } 1721 ip = VTOI(vp); 1722 if ((ip->i_flags & SF_SNAPSHOT) == 0) { 1723 printf("ffs_snapshot_mount: non-snapshot inode %d\n", 1724 fs->fs_snapinum[snaploc]); 1725 vput(vp); 1726 vp = NULL; 1727 for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { 1728 if (fs->fs_snapinum[loc] == 0) 1729 break; 1730 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; 1731 } 1732 fs->fs_snapinum[loc - 1] = 0; 1733 snaploc--; 1734 continue; 1735 } 1736 1737 /* 1738 * Read the block hints list. Use an empty list on 1739 * read errors. 1740 */ 1741 error = vn_rdwr(UIO_READ, vp, 1742 (caddr_t)&snaplistsize, sizeof(snaplistsize), 1743 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), 1744 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, 1745 p->p_ucred, NULL, NULL); 1746 if (error) { 1747 printf("ffs_snapshot_mount: read_1 failed %d\n", error); 1748 snaplistsize = 1; 1749 } else 1750 snaplistsize = ufs_rw64(snaplistsize, ns); 1751 MALLOC(snapblklist, ufs2_daddr_t *, 1752 snaplistsize * sizeof(ufs2_daddr_t), M_UFSMNT, M_WAITOK); 1753 if (error) 1754 snapblklist[0] = 1; 1755 else { 1756 error = vn_rdwr(UIO_READ, vp, (caddr_t)snapblklist, 1757 snaplistsize * sizeof(ufs2_daddr_t), 1758 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), 1759 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, 1760 p->p_ucred, NULL, NULL); 1761 for (i = 0; i < snaplistsize; i++) 1762 snapblklist[i] = ufs_rw64(snapblklist[i], ns); 1763 if (error) { 1764 printf("ffs_snapshot_mount: read_2 failed %d\n", 1765 error); 1766 snapblklist[0] = 1; 1767 } 1768 } 1769 ip->i_snapblklist = &snapblklist[0]; 1770 1771 /* 1772 * If there already exist snapshots on this filesystem, grab a 1773 * reference to their shared lock. If this is the first snapshot 1774 * on this filesystem, we need to allocate a lock for the 1775 * snapshots to share. In either case, acquire the snapshot 1776 * lock and give up our original private lock. 1777 */ 1778 VI_LOCK(devvp); 1779 if ((xp = TAILQ_FIRST(&ump->um_snapshots)) != NULL) { 1780 struct lock *lkp; 1781 1782 lkp = ITOV(xp)->v_vnlock; 1783 VI_UNLOCK(devvp); 1784 VI_LOCK(vp); 1785 vp->v_vnlock = lkp; 1786 } else { 1787 struct lock *lkp; 1788 1789 VI_UNLOCK(devvp); 1790 MALLOC(lkp, struct lock *, sizeof(struct lock), 1791 M_UFSMNT, M_WAITOK); 1792 lockinit(lkp, PVFS, "snaplk", 0, LK_CANRECURSE); 1793 VI_LOCK(vp); 1794 vp->v_vnlock = lkp; 1795 } 1796 vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY); 1797 transferlockers(&vp->v_lock, vp->v_vnlock); 1798 lockmgr(&vp->v_lock, LK_RELEASE, NULL); 1799 /* 1800 * Link it onto the active snapshot list. 1801 */ 1802 VI_LOCK(devvp); 1803 if (ip->i_nextsnap.tqe_prev != 0) 1804 panic("ffs_snapshot_mount: %d already on list", 1805 ip->i_number); 1806 else 1807 TAILQ_INSERT_TAIL(&ump->um_snapshots, ip, i_nextsnap); 1808 vp->v_flag |= VSYSTEM; 1809 VI_UNLOCK(devvp); 1810 VOP_UNLOCK(vp, 0); 1811 } 1812 /* 1813 * No usable snapshots found. 1814 */ 1815 if (vp == NULL) 1816 return; 1817 /* 1818 * Attach the block hints list. We always want to 1819 * use the list from the newest snapshot. 1820 */ 1821 xp = TAILQ_LAST(&ump->um_snapshots, inodelst); 1822 VI_LOCK(devvp); 1823 ump->um_snapblklist = xp->i_snapblklist; 1824 VI_UNLOCK(devvp); 1825 vn_cow_establish(devvp, ffs_copyonwrite, devvp); 1826} 1827 1828/* 1829 * Disassociate snapshot files when unmounting. 1830 */ 1831void 1832ffs_snapshot_unmount(mp) 1833 struct mount *mp; 1834{ 1835 struct ufsmount *ump = VFSTOUFS(mp); 1836 struct vnode *devvp = ump->um_devvp; 1837 struct lock *lkp = NULL; 1838 struct inode *xp; 1839 struct vnode *vp; 1840 1841 VI_LOCK(devvp); 1842 while ((xp = TAILQ_FIRST(&ump->um_snapshots)) != 0) { 1843 vp = ITOV(xp); 1844 lkp = vp->v_vnlock; 1845 vp->v_vnlock = &vp->v_lock; 1846 TAILQ_REMOVE(&ump->um_snapshots, xp, i_nextsnap); 1847 xp->i_nextsnap.tqe_prev = 0; 1848 if (xp->i_snapblklist == ump->um_snapblklist) 1849 ump->um_snapblklist = NULL; 1850 VI_UNLOCK(devvp); 1851 FREE(xp->i_snapblklist, M_UFSMNT); 1852 if (xp->i_ffs_effnlink > 0) 1853 vrele(vp); 1854 VI_LOCK(devvp); 1855 } 1856 VI_UNLOCK(devvp); 1857 if (lkp != NULL) { 1858 vn_cow_disestablish(devvp, ffs_copyonwrite, devvp); 1859 FREE(lkp, M_UFSMNT); 1860 } 1861} 1862 1863/* 1864 * Check for need to copy block that is about to be written, 1865 * copying the block if necessary. 1866 */ 1867static int 1868ffs_copyonwrite(v, bp) 1869 void *v; 1870 struct buf *bp; 1871{ 1872 struct buf *ibp; 1873 struct fs *fs; 1874 struct inode *ip; 1875 struct vnode *devvp = v, *vp = 0, *saved_vp = NULL; 1876 struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint); 1877 caddr_t saved_data = NULL; 1878 ufs2_daddr_t lbn, blkno, *snapblklist; 1879 int lower, upper, mid, s, ns, indiroff, snapshot_locked = 0, error = 0; 1880 1881 /* 1882 * Check for valid snapshots. 1883 */ 1884 VI_LOCK(devvp); 1885 ip = TAILQ_FIRST(&ump->um_snapshots); 1886 if (ip == NULL) { 1887 VI_UNLOCK(devvp); 1888 return 0; 1889 } 1890 /* 1891 * First check to see if it is in the preallocated list. 1892 * By doing this check we avoid several potential deadlocks. 1893 */ 1894 fs = ip->i_fs; 1895 ns = UFS_FSNEEDSWAP(fs); 1896 lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); 1897 snapblklist = ump->um_snapblklist; 1898 upper = ump->um_snapblklist[0] - 1; 1899 lower = 1; 1900 while (lower <= upper) { 1901 mid = (lower + upper) / 2; 1902 if (snapblklist[mid] == lbn) 1903 break; 1904 if (snapblklist[mid] < lbn) 1905 lower = mid + 1; 1906 else 1907 upper = mid - 1; 1908 } 1909 if (lower <= upper) { 1910 VI_UNLOCK(devvp); 1911 return 0; 1912 } 1913 /* 1914 * Not in the precomputed list, so check the snapshots. 1915 */ 1916retry: 1917 TAILQ_FOREACH(ip, &ump->um_snapshots, i_nextsnap) { 1918 vp = ITOV(ip); 1919 /* 1920 * We ensure that everything of our own that needs to be 1921 * copied will be done at the time that ffs_snapshot is 1922 * called. Thus we can skip the check here which can 1923 * deadlock in doing the lookup in VOP_BALLOC. 1924 */ 1925 if (bp->b_vp == vp) 1926 continue; 1927 /* 1928 * Check to see if block needs to be copied. We do not have 1929 * to hold the snapshot lock while doing this lookup as it 1930 * will never require any additional allocations for the 1931 * snapshot inode. 1932 */ 1933 if (lbn < NDADDR) { 1934 blkno = db_get(ip, lbn); 1935 } else { 1936 if (snapshot_locked == 0 && 1937 lockmgr(vp->v_vnlock, 1938 LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 1939 VI_MTX(devvp)) != 0) { 1940 VI_LOCK(devvp); 1941 goto retry; 1942 } 1943 snapshot_locked = 1; 1944 s = cow_enter(); 1945 error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), 1946 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 1947 cow_leave(s); 1948 if (error) 1949 break; 1950 indiroff = (lbn - NDADDR) % NINDIR(fs); 1951 blkno = idb_get(ip, ibp->b_data, indiroff); 1952 brelse(ibp); 1953 } 1954#ifdef DIAGNOSTIC 1955 if (blkno == BLK_SNAP && bp->b_lblkno >= 0) 1956 panic("ffs_copyonwrite: bad copy block"); 1957#endif 1958 if (blkno != 0) 1959 continue; 1960#ifdef DIAGNOSTIC 1961 if (curlwp->l_flag & L_COWINPROGRESS) 1962 printf("ffs_copyonwrite: recursive call\n"); 1963#endif 1964 /* 1965 * Allocate the block into which to do the copy. Since 1966 * multiple processes may all try to copy the same block, 1967 * we have to recheck our need to do a copy if we sleep 1968 * waiting for the lock. 1969 * 1970 * Because all snapshots on a filesystem share a single 1971 * lock, we ensure that we will never be in competition 1972 * with another process to allocate a block. 1973 */ 1974 if (snapshot_locked == 0 && 1975 lockmgr(vp->v_vnlock, 1976 LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 1977 VI_MTX(devvp)) != 0) { 1978 VI_LOCK(devvp); 1979 goto retry; 1980 } 1981 snapshot_locked = 1; 1982#ifdef DEBUG 1983 if (snapdebug) { 1984 printf("Copyonwrite: snapino %d lbn %" PRId64 " for ", 1985 ip->i_number, lbn); 1986 if (bp->b_vp == devvp) 1987 printf("fs metadata"); 1988 else 1989 printf("inum %d", VTOI(bp->b_vp)->i_number); 1990 printf(" lblkno %" PRId64 "\n", bp->b_lblkno); 1991 } 1992#endif 1993 /* 1994 * If we have already read the old block contents, then 1995 * simply copy them to the new block. Note that we need 1996 * to synchronously write snapshots that have not been 1997 * unlinked, and hence will be visible after a crash, 1998 * to ensure their integrity. 1999 */ 2000 if (saved_data) { 2001 error = writevnblk(vp, saved_data, lbn); 2002 if (error) 2003 break; 2004 continue; 2005 } 2006 /* 2007 * Otherwise, read the old block contents into the buffer. 2008 */ 2009 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 2010 saved_vp = vp; 2011 if ((error = readfsblk(vp, saved_data, lbn)) != 0) { 2012 free(saved_data, M_UFSMNT); 2013 saved_data = NULL; 2014 break; 2015 } 2016 } 2017 /* 2018 * Note that we need to synchronously write snapshots that 2019 * have not been unlinked, and hence will be visible after 2020 * a crash, to ensure their integrity. 2021 */ 2022 if (saved_data) { 2023 error = writevnblk(saved_vp, saved_data, lbn); 2024 free(saved_data, M_UFSMNT); 2025 } 2026 if (snapshot_locked) 2027 VOP_UNLOCK(vp, 0); 2028 else 2029 VI_UNLOCK(devvp); 2030 return error; 2031} 2032 2033/* 2034 * Read the specified block from disk. Vp is usually a snapshot vnode. 2035 */ 2036static int 2037readfsblk(vp, data, lbn) 2038 struct vnode *vp; 2039 caddr_t data; 2040 ufs2_daddr_t lbn; 2041{ 2042 int s, error; 2043 struct inode *ip = VTOI(vp); 2044 struct fs *fs = ip->i_fs; 2045 struct buf *nbp; 2046 2047 s = splbio(); 2048 nbp = pool_get(&bufpool, PR_WAITOK); 2049 splx(s); 2050 2051 BUF_INIT(nbp); 2052 nbp->b_flags = B_READ; 2053 nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize; 2054 nbp->b_error = 0; 2055 nbp->b_data = data; 2056 nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn)); 2057 nbp->b_proc = NULL; 2058 nbp->b_dev = ip->i_devvp->v_rdev; 2059 nbp->b_vp = NULLVP; 2060 2061 DEV_STRATEGY(nbp); 2062 2063 error = biowait(nbp); 2064 2065 s = splbio(); 2066 pool_put(&bufpool, nbp); 2067 splx(s); 2068 2069 return error; 2070} 2071 2072/* 2073 * Read the specified block. Bypass UBC to prevent deadlocks. 2074 */ 2075static int 2076readvnblk(vp, data, lbn) 2077 struct vnode *vp; 2078 caddr_t data; 2079 ufs2_daddr_t lbn; 2080{ 2081 int error; 2082 daddr_t bn; 2083 off_t offset; 2084 struct inode *ip = VTOI(vp); 2085 struct fs *fs = ip->i_fs; 2086 2087 error = VOP_BMAP(vp, lbn, NULL, &bn, NULL); 2088 if (error) 2089 return error; 2090 2091 if (bn != (daddr_t)-1) { 2092 offset = dbtob(bn); 2093 simple_lock(&vp->v_interlock); 2094 error = VOP_PUTPAGES(vp, trunc_page(offset), 2095 round_page(offset+fs->fs_bsize), 2096 PGO_CLEANIT|PGO_SYNCIO|PGO_FREE); 2097 if (error) 2098 return error; 2099 2100 return readfsblk(vp, data, fragstoblks(fs, dbtofsb(fs, bn))); 2101 } 2102 2103 bzero(data, fs->fs_bsize); 2104 2105 return 0; 2106} 2107 2108/* 2109 * Write the specified block. Bypass UBC to prevent deadlocks. 2110 */ 2111static int 2112writevnblk(vp, data, lbn) 2113 struct vnode *vp; 2114 caddr_t data; 2115 ufs2_daddr_t lbn; 2116{ 2117 int s, error; 2118 off_t offset; 2119 struct buf *bp; 2120 struct inode *ip = VTOI(vp); 2121 struct fs *fs = ip->i_fs; 2122 2123 offset = lblktosize(fs, (off_t)lbn); 2124 s = cow_enter(); 2125 simple_lock(&vp->v_interlock); 2126 error = VOP_PUTPAGES(vp, trunc_page(offset), 2127 round_page(offset+fs->fs_bsize), PGO_CLEANIT|PGO_SYNCIO|PGO_FREE); 2128 if (error == 0) 2129 error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), 2130 fs->fs_bsize, KERNCRED, B_SYNC, &bp); 2131 cow_leave(s); 2132 if (error) 2133 return error; 2134 2135 bcopy(data, bp->b_data, fs->fs_bsize); 2136 bp->b_flags |= B_NOCACHE; 2137 2138 return bwrite(bp); 2139} 2140 2141/* 2142 * Set/reset lwp's L_COWINPROGRESS flag. 2143 * May be called recursive. 2144 */ 2145static inline int 2146cow_enter(void) 2147{ 2148 struct lwp *l = curlwp; 2149 2150 if (l->l_flag & L_COWINPROGRESS) { 2151 return 0; 2152 } else { 2153 l->l_flag |= L_COWINPROGRESS; 2154 return L_COWINPROGRESS; 2155 } 2156} 2157 2158static inline void 2159cow_leave(int flag) 2160{ 2161 struct lwp *l = curlwp; 2162 2163 l->l_flag &= ~flag; 2164} 2165 2166/* 2167 * Get/Put direct block from inode or buffer containing disk addresses. Take 2168 * care for fs type (UFS1/UFS2) and byte swapping. These functions should go 2169 * into a global include. 2170 */ 2171static inline ufs2_daddr_t 2172db_get(struct inode *ip, int loc) 2173{ 2174 if (ip->i_ump->um_fstype == UFS1) 2175 return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip)); 2176 else 2177 return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip)); 2178} 2179 2180static inline void 2181db_assign(struct inode *ip, int loc, ufs2_daddr_t val) 2182{ 2183 if (ip->i_ump->um_fstype == UFS1) 2184 ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2185 else 2186 ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2187} 2188 2189static inline ufs2_daddr_t 2190idb_get(struct inode *ip, caddr_t bf, int loc) 2191{ 2192 if (ip->i_ump->um_fstype == UFS1) 2193 return ufs_rw32(((ufs1_daddr_t *)(bf))[loc], 2194 UFS_IPNEEDSWAP(ip)); 2195 else 2196 return ufs_rw64(((ufs2_daddr_t *)(bf))[loc], 2197 UFS_IPNEEDSWAP(ip)); 2198} 2199 2200static inline void 2201idb_assign(struct inode *ip, caddr_t bf, int loc, ufs2_daddr_t val) 2202{ 2203 if (ip->i_ump->um_fstype == UFS1) 2204 ((ufs1_daddr_t *)(bf))[loc] = 2205 ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2206 else 2207 ((ufs2_daddr_t *)(bf))[loc] = 2208 ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2209} 2210