ffs_snapshot.c revision 1.32
1/* $NetBSD: ffs_snapshot.c,v 1.32 2006/09/29 19:37:11 christos Exp $ */ 2 3/* 4 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 5 * 6 * Further information about snapshots can be obtained from: 7 * 8 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 9 * 1614 Oxford Street mckusick@mckusick.com 10 * Berkeley, CA 94709-1608 +1-510-843-9542 11 * USA 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 23 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 24 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 25 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 26 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 27 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 * 35 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 36 * 37 * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp 38 */ 39 40#include <sys/cdefs.h> 41__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.32 2006/09/29 19:37:11 christos Exp $"); 42 43#if defined(_KERNEL_OPT) 44#include "opt_ffs.h" 45#endif 46 47#include <sys/param.h> 48#include <sys/kernel.h> 49#include <sys/systm.h> 50#include <sys/conf.h> 51#include <sys/buf.h> 52#include <sys/proc.h> 53#include <sys/namei.h> 54#include <sys/sched.h> 55#include <sys/stat.h> 56#include <sys/malloc.h> 57#include <sys/mount.h> 58#include <sys/resource.h> 59#include <sys/resourcevar.h> 60#include <sys/vnode.h> 61#include <sys/kauth.h> 62 63#include <miscfs/specfs/specdev.h> 64 65#include <ufs/ufs/quota.h> 66#include <ufs/ufs/ufsmount.h> 67#include <ufs/ufs/inode.h> 68#include <ufs/ufs/ufs_extern.h> 69#include <ufs/ufs/ufs_bswap.h> 70 71#include <ufs/ffs/fs.h> 72#include <ufs/ffs/ffs_extern.h> 73 74/* FreeBSD -> NetBSD conversion */ 75#define KERNCRED lwp0.l_cred 76#define ufs1_daddr_t int32_t 77#define ufs2_daddr_t int64_t 78#define ufs_lbn_t daddr_t 79#define VI_MTX(v) (&(v)->v_interlock) 80#define VI_LOCK(v) simple_lock(&(v)->v_interlock) 81#define VI_UNLOCK(v) simple_unlock(&(v)->v_interlock) 82#define MNT_ILOCK(v) simple_lock(&mntvnode_slock) 83#define MNT_IUNLOCK(v) simple_unlock(&mntvnode_slock) 84 85#if !defined(FFS_NO_SNAPSHOT) 86static int cgaccount(int, struct vnode *, caddr_t, int); 87static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, 88 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 89 ufs_lbn_t, int), int); 90static int indiracct_ufs1(struct vnode *, struct vnode *, int, 91 ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 92 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 93 ufs_lbn_t, int), int); 94static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 95 struct fs *, ufs_lbn_t, int); 96static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 97 struct fs *, ufs_lbn_t, int); 98static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 99 struct fs *, ufs_lbn_t, int); 100static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, 101 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 102 ufs_lbn_t, int), int); 103static int indiracct_ufs2(struct vnode *, struct vnode *, int, 104 ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 105 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 106 ufs_lbn_t, int), int); 107static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 108 struct fs *, ufs_lbn_t, int); 109static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 110 struct fs *, ufs_lbn_t, int); 111static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 112 struct fs *, ufs_lbn_t, int); 113#endif /* !defined(FFS_NO_SNAPSHOT) */ 114 115static int ffs_copyonwrite(void *, struct buf *); 116static int readfsblk(struct vnode *, caddr_t, ufs2_daddr_t); 117static int __unused readvnblk(struct vnode *, caddr_t, ufs2_daddr_t); 118static int writevnblk(struct vnode *, caddr_t, ufs2_daddr_t); 119static inline int cow_enter(void); 120static inline void cow_leave(int); 121static inline ufs2_daddr_t db_get(struct inode *, int); 122static inline void db_assign(struct inode *, int, ufs2_daddr_t); 123static inline ufs2_daddr_t idb_get(struct inode *, caddr_t, int); 124static inline void idb_assign(struct inode *, caddr_t, int, ufs2_daddr_t); 125 126#ifdef DEBUG 127static int snapdebug = 0; 128#endif 129 130/* 131 * Create a snapshot file and initialize it for the filesystem. 132 * Vnode is locked on entry and return. 133 */ 134int 135ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime) 136{ 137#if defined(FFS_NO_SNAPSHOT) 138 return EOPNOTSUPP; 139} 140#else /* defined(FFS_NO_SNAPSHOT) */ 141 ufs2_daddr_t numblks, blkno, *blkp, snaplistsize = 0, *snapblklist; 142 int error, ns, cg, snaploc; 143 int i, s, size, len, loc; 144 int flag = mp->mnt_flag; 145 struct timeval starttime; 146#ifdef DEBUG 147 struct timeval endtime; 148#endif 149 struct timespec ts; 150 long redo = 0; 151 int32_t *lp; 152 void *space; 153 caddr_t sbbuf = NULL; 154 struct ufsmount *ump = VFSTOUFS(mp); 155 struct fs *copy_fs = NULL, *fs = ump->um_fs; 156 struct lwp *l = curlwp; 157 struct inode *ip, *xp; 158 struct buf *bp, *ibp, *nbp; 159 struct vattr vat; 160 struct vnode *xvp, *nvp, *devvp; 161 162 ns = UFS_FSNEEDSWAP(fs); 163 /* 164 * Need to serialize access to snapshot code per filesystem. 165 */ 166 /* 167 * If the vnode already is a snapshot, return. 168 */ 169 if (VTOI(vp)->i_flags & SF_SNAPSHOT) { 170 if (ctime) { 171 ctime->tv_sec = DIP(VTOI(vp), mtime); 172 ctime->tv_nsec = DIP(VTOI(vp), mtimensec); 173 } 174 return 0; 175 } 176 /* 177 * Check mount, exclusive reference and owner. 178 */ 179 if (vp->v_mount != mp) 180 return EXDEV; 181 if (vp->v_usecount != 1 || vp->v_writecount != 0) 182 return EBUSY; 183 if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, 184 &l->l_acflag) != 0 && 185 VTOI(vp)->i_uid != kauth_cred_geteuid(l->l_cred)) 186 return EACCES; 187 188 if (vp->v_size != 0) { 189 error = ffs_truncate(vp, 0, 0, NOCRED, l); 190 if (error) 191 return error; 192 } 193 /* 194 * Assign a snapshot slot in the superblock. 195 */ 196 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 197 if (fs->fs_snapinum[snaploc] == 0) 198 break; 199 if (snaploc == FSMAXSNAP) 200 return (ENOSPC); 201 ip = VTOI(vp); 202 devvp = ip->i_devvp; 203 /* 204 * Write an empty list of preallocated blocks to the end of 205 * the snapshot to set size to at least that of the filesystem. 206 */ 207 numblks = howmany(fs->fs_size, fs->fs_frag); 208 blkno = 1; 209 blkno = ufs_rw64(blkno, ns); 210 error = vn_rdwr(UIO_WRITE, vp, 211 (caddr_t)&blkno, sizeof(blkno), lblktosize(fs, (off_t)numblks), 212 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL); 213 if (error) 214 goto out; 215 /* 216 * Preallocate critical data structures so that we can copy 217 * them in without further allocation after we suspend all 218 * operations on the filesystem. We would like to just release 219 * the allocated buffers without writing them since they will 220 * be filled in below once we are ready to go, but this upsets 221 * the soft update code, so we go ahead and write the new buffers. 222 * 223 * Allocate all indirect blocks and mark all of them as not 224 * needing to be copied. 225 */ 226 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 227 error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno), 228 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); 229 if (error) 230 goto out; 231 bawrite(ibp); 232 } 233 /* 234 * Allocate copies for the superblock and its summary information. 235 */ 236 error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 237 0, &nbp); 238 if (error) 239 goto out; 240 bawrite(nbp); 241 blkno = fragstoblks(fs, fs->fs_csaddr); 242 len = howmany(fs->fs_cssize, fs->fs_bsize); 243 for (loc = 0; loc < len; loc++) { 244 error = ffs_balloc(vp, lblktosize(fs, (off_t)(blkno + loc)), 245 fs->fs_bsize, KERNCRED, 0, &nbp); 246 if (error) 247 goto out; 248 bawrite(nbp); 249 } 250 /* 251 * Copy all the cylinder group maps. Although the 252 * filesystem is still active, we hope that only a few 253 * cylinder groups will change between now and when we 254 * suspend operations. Thus, we will be able to quickly 255 * touch up the few cylinder groups that changed during 256 * the suspension period. 257 */ 258 len = howmany(fs->fs_ncg, NBBY); 259 fs->fs_active = malloc(len, M_DEVBUF, M_WAITOK | M_ZERO); 260 for (cg = 0; cg < fs->fs_ncg; cg++) { 261 if ((error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)), 262 fs->fs_bsize, KERNCRED, 0, &nbp)) != 0) 263 goto out; 264 error = cgaccount(cg, vp, nbp->b_data, 1); 265 bawrite(nbp); 266 if (error) 267 goto out; 268 } 269 /* 270 * Change inode to snapshot type file. 271 */ 272 ip->i_flags |= SF_SNAPSHOT; 273 DIP_ASSIGN(ip, flags, ip->i_flags); 274 ip->i_flag |= IN_CHANGE | IN_UPDATE; 275 /* 276 * Ensure that the snapshot is completely on disk. 277 * Since we have marked it as a snapshot it is safe to 278 * unlock it as no process will be allowed to write to it. 279 */ 280 if ((error = VOP_FSYNC(vp, KERNCRED, FSYNC_WAIT, 0, 0, l)) != 0) 281 goto out; 282 VOP_UNLOCK(vp, 0); 283 /* 284 * All allocations are done, so we can now snapshot the system. 285 * 286 * Suspend operation on filesystem. 287 */ 288 if ((error = vfs_write_suspend(vp->v_mount, PUSER|PCATCH, 0)) != 0) { 289 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 290 goto out; 291 } 292 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 293 getmicrotime(&starttime); 294 /* 295 * First, copy all the cylinder group maps that have changed. 296 */ 297 for (cg = 0; cg < fs->fs_ncg; cg++) { 298 if (ACTIVECG_ISSET(fs, cg)) 299 continue; 300 redo++; 301 if ((error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)), 302 fs->fs_bsize, KERNCRED, 0, &nbp)) != 0) 303 goto out1; 304 error = cgaccount(cg, vp, nbp->b_data, 2); 305 bawrite(nbp); 306 if (error) 307 goto out1; 308 } 309 /* 310 * Grab a copy of the superblock and its summary information. 311 * We delay writing it until the suspension is released below. 312 */ 313 sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 314 loc = blkoff(fs, fs->fs_sblockloc); 315 if (loc > 0) 316 bzero(&sbbuf[0], loc); 317 copy_fs = (struct fs *)(sbbuf + loc); 318 bcopy(fs, copy_fs, fs->fs_sbsize); 319 size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; 320 if (fs->fs_sbsize < size) 321 bzero(&sbbuf[loc + fs->fs_sbsize], size - fs->fs_sbsize); 322 size = blkroundup(fs, fs->fs_cssize); 323 if (fs->fs_contigsumsize > 0) 324 size += fs->fs_ncg * sizeof(int32_t); 325 space = malloc((u_long)size, M_UFSMNT, M_WAITOK); 326 copy_fs->fs_csp = space; 327 bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); 328 space = (char *)space + fs->fs_cssize; 329 loc = howmany(fs->fs_cssize, fs->fs_fsize); 330 i = fs->fs_frag - loc % fs->fs_frag; 331 len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; 332 if (len > 0) { 333 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), 334 len, KERNCRED, &bp)) != 0) { 335 brelse(bp); 336 free(copy_fs->fs_csp, M_UFSMNT); 337 goto out1; 338 } 339 bcopy(bp->b_data, space, (u_int)len); 340 space = (char *)space + len; 341 bp->b_flags |= B_INVAL | B_NOCACHE; 342 brelse(bp); 343 } 344 if (fs->fs_contigsumsize > 0) { 345 copy_fs->fs_maxcluster = lp = space; 346 for (i = 0; i < fs->fs_ncg; i++) 347 *lp++ = fs->fs_contigsumsize; 348 } 349 /* 350 * We must check for active files that have been unlinked 351 * (e.g., with a zero link count). We have to expunge all 352 * trace of these files from the snapshot so that they are 353 * not reclaimed prematurely by fsck or unnecessarily dumped. 354 * We turn off the MNTK_SUSPENDED flag to avoid a panic from 355 * spec_strategy about writing on a suspended filesystem. 356 * Note that we skip unlinked snapshot files as they will 357 * be handled separately below. 358 * 359 * We also calculate the needed size for the snapshot list. 360 */ 361 snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + 362 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; 363 MNT_ILOCK(mp); 364loop: 365 for (xvp = LIST_FIRST(&mp->mnt_vnodelist); xvp; xvp = nvp) { 366 /* 367 * Make sure this vnode wasn't reclaimed in getnewvnode(). 368 * Start over if it has (it won't be on the list anymore). 369 */ 370 if (xvp->v_mount != mp) 371 goto loop; 372 nvp = LIST_NEXT(xvp, v_mntvnodes); 373 VI_LOCK(xvp); 374 MNT_IUNLOCK(mp); 375 if ((xvp->v_flag & VXLOCK) || 376 xvp->v_usecount == 0 || xvp->v_type == VNON || 377 (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { 378 VI_UNLOCK(xvp); 379 MNT_ILOCK(mp); 380 continue; 381 } 382 if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) { 383 MNT_ILOCK(mp); 384 goto loop; 385 } 386#ifdef DEBUG 387 if (snapdebug) 388 vprint("ffs_snapshot: busy vnode", xvp); 389#endif 390 if (VOP_GETATTR(xvp, &vat, l->l_cred, l) == 0 && 391 vat.va_nlink > 0) { 392 VOP_UNLOCK(xvp, 0); 393 MNT_ILOCK(mp); 394 continue; 395 } 396 xp = VTOI(xvp); 397 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { 398 VOP_UNLOCK(xvp, 0); 399 MNT_ILOCK(mp); 400 continue; 401 } 402 /* 403 * If there is a fragment, clear it here. 404 */ 405 blkno = 0; 406 loc = howmany(xp->i_size, fs->fs_bsize) - 1; 407 if (loc < NDADDR) { 408 len = fragroundup(fs, blkoff(fs, xp->i_size)); 409 if (len > 0 && len < fs->fs_bsize) { 410 ffs_blkfree(copy_fs, vp, db_get(xp, loc), 411 len, xp->i_number); 412 blkno = db_get(xp, loc); 413 db_assign(xp, loc, 0); 414 } 415 } 416 snaplistsize += 1; 417 if (xp->i_ump->um_fstype == UFS1) 418 error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, 419 BLK_NOCOPY); 420 else 421 error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, 422 BLK_NOCOPY); 423 if (blkno) 424 db_assign(xp, loc, blkno); 425 if (!error) 426 error = ffs_freefile(copy_fs, vp, xp->i_number, 427 xp->i_mode); 428 VOP_UNLOCK(xvp, 0); 429 if (error) { 430 free(copy_fs->fs_csp, M_UFSMNT); 431 goto out1; 432 } 433 MNT_ILOCK(mp); 434 } 435 MNT_IUNLOCK(mp); 436 /* 437 * If there already exist snapshots on this filesystem, grab a 438 * reference to their shared lock. If this is the first snapshot 439 * on this filesystem, we need to allocate a lock for the snapshots 440 * to share. In either case, acquire the snapshot lock and give 441 * up our original private lock. 442 */ 443 VI_LOCK(devvp); 444 if ((xp = TAILQ_FIRST(&ump->um_snapshots)) != NULL) { 445 struct lock *lkp; 446 447 lkp = ITOV(xp)->v_vnlock; 448 VI_UNLOCK(devvp); 449 VI_LOCK(vp); 450 vp->v_vnlock = lkp; 451 } else { 452 struct lock *lkp; 453 454 VI_UNLOCK(devvp); 455 MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT, 456 M_WAITOK); 457 lockinit(lkp, PVFS, "snaplk", 0, LK_CANRECURSE); 458 VI_LOCK(vp); 459 vp->v_vnlock = lkp; 460 } 461 vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY); 462 transferlockers(&vp->v_lock, vp->v_vnlock); 463 lockmgr(&vp->v_lock, LK_RELEASE, NULL); 464 /* 465 * If this is the first snapshot on this filesystem, then we need 466 * to allocate the space for the list of preallocated snapshot blocks. 467 * This list will be refined below, but this preliminary one will 468 * keep us out of deadlock until the full one is ready. 469 */ 470 if (xp == NULL) { 471 snapblklist = malloc( 472 snaplistsize * sizeof(ufs2_daddr_t), M_UFSMNT, M_WAITOK); 473 blkp = &snapblklist[1]; 474 *blkp++ = lblkno(fs, fs->fs_sblockloc); 475 blkno = fragstoblks(fs, fs->fs_csaddr); 476 for (cg = 0; cg < fs->fs_ncg; cg++) { 477 if (fragstoblks(fs, cgtod(fs, cg)) > blkno) 478 break; 479 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 480 } 481 len = howmany(fs->fs_cssize, fs->fs_bsize); 482 for (loc = 0; loc < len; loc++) 483 *blkp++ = blkno + loc; 484 for (; cg < fs->fs_ncg; cg++) 485 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 486 snapblklist[0] = blkp - snapblklist; 487 VI_LOCK(devvp); 488 if (ump->um_snapblklist != NULL) 489 panic("ffs_snapshot: non-empty list"); 490 ump->um_snapblklist = snapblklist; 491 VI_UNLOCK(devvp); 492 } 493 /* 494 * Record snapshot inode. Since this is the newest snapshot, 495 * it must be placed at the end of the list. 496 */ 497 VI_LOCK(devvp); 498 fs->fs_snapinum[snaploc] = ip->i_number; 499 if (ip->i_nextsnap.tqe_prev != 0) 500 panic("ffs_snapshot: %llu already on list", 501 (unsigned long long)ip->i_number); 502 TAILQ_INSERT_TAIL(&ump->um_snapshots, ip, i_nextsnap); 503 VI_UNLOCK(devvp); 504 if (xp == NULL) 505 vn_cow_establish(devvp, ffs_copyonwrite, devvp); 506 vp->v_flag |= VSYSTEM; 507out1: 508 /* 509 * Resume operation on filesystem. 510 */ 511 vfs_write_resume(vp->v_mount); 512 /* 513 * Set the mtime to the time the snapshot has been taken. 514 */ 515 TIMEVAL_TO_TIMESPEC(&starttime, &ts); 516 if (ctime) 517 *ctime = ts; 518 DIP_ASSIGN(ip, mtime, ts.tv_sec); 519 DIP_ASSIGN(ip, mtimensec, ts.tv_nsec); 520 ip->i_flag |= IN_CHANGE | IN_UPDATE; 521 522#ifdef DEBUG 523 if (starttime.tv_sec > 0) { 524 getmicrotime(&endtime); 525 timersub(&endtime, &starttime, &endtime); 526 printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", 527 vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, 528 endtime.tv_usec / 1000, redo, fs->fs_ncg); 529 } 530#endif 531 if (error) 532 goto out; 533 /* 534 * Copy allocation information from all the snapshots in 535 * this snapshot and then expunge them from its view. 536 */ 537 TAILQ_FOREACH(xp, &ump->um_snapshots, i_nextsnap) { 538 if (xp == ip) 539 break; 540 if (xp->i_ump->um_fstype == UFS1) 541 error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, 542 BLK_SNAP); 543 else 544 error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, 545 BLK_SNAP); 546 if (error) { 547 fs->fs_snapinum[snaploc] = 0; 548 goto done; 549 } 550 } 551 /* 552 * Allocate space for the full list of preallocated snapshot blocks. 553 */ 554 snapblklist = malloc(snaplistsize * sizeof(ufs2_daddr_t), 555 M_UFSMNT, M_WAITOK); 556 ip->i_snapblklist = &snapblklist[1]; 557 /* 558 * Expunge the blocks used by the snapshots from the set of 559 * blocks marked as used in the snapshot bitmaps. Also, collect 560 * the list of allocated blocks in i_snapblklist. 561 */ 562 if (ip->i_ump->um_fstype == UFS1) 563 error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP); 564 else 565 error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP); 566 if (error) { 567 fs->fs_snapinum[snaploc] = 0; 568 FREE(snapblklist, M_UFSMNT); 569 goto done; 570 } 571 if (snaplistsize < ip->i_snapblklist - snapblklist) 572 panic("ffs_snapshot: list too small"); 573 snaplistsize = ip->i_snapblklist - snapblklist; 574 snapblklist[0] = snaplistsize; 575 ip->i_snapblklist = &snapblklist[0]; 576 /* 577 * Write out the list of allocated blocks to the end of the snapshot. 578 */ 579 for (i = 0; i < snaplistsize; i++) 580 snapblklist[i] = ufs_rw64(snapblklist[i], ns); 581 error = vn_rdwr(UIO_WRITE, vp, (caddr_t)snapblklist, 582 snaplistsize*sizeof(ufs2_daddr_t), lblktosize(fs, (off_t)numblks), 583 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL); 584 for (i = 0; i < snaplistsize; i++) 585 snapblklist[i] = ufs_rw64(snapblklist[i], ns); 586 if (error) { 587 fs->fs_snapinum[snaploc] = 0; 588 FREE(snapblklist, M_UFSMNT); 589 goto done; 590 } 591 /* 592 * Write the superblock and its summary information 593 * to the snapshot. 594 */ 595 blkno = fragstoblks(fs, fs->fs_csaddr); 596 len = howmany(fs->fs_cssize, fs->fs_bsize); 597 space = copy_fs->fs_csp; 598#ifdef FFS_EI 599 if (ns) { 600 ffs_sb_swap(copy_fs, copy_fs); 601 ffs_csum_swap(space, space, fs->fs_cssize); 602 } 603#endif 604 for (loc = 0; loc < len; loc++) { 605 error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); 606 if (error) { 607 brelse(nbp); 608 fs->fs_snapinum[snaploc] = 0; 609 FREE(snapblklist, M_UFSMNT); 610 goto done; 611 } 612 bcopy(space, nbp->b_data, fs->fs_bsize); 613 space = (char *)space + fs->fs_bsize; 614 bawrite(nbp); 615 } 616 /* 617 * As this is the newest list, it is the most inclusive, so 618 * should replace the previous list. If this is the first snapshot 619 * free the preliminary list. 620 */ 621 VI_LOCK(devvp); 622 space = ump->um_snapblklist; 623 ump->um_snapblklist = snapblklist; 624 VI_UNLOCK(devvp); 625 if (TAILQ_FIRST(&ump->um_snapshots) == ip) 626 FREE(space, M_UFSMNT); 627done: 628 free(copy_fs->fs_csp, M_UFSMNT); 629 if (!error) { 630 error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, 631 KERNCRED, &nbp); 632 if (error) { 633 brelse(nbp); 634 fs->fs_snapinum[snaploc] = 0; 635 } 636 bcopy(sbbuf, nbp->b_data, fs->fs_bsize); 637 bawrite(nbp); 638 } 639out: 640 /* 641 * Invalidate and free all pages on the snapshot vnode. 642 * All metadata has been written through the buffer cache. 643 * Clean all dirty buffers now to avoid UBC inconsistencies. 644 */ 645 if (!error) { 646 simple_lock(&vp->v_interlock); 647 error = VOP_PUTPAGES(vp, 0, 0, 648 PGO_ALLPAGES|PGO_CLEANIT|PGO_SYNCIO|PGO_FREE); 649 } 650 if (!error) { 651 s = splbio(); 652 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 653 nbp = LIST_NEXT(bp, b_vnbufs); 654 simple_lock(&bp->b_interlock); 655 splx(s); 656 if ((bp->b_flags & (B_DELWRI|B_BUSY)) != B_DELWRI) 657 panic("ffs_snapshot: not dirty or busy, bp %p", 658 bp); 659 bp->b_flags |= B_BUSY|B_VFLUSH; 660 if (LIST_FIRST(&bp->b_dep) == NULL) 661 bp->b_flags |= B_NOCACHE; 662 simple_unlock(&bp->b_interlock); 663 bwrite(bp); 664 s = splbio(); 665 } 666 simple_lock(&global_v_numoutput_slock); 667 while (vp->v_numoutput) { 668 vp->v_flag |= VBWAIT; 669 ltsleep((caddr_t)&vp->v_numoutput, PRIBIO+1, 670 "snapflushbuf", 0, &global_v_numoutput_slock); 671 } 672 simple_unlock(&global_v_numoutput_slock); 673 splx(s); 674 } 675 if (sbbuf) 676 free(sbbuf, M_UFSMNT); 677 if (fs->fs_active != 0) { 678 FREE(fs->fs_active, M_DEVBUF); 679 fs->fs_active = 0; 680 } 681 mp->mnt_flag = flag; 682 if (error) 683 (void) ffs_truncate(vp, (off_t)0, 0, NOCRED, l); 684 else 685 vref(vp); 686 return (error); 687} 688 689/* 690 * Copy a cylinder group map. All the unallocated blocks are marked 691 * BLK_NOCOPY so that the snapshot knows that it need not copy them 692 * if they are later written. If passno is one, then this is a first 693 * pass, so only setting needs to be done. If passno is 2, then this 694 * is a revision to a previous pass which must be undone as the 695 * replacement pass is done. 696 */ 697static int 698cgaccount(int cg, struct vnode *vp, caddr_t data, int passno) 699{ 700 struct buf *bp, *ibp; 701 struct inode *ip; 702 struct cg *cgp; 703 struct fs *fs; 704 ufs2_daddr_t base, numblks; 705 int error, len, loc, ns, indiroff; 706 707 ip = VTOI(vp); 708 fs = ip->i_fs; 709 ns = UFS_FSNEEDSWAP(fs); 710 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), 711 (int)fs->fs_cgsize, KERNCRED, &bp); 712 if (error) { 713 brelse(bp); 714 return (error); 715 } 716 cgp = (struct cg *)bp->b_data; 717 if (!cg_chkmagic(cgp, ns)) { 718 brelse(bp); 719 return (EIO); 720 } 721 ACTIVECG_SET(fs, cg); 722 723 bcopy(bp->b_data, data, fs->fs_cgsize); 724 brelse(bp); 725 if (fs->fs_cgsize < fs->fs_bsize) 726 bzero(&data[fs->fs_cgsize], 727 fs->fs_bsize - fs->fs_cgsize); 728 numblks = howmany(fs->fs_size, fs->fs_frag); 729 len = howmany(fs->fs_fpg, fs->fs_frag); 730 base = cg * fs->fs_fpg / fs->fs_frag; 731 if (base + len >= numblks) 732 len = numblks - base - 1; 733 loc = 0; 734 if (base < NDADDR) { 735 for ( ; loc < NDADDR; loc++) { 736 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 737 db_assign(ip, loc, BLK_NOCOPY); 738 else if (db_get(ip, loc) == BLK_NOCOPY) { 739 if (passno == 2) 740 db_assign(ip, loc, 0); 741 else if (passno == 1) 742 panic("ffs_snapshot: lost direct block"); 743 } 744 } 745 } 746 if ((error = ffs_balloc(vp, lblktosize(fs, (off_t)(base + loc)), 747 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0) 748 return (error); 749 indiroff = (base + loc - NDADDR) % NINDIR(fs); 750 for ( ; loc < len; loc++, indiroff++) { 751 if (indiroff >= NINDIR(fs)) { 752 bawrite(ibp); 753 if ((error = ffs_balloc(vp, 754 lblktosize(fs, (off_t)(base + loc)), 755 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0) 756 return (error); 757 indiroff = 0; 758 } 759 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 760 idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY); 761 else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) { 762 if (passno == 2) 763 idb_assign(ip, ibp->b_data, indiroff, 0); 764 else if (passno == 1) 765 panic("ffs_snapshot: lost indirect block"); 766 } 767 } 768 bdwrite(ibp); 769 return (0); 770} 771 772/* 773 * Before expunging a snapshot inode, note all the 774 * blocks that it claims with BLK_SNAP so that fsck will 775 * be able to account for those blocks properly and so 776 * that this snapshot knows that it need not copy them 777 * if the other snapshot holding them is freed. This code 778 * is reproduced once each for UFS1 and UFS2. 779 */ 780static int 781expunge_ufs1(struct vnode *snapvp, struct inode *cancelip, struct fs *fs, 782 int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 783 struct fs *, ufs_lbn_t, int), 784 int expungetype) 785{ 786 int i, s, error, ns, indiroff; 787 ufs_lbn_t lbn, rlbn; 788 ufs2_daddr_t len, blkno, numblks, blksperindir; 789 struct ufs1_dinode *dip; 790 struct buf *bp; 791 caddr_t bf; 792 793 ns = UFS_FSNEEDSWAP(fs); 794 /* 795 * Prepare to expunge the inode. If its inode block has not 796 * yet been copied, then allocate and fill the copy. 797 */ 798 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 799 blkno = 0; 800 if (lbn < NDADDR) { 801 blkno = db_get(VTOI(snapvp), lbn); 802 } else { 803 s = cow_enter(); 804 error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn), 805 fs->fs_bsize, KERNCRED, B_METAONLY, &bp); 806 cow_leave(s); 807 if (error) 808 return (error); 809 indiroff = (lbn - NDADDR) % NINDIR(fs); 810 blkno = idb_get(VTOI(snapvp), bp->b_data, indiroff); 811 brelse(bp); 812 } 813 bf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 814 if (blkno != 0) 815 error = readvnblk(snapvp, bf, lbn); 816 else 817 error = readfsblk(snapvp, bf, lbn); 818 if (error) { 819 free(bf, M_UFSMNT); 820 return error; 821 } 822 /* 823 * Set a snapshot inode to be a zero length file, regular files 824 * to be completely unallocated. 825 */ 826 dip = (struct ufs1_dinode *)bf + ino_to_fsbo(fs, cancelip->i_number); 827 if (expungetype == BLK_NOCOPY) 828 dip->di_mode = 0; 829 dip->di_size = 0; 830 dip->di_blocks = 0; 831 dip->di_flags = 832 ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns); 833 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t)); 834 error = writevnblk(snapvp, bf, lbn); 835 free(bf, M_UFSMNT); 836 if (error) 837 return error; 838 /* 839 * Now go through and expunge all the blocks in the file 840 * using the function requested. 841 */ 842 numblks = howmany(cancelip->i_size, fs->fs_bsize); 843 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_db[0], 844 &cancelip->i_ffs1_db[NDADDR], fs, 0, expungetype))) 845 return (error); 846 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_ib[0], 847 &cancelip->i_ffs1_ib[NIADDR], fs, -1, expungetype))) 848 return (error); 849 blksperindir = 1; 850 lbn = -NDADDR; 851 len = numblks - NDADDR; 852 rlbn = NDADDR; 853 for (i = 0; len > 0 && i < NIADDR; i++) { 854 error = indiracct_ufs1(snapvp, ITOV(cancelip), i, 855 ufs_rw32(cancelip->i_ffs1_ib[i], ns), lbn, rlbn, len, 856 blksperindir, fs, acctfunc, expungetype); 857 if (error) 858 return (error); 859 blksperindir *= NINDIR(fs); 860 lbn -= blksperindir + 1; 861 len -= blksperindir; 862 rlbn += blksperindir; 863 } 864 return (0); 865} 866 867/* 868 * Descend an indirect block chain for vnode cancelvp accounting for all 869 * its indirect blocks in snapvp. 870 */ 871static int 872indiracct_ufs1(struct vnode *snapvp, struct vnode *cancelvp, int level, 873 ufs1_daddr_t blkno, ufs_lbn_t lbn, ufs_lbn_t rlbn, ufs_lbn_t remblks, 874 ufs_lbn_t blksperindir, struct fs *fs, 875 int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 876 struct fs *, ufs_lbn_t, int), 877 int expungetype) 878{ 879 int error, ns, num, i; 880 ufs_lbn_t subblksperindir; 881 struct indir indirs[NIADDR + 2]; 882 ufs1_daddr_t last, *bap; 883 struct buf *bp; 884 885 ns = UFS_FSNEEDSWAP(fs); 886 887 if (blkno == 0) { 888 if (expungetype == BLK_NOCOPY) 889 return (0); 890 panic("indiracct_ufs1: missing indir"); 891 } 892 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 893 return (error); 894 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 895 panic("indiracct_ufs1: botched params"); 896 /* 897 * We have to expand bread here since it will deadlock looking 898 * up the block number for any blocks that are not in the cache. 899 */ 900 bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0); 901 bp->b_blkno = fsbtodb(fs, blkno); 902 if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && 903 (error = readfsblk(bp->b_vp, bp->b_data, fragstoblks(fs, blkno)))) { 904 brelse(bp); 905 return (error); 906 } 907 /* 908 * Account for the block pointers in this indirect block. 909 */ 910 last = howmany(remblks, blksperindir); 911 if (last > NINDIR(fs)) 912 last = NINDIR(fs); 913 bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); 914 bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 915 brelse(bp); 916 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 917 level == 0 ? rlbn : -1, expungetype); 918 if (error || level == 0) 919 goto out; 920 /* 921 * Account for the block pointers in each of the indirect blocks 922 * in the levels below us. 923 */ 924 subblksperindir = blksperindir / NINDIR(fs); 925 for (lbn++, level--, i = 0; i < last; i++) { 926 error = indiracct_ufs1(snapvp, cancelvp, level, 927 ufs_rw32(bap[i], ns), lbn, rlbn, remblks, subblksperindir, 928 fs, acctfunc, expungetype); 929 if (error) 930 goto out; 931 rlbn += blksperindir; 932 lbn -= blksperindir; 933 remblks -= blksperindir; 934 } 935out: 936 FREE(bap, M_DEVBUF); 937 return (error); 938} 939 940/* 941 * Do both snap accounting and map accounting. 942 */ 943static int 944fullacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp, 945 struct fs *fs, ufs_lbn_t lblkno, 946 int exptype /* BLK_SNAP or BLK_NOCOPY */) 947{ 948 int error; 949 950 if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 951 return (error); 952 return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 953} 954 955/* 956 * Identify a set of blocks allocated in a snapshot inode. 957 */ 958static int 959snapacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp, 960 struct fs *fs, ufs_lbn_t lblkno, 961 int expungetype /* BLK_SNAP or BLK_NOCOPY */) 962{ 963 struct inode *ip = VTOI(vp); 964 ufs1_daddr_t blkno, *blkp; 965 ufs_lbn_t lbn; 966 struct buf *ibp; 967 int error, ns; 968 969 ns = UFS_FSNEEDSWAP(fs); 970 971 for ( ; oldblkp < lastblkp; oldblkp++) { 972 blkno = ufs_rw32(*oldblkp, ns); 973 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 974 continue; 975 lbn = fragstoblks(fs, blkno); 976 if (lbn < NDADDR) { 977 blkp = &ip->i_ffs1_db[lbn]; 978 ip->i_flag |= IN_CHANGE | IN_UPDATE; 979 } else { 980 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), 981 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 982 if (error) 983 return (error); 984 blkp = &((ufs1_daddr_t *)(ibp->b_data)) 985 [(lbn - NDADDR) % NINDIR(fs)]; 986 } 987 /* 988 * If we are expunging a snapshot vnode and we 989 * find a block marked BLK_NOCOPY, then it is 990 * one that has been allocated to this snapshot after 991 * we took our current snapshot and can be ignored. 992 */ 993 blkno = ufs_rw32(*blkp, ns); 994 if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) { 995 if (lbn >= NDADDR) 996 brelse(ibp); 997 } else { 998 if (blkno != 0) 999 panic("snapacct_ufs1: bad block"); 1000 *blkp = ufs_rw32(expungetype, ns); 1001 if (lbn >= NDADDR) 1002 bdwrite(ibp); 1003 } 1004 } 1005 return (0); 1006} 1007 1008/* 1009 * Account for a set of blocks allocated in a snapshot inode. 1010 */ 1011static int 1012mapacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp, 1013 struct fs *fs, ufs_lbn_t lblkno, int expungetype) 1014{ 1015 ufs1_daddr_t blkno; 1016 struct inode *ip; 1017 ino_t inum; 1018 int acctit, ns; 1019 1020 ns = UFS_FSNEEDSWAP(fs); 1021 ip = VTOI(vp); 1022 inum = ip->i_number; 1023 if (lblkno == -1) 1024 acctit = 0; 1025 else 1026 acctit = 1; 1027 for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { 1028 blkno = ufs_rw32(*oldblkp, ns); 1029 if (blkno == 0 || blkno == BLK_NOCOPY) 1030 continue; 1031 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) 1032 *ip->i_snapblklist++ = lblkno; 1033 if (blkno == BLK_SNAP) 1034 blkno = blkstofrags(fs, lblkno); 1035 ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum); 1036 } 1037 return (0); 1038} 1039 1040/* 1041 * Before expunging a snapshot inode, note all the 1042 * blocks that it claims with BLK_SNAP so that fsck will 1043 * be able to account for those blocks properly and so 1044 * that this snapshot knows that it need not copy them 1045 * if the other snapshot holding them is freed. This code 1046 * is reproduced once each for UFS1 and UFS2. 1047 */ 1048static int 1049expunge_ufs2(struct vnode *snapvp, struct inode *cancelip, struct fs *fs, 1050 int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 1051 struct fs *, ufs_lbn_t, int), 1052 int expungetype) 1053{ 1054 int i, s, error, ns, indiroff; 1055 ufs_lbn_t lbn, rlbn; 1056 ufs2_daddr_t len, blkno, numblks, blksperindir; 1057 struct ufs2_dinode *dip; 1058 struct buf *bp; 1059 caddr_t bf; 1060 1061 ns = UFS_FSNEEDSWAP(fs); 1062 /* 1063 * Prepare to expunge the inode. If its inode block has not 1064 * yet been copied, then allocate and fill the copy. 1065 */ 1066 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 1067 blkno = 0; 1068 if (lbn < NDADDR) { 1069 blkno = db_get(VTOI(snapvp), lbn); 1070 } else { 1071 s = cow_enter(); 1072 error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn), 1073 fs->fs_bsize, KERNCRED, B_METAONLY, &bp); 1074 cow_leave(s); 1075 if (error) 1076 return (error); 1077 indiroff = (lbn - NDADDR) % NINDIR(fs); 1078 blkno = idb_get(VTOI(snapvp), bp->b_data, indiroff); 1079 brelse(bp); 1080 } 1081 bf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 1082 if (blkno != 0) 1083 error = readvnblk(snapvp, bf, lbn); 1084 else 1085 error = readfsblk(snapvp, bf, lbn); 1086 if (error) { 1087 free(bf, M_UFSMNT); 1088 return error; 1089 } 1090 /* 1091 * Set a snapshot inode to be a zero length file, regular files 1092 * to be completely unallocated. 1093 */ 1094 dip = (struct ufs2_dinode *)bf + ino_to_fsbo(fs, cancelip->i_number); 1095 if (expungetype == BLK_NOCOPY) 1096 dip->di_mode = 0; 1097 dip->di_size = 0; 1098 dip->di_blocks = 0; 1099 dip->di_flags = 1100 ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns); 1101 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t)); 1102 error = writevnblk(snapvp, bf, lbn); 1103 free(bf, M_UFSMNT); 1104 if (error) 1105 return error; 1106 /* 1107 * Now go through and expunge all the blocks in the file 1108 * using the function requested. 1109 */ 1110 numblks = howmany(cancelip->i_size, fs->fs_bsize); 1111 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_db[0], 1112 &cancelip->i_ffs2_db[NDADDR], fs, 0, expungetype))) 1113 return (error); 1114 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_ib[0], 1115 &cancelip->i_ffs2_ib[NIADDR], fs, -1, expungetype))) 1116 return (error); 1117 blksperindir = 1; 1118 lbn = -NDADDR; 1119 len = numblks - NDADDR; 1120 rlbn = NDADDR; 1121 for (i = 0; len > 0 && i < NIADDR; i++) { 1122 error = indiracct_ufs2(snapvp, ITOV(cancelip), i, 1123 ufs_rw64(cancelip->i_ffs2_ib[i], ns), lbn, rlbn, len, 1124 blksperindir, fs, acctfunc, expungetype); 1125 if (error) 1126 return (error); 1127 blksperindir *= NINDIR(fs); 1128 lbn -= blksperindir + 1; 1129 len -= blksperindir; 1130 rlbn += blksperindir; 1131 } 1132 return (0); 1133} 1134 1135/* 1136 * Descend an indirect block chain for vnode cancelvp accounting for all 1137 * its indirect blocks in snapvp. 1138 */ 1139static int 1140indiracct_ufs2(struct vnode *snapvp, struct vnode *cancelvp, int level, 1141 ufs2_daddr_t blkno, ufs_lbn_t lbn, ufs_lbn_t rlbn, ufs_lbn_t remblks, 1142 ufs_lbn_t blksperindir, struct fs *fs, 1143 int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 1144 struct fs *, ufs_lbn_t, int), 1145 int expungetype) 1146{ 1147 int error, ns, num, i; 1148 ufs_lbn_t subblksperindir; 1149 struct indir indirs[NIADDR + 2]; 1150 ufs2_daddr_t last, *bap; 1151 struct buf *bp; 1152 1153 ns = UFS_FSNEEDSWAP(fs); 1154 1155 if (blkno == 0) { 1156 if (expungetype == BLK_NOCOPY) 1157 return (0); 1158 panic("indiracct_ufs2: missing indir"); 1159 } 1160 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 1161 return (error); 1162 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 1163 panic("indiracct_ufs2: botched params"); 1164 /* 1165 * We have to expand bread here since it will deadlock looking 1166 * up the block number for any blocks that are not in the cache. 1167 */ 1168 bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0); 1169 bp->b_blkno = fsbtodb(fs, blkno); 1170 if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && 1171 (error = readfsblk(bp->b_vp, bp->b_data, fragstoblks(fs, blkno)))) { 1172 brelse(bp); 1173 return (error); 1174 } 1175 /* 1176 * Account for the block pointers in this indirect block. 1177 */ 1178 last = howmany(remblks, blksperindir); 1179 if (last > NINDIR(fs)) 1180 last = NINDIR(fs); 1181 bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); 1182 bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 1183 brelse(bp); 1184 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 1185 level == 0 ? rlbn : -1, expungetype); 1186 if (error || level == 0) 1187 goto out; 1188 /* 1189 * Account for the block pointers in each of the indirect blocks 1190 * in the levels below us. 1191 */ 1192 subblksperindir = blksperindir / NINDIR(fs); 1193 for (lbn++, level--, i = 0; i < last; i++) { 1194 error = indiracct_ufs2(snapvp, cancelvp, level, 1195 ufs_rw64(bap[i], ns), lbn, rlbn, remblks, subblksperindir, 1196 fs, acctfunc, expungetype); 1197 if (error) 1198 goto out; 1199 rlbn += blksperindir; 1200 lbn -= blksperindir; 1201 remblks -= blksperindir; 1202 } 1203out: 1204 FREE(bap, M_DEVBUF); 1205 return (error); 1206} 1207 1208/* 1209 * Do both snap accounting and map accounting. 1210 */ 1211static int 1212fullacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp, 1213 struct fs *fs, ufs_lbn_t lblkno, 1214 int exptype /* BLK_SNAP or BLK_NOCOPY */) 1215{ 1216 int error; 1217 1218 if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 1219 return (error); 1220 return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 1221} 1222 1223/* 1224 * Identify a set of blocks allocated in a snapshot inode. 1225 */ 1226static int 1227snapacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp, 1228 struct fs *fs, ufs_lbn_t lblkno, 1229 int expungetype /* BLK_SNAP or BLK_NOCOPY */) 1230{ 1231 struct inode *ip = VTOI(vp); 1232 ufs2_daddr_t blkno, *blkp; 1233 ufs_lbn_t lbn; 1234 struct buf *ibp; 1235 int error, ns; 1236 1237 ns = UFS_FSNEEDSWAP(fs); 1238 1239 for ( ; oldblkp < lastblkp; oldblkp++) { 1240 blkno = ufs_rw64(*oldblkp, ns); 1241 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 1242 continue; 1243 lbn = fragstoblks(fs, blkno); 1244 if (lbn < NDADDR) { 1245 blkp = &ip->i_ffs2_db[lbn]; 1246 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1247 } else { 1248 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), 1249 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 1250 if (error) 1251 return (error); 1252 blkp = &((ufs2_daddr_t *)(ibp->b_data)) 1253 [(lbn - NDADDR) % NINDIR(fs)]; 1254 } 1255 /* 1256 * If we are expunging a snapshot vnode and we 1257 * find a block marked BLK_NOCOPY, then it is 1258 * one that has been allocated to this snapshot after 1259 * we took our current snapshot and can be ignored. 1260 */ 1261 blkno = ufs_rw64(*blkp, ns); 1262 if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) { 1263 if (lbn >= NDADDR) 1264 brelse(ibp); 1265 } else { 1266 if (blkno != 0) 1267 panic("snapacct_ufs2: bad block"); 1268 *blkp = ufs_rw64(expungetype, ns); 1269 if (lbn >= NDADDR) 1270 bdwrite(ibp); 1271 } 1272 } 1273 return (0); 1274} 1275 1276/* 1277 * Account for a set of blocks allocated in a snapshot inode. 1278 */ 1279static int 1280mapacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp, 1281 struct fs *fs, ufs_lbn_t lblkno, int expungetype) 1282{ 1283 ufs2_daddr_t blkno; 1284 struct inode *ip; 1285 ino_t inum; 1286 int acctit, ns; 1287 1288 ns = UFS_FSNEEDSWAP(fs); 1289 ip = VTOI(vp); 1290 inum = ip->i_number; 1291 if (lblkno == -1) 1292 acctit = 0; 1293 else 1294 acctit = 1; 1295 for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { 1296 blkno = ufs_rw64(*oldblkp, ns); 1297 if (blkno == 0 || blkno == BLK_NOCOPY) 1298 continue; 1299 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) 1300 *ip->i_snapblklist++ = lblkno; 1301 if (blkno == BLK_SNAP) 1302 blkno = blkstofrags(fs, lblkno); 1303 ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum); 1304 } 1305 return (0); 1306} 1307#endif /* defined(FFS_NO_SNAPSHOT) */ 1308 1309/* 1310 * Decrement extra reference on snapshot when last name is removed. 1311 * It will not be freed until the last open reference goes away. 1312 */ 1313void 1314ffs_snapgone(struct inode *ip) 1315{ 1316 struct ufsmount *ump = VFSTOUFS(ip->i_devvp->v_specmountpoint); 1317 struct inode *xp; 1318 struct fs *fs; 1319 int snaploc; 1320 1321 /* 1322 * Find snapshot in incore list. 1323 */ 1324 TAILQ_FOREACH(xp, &ump->um_snapshots, i_nextsnap) 1325 if (xp == ip) 1326 break; 1327 if (xp != NULL) 1328 vrele(ITOV(ip)); 1329#ifdef DEBUG 1330 else if (snapdebug) 1331 printf("ffs_snapgone: lost snapshot vnode %llu\n", 1332 (unsigned long long)ip->i_number); 1333#endif 1334 /* 1335 * Delete snapshot inode from superblock. Keep list dense. 1336 */ 1337 fs = ip->i_fs; 1338 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 1339 if (fs->fs_snapinum[snaploc] == ip->i_number) 1340 break; 1341 if (snaploc < FSMAXSNAP) { 1342 for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { 1343 if (fs->fs_snapinum[snaploc] == 0) 1344 break; 1345 fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; 1346 } 1347 fs->fs_snapinum[snaploc - 1] = 0; 1348 } 1349} 1350 1351/* 1352 * Prepare a snapshot file for being removed. 1353 */ 1354void 1355ffs_snapremove(struct vnode *vp) 1356{ 1357 struct inode *ip = VTOI(vp), *xp; 1358 struct vnode *devvp = ip->i_devvp; 1359 struct fs *fs = ip->i_fs; 1360 struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint); 1361 struct lock *lkp; 1362 struct buf *ibp; 1363 ufs2_daddr_t numblks, blkno, dblk; 1364 int error, ns, loc, last; 1365 1366 ns = UFS_FSNEEDSWAP(fs); 1367 /* 1368 * If active, delete from incore list (this snapshot may 1369 * already have been in the process of being deleted, so 1370 * would not have been active). 1371 * 1372 * Clear copy-on-write flag if last snapshot. 1373 */ 1374 if (ip->i_nextsnap.tqe_prev != 0) { 1375 VI_LOCK(devvp); 1376 lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE, 1377 VI_MTX(devvp)); 1378 VI_LOCK(devvp); 1379 TAILQ_REMOVE(&ump->um_snapshots, ip, i_nextsnap); 1380 ip->i_nextsnap.tqe_prev = 0; 1381 lkp = vp->v_vnlock; 1382 vp->v_vnlock = &vp->v_lock; 1383 lockmgr(lkp, LK_RELEASE, NULL); 1384 if (TAILQ_FIRST(&ump->um_snapshots) != 0) { 1385 /* Roll back the list of preallocated blocks. */ 1386 xp = TAILQ_LAST(&ump->um_snapshots, inodelst); 1387 ump->um_snapblklist = xp->i_snapblklist; 1388 VI_UNLOCK(devvp); 1389 } else { 1390 ump->um_snapblklist = 0; 1391 lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp)); 1392 lockmgr(lkp, LK_RELEASE, NULL); 1393 vn_cow_disestablish(devvp, ffs_copyonwrite, devvp); 1394 FREE(lkp, M_UFSMNT); 1395 } 1396 FREE(ip->i_snapblklist, M_UFSMNT); 1397 ip->i_snapblklist = NULL; 1398 } 1399 /* 1400 * Clear all BLK_NOCOPY fields. Pass any block claims to other 1401 * snapshots that want them (see ffs_snapblkfree below). 1402 */ 1403 for (blkno = 1; blkno < NDADDR; blkno++) { 1404 dblk = db_get(ip, blkno); 1405 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 1406 db_assign(ip, blkno, 0); 1407 else if ((dblk == blkstofrags(fs, blkno) && 1408 ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, 1409 ip->i_number))) { 1410 DIP_ADD(ip, blocks, -btodb(fs->fs_bsize)); 1411 db_assign(ip, blkno, 0); 1412 } 1413 } 1414 numblks = howmany(ip->i_size, fs->fs_bsize); 1415 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 1416 error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno), 1417 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 1418 if (error) 1419 continue; 1420 if (fs->fs_size - blkno > NINDIR(fs)) 1421 last = NINDIR(fs); 1422 else 1423 last = fs->fs_size - blkno; 1424 for (loc = 0; loc < last; loc++) { 1425 dblk = idb_get(ip, ibp->b_data, loc); 1426 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 1427 idb_assign(ip, ibp->b_data, loc, 0); 1428 else if (dblk == blkstofrags(fs, blkno) && 1429 ffs_snapblkfree(fs, ip->i_devvp, dblk, 1430 fs->fs_bsize, ip->i_number)) { 1431 DIP_ADD(ip, blocks, -btodb(fs->fs_bsize)); 1432 idb_assign(ip, ibp->b_data, loc, 0); 1433 } 1434 } 1435 bawrite(ibp); 1436 } 1437 /* 1438 * Clear snapshot flag and drop reference. 1439 */ 1440 ip->i_flags &= ~SF_SNAPSHOT; 1441 DIP_ASSIGN(ip, flags, ip->i_flags); 1442 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1443} 1444 1445/* 1446 * Notification that a block is being freed. Return zero if the free 1447 * should be allowed to proceed. Return non-zero if the snapshot file 1448 * wants to claim the block. The block will be claimed if it is an 1449 * uncopied part of one of the snapshots. It will be freed if it is 1450 * either a BLK_NOCOPY or has already been copied in all of the snapshots. 1451 * If a fragment is being freed, then all snapshots that care about 1452 * it must make a copy since a snapshot file can only claim full sized 1453 * blocks. Note that if more than one snapshot file maps the block, 1454 * we can pick one at random to claim it. Since none of the snapshots 1455 * can change, we are assurred that they will all see the same unmodified 1456 * image. When deleting a snapshot file (see ffs_snapremove above), we 1457 * must push any of these claimed blocks to one of the other snapshots 1458 * that maps it. These claimed blocks are easily identified as they will 1459 * have a block number equal to their logical block number within the 1460 * snapshot. A copied block can never have this property because they 1461 * must always have been allocated from a BLK_NOCOPY location. 1462 */ 1463int 1464ffs_snapblkfree(struct fs *fs, struct vnode *devvp, ufs2_daddr_t bno, 1465 long size, ino_t inum) 1466{ 1467 struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint); 1468 struct buf *ibp; 1469 struct inode *ip; 1470 struct vnode *vp = NULL, *saved_vp = NULL; 1471 caddr_t saved_data = NULL; 1472 ufs_lbn_t lbn; 1473 ufs2_daddr_t blkno; 1474 int s, indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0; 1475 1476 lbn = fragstoblks(fs, bno); 1477retry: 1478 VI_LOCK(devvp); 1479 TAILQ_FOREACH(ip, &ump->um_snapshots, i_nextsnap) { 1480 vp = ITOV(ip); 1481 /* 1482 * Lookup block being written. 1483 */ 1484 if (lbn < NDADDR) { 1485 blkno = db_get(ip, lbn); 1486 } else { 1487 if (snapshot_locked == 0 && 1488 lockmgr(vp->v_vnlock, 1489 LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 1490 VI_MTX(devvp)) != 0) 1491 goto retry; 1492 snapshot_locked = 1; 1493 s = cow_enter(); 1494 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), 1495 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 1496 cow_leave(s); 1497 if (error) 1498 break; 1499 indiroff = (lbn - NDADDR) % NINDIR(fs); 1500 blkno = idb_get(ip, ibp->b_data, indiroff); 1501 } 1502 /* 1503 * Check to see if block needs to be copied. 1504 */ 1505 if (blkno == 0) { 1506 /* 1507 * A block that we map is being freed. If it has not 1508 * been claimed yet, we will claim or copy it (below). 1509 */ 1510 claimedblk = 1; 1511 } else if (blkno == BLK_SNAP) { 1512 /* 1513 * No previous snapshot claimed the block, 1514 * so it will be freed and become a BLK_NOCOPY 1515 * (don't care) for us. 1516 */ 1517 if (claimedblk) 1518 panic("snapblkfree: inconsistent block type"); 1519 if (snapshot_locked == 0 && 1520 lockmgr(vp->v_vnlock, 1521 LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT, 1522 VI_MTX(devvp)) != 0) { 1523#if 0 /* CID-2949: dead code */ 1524 if (lbn >= NDADDR) 1525 brelse(ibp); 1526#endif 1527 vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL); 1528 goto retry; 1529 } 1530 snapshot_locked = 1; 1531 if (lbn < NDADDR) { 1532 db_assign(ip, lbn, BLK_NOCOPY); 1533 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1534 } else { 1535 idb_assign(ip, ibp->b_data, indiroff, 1536 BLK_NOCOPY); 1537 bwrite(ibp); 1538 } 1539 continue; 1540 } else /* BLK_NOCOPY or default */ { 1541 /* 1542 * If the snapshot has already copied the block 1543 * (default), or does not care about the block, 1544 * it is not needed. 1545 */ 1546 if (lbn >= NDADDR) 1547 brelse(ibp); 1548 continue; 1549 } 1550 /* 1551 * If this is a full size block, we will just grab it 1552 * and assign it to the snapshot inode. Otherwise we 1553 * will proceed to copy it. See explanation for this 1554 * routine as to why only a single snapshot needs to 1555 * claim this block. 1556 */ 1557 if (snapshot_locked == 0 && 1558 lockmgr(vp->v_vnlock, 1559 LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT, 1560 VI_MTX(devvp)) != 0) { 1561 vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL); 1562 goto retry; 1563 } 1564 snapshot_locked = 1; 1565 if (size == fs->fs_bsize) { 1566#ifdef DEBUG 1567 if (snapdebug) 1568 printf("%s %llu lbn %" PRId64 1569 "from inum %llu\n", 1570 "Grabonremove: snapino", 1571 (unsigned long long)ip->i_number, 1572 lbn, (unsigned long long)inum); 1573#endif 1574 if (lbn < NDADDR) { 1575 db_assign(ip, lbn, bno); 1576 } else { 1577 idb_assign(ip, ibp->b_data, indiroff, bno); 1578 bwrite(ibp); 1579 } 1580 DIP_ADD(ip, blocks, btodb(size)); 1581 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1582 VOP_UNLOCK(vp, 0); 1583 return (1); 1584 } 1585 if (lbn >= NDADDR) 1586 brelse(ibp); 1587#ifdef DEBUG 1588 if (snapdebug) 1589 printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n", 1590 "Copyonremove: snapino ", 1591 (unsigned long long)ip->i_number, 1592 lbn, "for inum", (unsigned long long)inum, size); 1593#endif 1594 /* 1595 * If we have already read the old block contents, then 1596 * simply copy them to the new block. Note that we need 1597 * to synchronously write snapshots that have not been 1598 * unlinked, and hence will be visible after a crash, 1599 * to ensure their integrity. 1600 */ 1601 if (saved_data) { 1602 error = writevnblk(vp, saved_data, lbn); 1603 if (error) 1604 break; 1605 continue; 1606 } 1607 /* 1608 * Otherwise, read the old block contents into the buffer. 1609 */ 1610 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 1611 saved_vp = vp; 1612 if ((error = readfsblk(vp, saved_data, lbn)) != 0) { 1613 free(saved_data, M_UFSMNT); 1614 saved_data = NULL; 1615 break; 1616 } 1617 } 1618 /* 1619 * Note that we need to synchronously write snapshots that 1620 * have not been unlinked, and hence will be visible after 1621 * a crash, to ensure their integrity. 1622 */ 1623 if (saved_data) { 1624 error = writevnblk(saved_vp, saved_data, lbn); 1625 free(saved_data, M_UFSMNT); 1626 } 1627 /* 1628 * If we have been unable to allocate a block in which to do 1629 * the copy, then return non-zero so that the fragment will 1630 * not be freed. Although space will be lost, the snapshot 1631 * will stay consistent. 1632 */ 1633 if (snapshot_locked) 1634 VOP_UNLOCK(vp, 0); 1635 else 1636 VI_UNLOCK(devvp); 1637 return (error); 1638} 1639 1640/* 1641 * Associate snapshot files when mounting. 1642 */ 1643void 1644ffs_snapshot_mount(struct mount *mp) 1645{ 1646 struct ufsmount *ump = VFSTOUFS(mp); 1647 struct vnode *devvp = ump->um_devvp; 1648 struct fs *fs = ump->um_fs; 1649 struct lwp *l = curlwp; 1650 struct vnode *vp; 1651 struct inode *ip, *xp; 1652 ufs2_daddr_t snaplistsize, *snapblklist; 1653 int i, error, ns, snaploc, loc; 1654 1655 ns = UFS_FSNEEDSWAP(fs); 1656 /* 1657 * XXX The following needs to be set before ffs_truncate or 1658 * VOP_READ can be called. 1659 */ 1660 mp->mnt_stat.f_iosize = fs->fs_bsize; 1661 /* 1662 * Process each snapshot listed in the superblock. 1663 */ 1664 vp = NULL; 1665 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { 1666 if (fs->fs_snapinum[snaploc] == 0) 1667 break; 1668 if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], 1669 &vp)) != 0) { 1670 printf("ffs_snapshot_mount: vget failed %d\n", error); 1671 continue; 1672 } 1673 ip = VTOI(vp); 1674 if ((ip->i_flags & SF_SNAPSHOT) == 0) { 1675 printf("ffs_snapshot_mount: non-snapshot inode %d\n", 1676 fs->fs_snapinum[snaploc]); 1677 vput(vp); 1678 vp = NULL; 1679 for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { 1680 if (fs->fs_snapinum[loc] == 0) 1681 break; 1682 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; 1683 } 1684 fs->fs_snapinum[loc - 1] = 0; 1685 snaploc--; 1686 continue; 1687 } 1688 1689 /* 1690 * Read the block hints list. Use an empty list on 1691 * read errors. 1692 */ 1693 error = vn_rdwr(UIO_READ, vp, 1694 (caddr_t)&snaplistsize, sizeof(snaplistsize), 1695 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), 1696 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, 1697 l->l_cred, NULL, NULL); 1698 if (error) { 1699 printf("ffs_snapshot_mount: read_1 failed %d\n", error); 1700 snaplistsize = 1; 1701 } else 1702 snaplistsize = ufs_rw64(snaplistsize, ns); 1703 snapblklist = malloc( 1704 snaplistsize * sizeof(ufs2_daddr_t), M_UFSMNT, M_WAITOK); 1705 if (error) 1706 snapblklist[0] = 1; 1707 else { 1708 error = vn_rdwr(UIO_READ, vp, (caddr_t)snapblklist, 1709 snaplistsize * sizeof(ufs2_daddr_t), 1710 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), 1711 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, 1712 l->l_cred, NULL, NULL); 1713 for (i = 0; i < snaplistsize; i++) 1714 snapblklist[i] = ufs_rw64(snapblklist[i], ns); 1715 if (error) { 1716 printf("ffs_snapshot_mount: read_2 failed %d\n", 1717 error); 1718 snapblklist[0] = 1; 1719 } 1720 } 1721 ip->i_snapblklist = &snapblklist[0]; 1722 1723 /* 1724 * If there already exist snapshots on this filesystem, grab a 1725 * reference to their shared lock. If this is the first snapshot 1726 * on this filesystem, we need to allocate a lock for the 1727 * snapshots to share. In either case, acquire the snapshot 1728 * lock and give up our original private lock. 1729 */ 1730 VI_LOCK(devvp); 1731 if ((xp = TAILQ_FIRST(&ump->um_snapshots)) != NULL) { 1732 struct lock *lkp; 1733 1734 lkp = ITOV(xp)->v_vnlock; 1735 VI_UNLOCK(devvp); 1736 VI_LOCK(vp); 1737 vp->v_vnlock = lkp; 1738 } else { 1739 struct lock *lkp; 1740 1741 VI_UNLOCK(devvp); 1742 MALLOC(lkp, struct lock *, sizeof(struct lock), 1743 M_UFSMNT, M_WAITOK); 1744 lockinit(lkp, PVFS, "snaplk", 0, LK_CANRECURSE); 1745 VI_LOCK(vp); 1746 vp->v_vnlock = lkp; 1747 } 1748 vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY); 1749 transferlockers(&vp->v_lock, vp->v_vnlock); 1750 lockmgr(&vp->v_lock, LK_RELEASE, NULL); 1751 /* 1752 * Link it onto the active snapshot list. 1753 */ 1754 VI_LOCK(devvp); 1755 if (ip->i_nextsnap.tqe_prev != 0) 1756 panic("ffs_snapshot_mount: %llu already on list", 1757 (unsigned long long)ip->i_number); 1758 else 1759 TAILQ_INSERT_TAIL(&ump->um_snapshots, ip, i_nextsnap); 1760 vp->v_flag |= VSYSTEM; 1761 VI_UNLOCK(devvp); 1762 VOP_UNLOCK(vp, 0); 1763 } 1764 /* 1765 * No usable snapshots found. 1766 */ 1767 if (vp == NULL) 1768 return; 1769 /* 1770 * Attach the block hints list. We always want to 1771 * use the list from the newest snapshot. 1772 */ 1773 xp = TAILQ_LAST(&ump->um_snapshots, inodelst); 1774 VI_LOCK(devvp); 1775 ump->um_snapblklist = xp->i_snapblklist; 1776 VI_UNLOCK(devvp); 1777 vn_cow_establish(devvp, ffs_copyonwrite, devvp); 1778} 1779 1780/* 1781 * Disassociate snapshot files when unmounting. 1782 */ 1783void 1784ffs_snapshot_unmount(struct mount *mp) 1785{ 1786 struct ufsmount *ump = VFSTOUFS(mp); 1787 struct vnode *devvp = ump->um_devvp; 1788 struct lock *lkp = NULL; 1789 struct inode *xp; 1790 struct vnode *vp; 1791 1792 VI_LOCK(devvp); 1793 while ((xp = TAILQ_FIRST(&ump->um_snapshots)) != 0) { 1794 vp = ITOV(xp); 1795 lkp = vp->v_vnlock; 1796 vp->v_vnlock = &vp->v_lock; 1797 TAILQ_REMOVE(&ump->um_snapshots, xp, i_nextsnap); 1798 xp->i_nextsnap.tqe_prev = 0; 1799 if (xp->i_snapblklist == ump->um_snapblklist) 1800 ump->um_snapblklist = NULL; 1801 VI_UNLOCK(devvp); 1802 FREE(xp->i_snapblklist, M_UFSMNT); 1803 if (xp->i_ffs_effnlink > 0) 1804 vrele(vp); 1805 VI_LOCK(devvp); 1806 } 1807 VI_UNLOCK(devvp); 1808 if (lkp != NULL) { 1809 vn_cow_disestablish(devvp, ffs_copyonwrite, devvp); 1810 FREE(lkp, M_UFSMNT); 1811 } 1812} 1813 1814/* 1815 * Check for need to copy block that is about to be written, 1816 * copying the block if necessary. 1817 */ 1818static int 1819ffs_copyonwrite(void *v, struct buf *bp) 1820{ 1821 struct buf *ibp; 1822 struct fs *fs; 1823 struct inode *ip; 1824 struct vnode *devvp = v, *vp = 0, *saved_vp = NULL; 1825 struct ufsmount *ump = VFSTOUFS(devvp->v_specmountpoint); 1826 caddr_t saved_data = NULL; 1827 ufs2_daddr_t lbn, blkno, *snapblklist; 1828 int lower, upper, mid, s, ns, indiroff, snapshot_locked = 0, error = 0; 1829 1830 /* 1831 * Check for valid snapshots. 1832 */ 1833 VI_LOCK(devvp); 1834 ip = TAILQ_FIRST(&ump->um_snapshots); 1835 if (ip == NULL) { 1836 VI_UNLOCK(devvp); 1837 return 0; 1838 } 1839 /* 1840 * First check to see if it is in the preallocated list. 1841 * By doing this check we avoid several potential deadlocks. 1842 */ 1843 fs = ip->i_fs; 1844 ns = UFS_FSNEEDSWAP(fs); 1845 lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); 1846 snapblklist = ump->um_snapblklist; 1847 upper = ump->um_snapblklist[0] - 1; 1848 lower = 1; 1849 while (lower <= upper) { 1850 mid = (lower + upper) / 2; 1851 if (snapblklist[mid] == lbn) 1852 break; 1853 if (snapblklist[mid] < lbn) 1854 lower = mid + 1; 1855 else 1856 upper = mid - 1; 1857 } 1858 if (lower <= upper) { 1859 VI_UNLOCK(devvp); 1860 return 0; 1861 } 1862 /* 1863 * Not in the precomputed list, so check the snapshots. 1864 */ 1865retry: 1866 TAILQ_FOREACH(ip, &ump->um_snapshots, i_nextsnap) { 1867 vp = ITOV(ip); 1868 /* 1869 * We ensure that everything of our own that needs to be 1870 * copied will be done at the time that ffs_snapshot is 1871 * called. Thus we can skip the check here which can 1872 * deadlock in doing the lookup in ffs_balloc. 1873 */ 1874 if (bp->b_vp == vp) 1875 continue; 1876 /* 1877 * Check to see if block needs to be copied. We do not have 1878 * to hold the snapshot lock while doing this lookup as it 1879 * will never require any additional allocations for the 1880 * snapshot inode. 1881 */ 1882 if (lbn < NDADDR) { 1883 blkno = db_get(ip, lbn); 1884 } else { 1885 if (snapshot_locked == 0 && 1886 lockmgr(vp->v_vnlock, 1887 LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 1888 VI_MTX(devvp)) != 0) { 1889 VI_LOCK(devvp); 1890 goto retry; 1891 } 1892 snapshot_locked = 1; 1893 s = cow_enter(); 1894 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), 1895 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 1896 cow_leave(s); 1897 if (error) 1898 break; 1899 indiroff = (lbn - NDADDR) % NINDIR(fs); 1900 blkno = idb_get(ip, ibp->b_data, indiroff); 1901 brelse(ibp); 1902 } 1903#ifdef DIAGNOSTIC 1904 if (blkno == BLK_SNAP && bp->b_lblkno >= 0) 1905 panic("ffs_copyonwrite: bad copy block"); 1906#endif 1907 if (blkno != 0) 1908 continue; 1909#ifdef DIAGNOSTIC 1910 if (curlwp->l_flag & L_COWINPROGRESS) 1911 printf("ffs_copyonwrite: recursive call\n"); 1912#endif 1913 /* 1914 * Allocate the block into which to do the copy. Since 1915 * multiple processes may all try to copy the same block, 1916 * we have to recheck our need to do a copy if we sleep 1917 * waiting for the lock. 1918 * 1919 * Because all snapshots on a filesystem share a single 1920 * lock, we ensure that we will never be in competition 1921 * with another process to allocate a block. 1922 */ 1923 if (snapshot_locked == 0 && 1924 lockmgr(vp->v_vnlock, 1925 LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 1926 VI_MTX(devvp)) != 0) { 1927 VI_LOCK(devvp); 1928 goto retry; 1929 } 1930 snapshot_locked = 1; 1931#ifdef DEBUG 1932 if (snapdebug) { 1933 printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ", 1934 (unsigned long long)ip->i_number, lbn); 1935 if (bp->b_vp == devvp) 1936 printf("fs metadata"); 1937 else 1938 printf("inum %llu", (unsigned long long) 1939 VTOI(bp->b_vp)->i_number); 1940 printf(" lblkno %" PRId64 "\n", bp->b_lblkno); 1941 } 1942#endif 1943 /* 1944 * If we have already read the old block contents, then 1945 * simply copy them to the new block. Note that we need 1946 * to synchronously write snapshots that have not been 1947 * unlinked, and hence will be visible after a crash, 1948 * to ensure their integrity. 1949 */ 1950 if (saved_data) { 1951 error = writevnblk(vp, saved_data, lbn); 1952 if (error) 1953 break; 1954 continue; 1955 } 1956 /* 1957 * Otherwise, read the old block contents into the buffer. 1958 */ 1959 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 1960 saved_vp = vp; 1961 if ((error = readfsblk(vp, saved_data, lbn)) != 0) { 1962 free(saved_data, M_UFSMNT); 1963 saved_data = NULL; 1964 break; 1965 } 1966 } 1967 /* 1968 * Note that we need to synchronously write snapshots that 1969 * have not been unlinked, and hence will be visible after 1970 * a crash, to ensure their integrity. 1971 */ 1972 if (saved_data) { 1973 error = writevnblk(saved_vp, saved_data, lbn); 1974 free(saved_data, M_UFSMNT); 1975 } 1976 if (snapshot_locked) 1977 VOP_UNLOCK(vp, 0); 1978 else 1979 VI_UNLOCK(devvp); 1980 return error; 1981} 1982 1983/* 1984 * Read the specified block from disk. Vp is usually a snapshot vnode. 1985 */ 1986static int 1987readfsblk(struct vnode *vp, caddr_t data, ufs2_daddr_t lbn) 1988{ 1989 int error; 1990 struct inode *ip = VTOI(vp); 1991 struct fs *fs = ip->i_fs; 1992 struct buf *nbp; 1993 1994 nbp = getiobuf(); 1995 nbp->b_flags = B_READ; 1996 nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize; 1997 nbp->b_error = 0; 1998 nbp->b_data = data; 1999 nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn)); 2000 nbp->b_proc = NULL; 2001 nbp->b_dev = ip->i_devvp->v_rdev; 2002 nbp->b_vp = NULLVP; 2003 2004 DEV_STRATEGY(nbp); 2005 2006 error = biowait(nbp); 2007 2008 putiobuf(nbp); 2009 2010 return error; 2011} 2012 2013/* 2014 * Read the specified block. Bypass UBC to prevent deadlocks. 2015 */ 2016static int 2017readvnblk(struct vnode *vp, caddr_t data, ufs2_daddr_t lbn) 2018{ 2019 int error; 2020 daddr_t bn; 2021 off_t offset; 2022 struct inode *ip = VTOI(vp); 2023 struct fs *fs = ip->i_fs; 2024 2025 error = VOP_BMAP(vp, lbn, NULL, &bn, NULL); 2026 if (error) 2027 return error; 2028 2029 if (bn != (daddr_t)-1) { 2030 offset = dbtob(bn); 2031 simple_lock(&vp->v_interlock); 2032 error = VOP_PUTPAGES(vp, trunc_page(offset), 2033 round_page(offset+fs->fs_bsize), 2034 PGO_CLEANIT|PGO_SYNCIO|PGO_FREE); 2035 if (error) 2036 return error; 2037 2038 return readfsblk(vp, data, fragstoblks(fs, dbtofsb(fs, bn))); 2039 } 2040 2041 bzero(data, fs->fs_bsize); 2042 2043 return 0; 2044} 2045 2046/* 2047 * Write the specified block. Bypass UBC to prevent deadlocks. 2048 */ 2049static int 2050writevnblk(struct vnode *vp, caddr_t data, ufs2_daddr_t lbn) 2051{ 2052 int s, error; 2053 off_t offset; 2054 struct buf *bp; 2055 struct inode *ip = VTOI(vp); 2056 struct fs *fs = ip->i_fs; 2057 2058 offset = lblktosize(fs, (off_t)lbn); 2059 s = cow_enter(); 2060 simple_lock(&vp->v_interlock); 2061 error = VOP_PUTPAGES(vp, trunc_page(offset), 2062 round_page(offset+fs->fs_bsize), PGO_CLEANIT|PGO_SYNCIO|PGO_FREE); 2063 if (error == 0) 2064 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), 2065 fs->fs_bsize, KERNCRED, B_SYNC, &bp); 2066 cow_leave(s); 2067 if (error) 2068 return error; 2069 2070 bcopy(data, bp->b_data, fs->fs_bsize); 2071 bp->b_flags |= B_NOCACHE; 2072 2073 return bwrite(bp); 2074} 2075 2076/* 2077 * Set/reset lwp's L_COWINPROGRESS flag. 2078 * May be called recursive. 2079 */ 2080static inline int 2081cow_enter(void) 2082{ 2083 struct lwp *l = curlwp; 2084 2085 if (l->l_flag & L_COWINPROGRESS) { 2086 return 0; 2087 } else { 2088 l->l_flag |= L_COWINPROGRESS; 2089 return L_COWINPROGRESS; 2090 } 2091} 2092 2093static inline void 2094cow_leave(int flag) 2095{ 2096 struct lwp *l = curlwp; 2097 2098 l->l_flag &= ~flag; 2099} 2100 2101/* 2102 * Get/Put direct block from inode or buffer containing disk addresses. Take 2103 * care for fs type (UFS1/UFS2) and byte swapping. These functions should go 2104 * into a global include. 2105 */ 2106static inline ufs2_daddr_t 2107db_get(struct inode *ip, int loc) 2108{ 2109 if (ip->i_ump->um_fstype == UFS1) 2110 return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip)); 2111 else 2112 return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip)); 2113} 2114 2115static inline void 2116db_assign(struct inode *ip, int loc, ufs2_daddr_t val) 2117{ 2118 if (ip->i_ump->um_fstype == UFS1) 2119 ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2120 else 2121 ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2122} 2123 2124static inline ufs2_daddr_t 2125idb_get(struct inode *ip, caddr_t bf, int loc) 2126{ 2127 if (ip->i_ump->um_fstype == UFS1) 2128 return ufs_rw32(((ufs1_daddr_t *)(bf))[loc], 2129 UFS_IPNEEDSWAP(ip)); 2130 else 2131 return ufs_rw64(((ufs2_daddr_t *)(bf))[loc], 2132 UFS_IPNEEDSWAP(ip)); 2133} 2134 2135static inline void 2136idb_assign(struct inode *ip, caddr_t bf, int loc, ufs2_daddr_t val) 2137{ 2138 if (ip->i_ump->um_fstype == UFS1) 2139 ((ufs1_daddr_t *)(bf))[loc] = 2140 ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2141 else 2142 ((ufs2_daddr_t *)(bf))[loc] = 2143 ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2144} 2145