ffs_snapshot.c revision 1.73
1/* $NetBSD: ffs_snapshot.c,v 1.73 2008/07/31 15:37:56 hannken Exp $ */ 2 3/* 4 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 5 * 6 * Further information about snapshots can be obtained from: 7 * 8 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 9 * 1614 Oxford Street mckusick@mckusick.com 10 * Berkeley, CA 94709-1608 +1-510-843-9542 11 * USA 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 23 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 24 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 25 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 26 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 27 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 
34 * 35 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 36 * 37 * from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp 38 */ 39 40#include <sys/cdefs.h> 41__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.73 2008/07/31 15:37:56 hannken Exp $"); 42 43#if defined(_KERNEL_OPT) 44#include "opt_ffs.h" 45#endif 46 47#include <sys/param.h> 48#include <sys/kernel.h> 49#include <sys/systm.h> 50#include <sys/conf.h> 51#include <sys/buf.h> 52#include <sys/proc.h> 53#include <sys/namei.h> 54#include <sys/sched.h> 55#include <sys/stat.h> 56#include <sys/malloc.h> 57#include <sys/mount.h> 58#include <sys/resource.h> 59#include <sys/resourcevar.h> 60#include <sys/vnode.h> 61#include <sys/kauth.h> 62#include <sys/fstrans.h> 63 64#include <miscfs/specfs/specdev.h> 65 66#include <ufs/ufs/quota.h> 67#include <ufs/ufs/ufsmount.h> 68#include <ufs/ufs/inode.h> 69#include <ufs/ufs/ufs_extern.h> 70#include <ufs/ufs/ufs_bswap.h> 71 72#include <ufs/ffs/fs.h> 73#include <ufs/ffs/ffs_extern.h> 74 75#include <uvm/uvm.h> 76 77/* FreeBSD -> NetBSD conversion */ 78#define KERNCRED lwp0.l_cred 79#define ufs1_daddr_t int32_t 80#define ufs2_daddr_t int64_t 81#define ufs_lbn_t daddr_t 82#define VI_MTX(v) (&(v)->v_interlock) 83#define VI_LOCK(v) mutex_enter(&(v)->v_interlock) 84#define VI_UNLOCK(v) mutex_exit(&(v)->v_interlock) 85#define MNT_ILOCK(v) mutex_enter(&mntvnode_lock) 86#define MNT_IUNLOCK(v) mutex_exit(&mntvnode_lock) 87 88#if !defined(FFS_NO_SNAPSHOT) 89static int cgaccount(int, struct vnode *, void *, int); 90static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, 91 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 92 ufs_lbn_t, int), int); 93static int indiracct_ufs1(struct vnode *, struct vnode *, int, 94 ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 95 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 96 ufs_lbn_t, int), int); 97static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 
98 struct fs *, ufs_lbn_t, int); 99static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 100 struct fs *, ufs_lbn_t, int); 101static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 102 struct fs *, ufs_lbn_t, int); 103static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, 104 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 105 ufs_lbn_t, int), int); 106static int indiracct_ufs2(struct vnode *, struct vnode *, int, 107 ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 108 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 109 ufs_lbn_t, int), int); 110static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 111 struct fs *, ufs_lbn_t, int); 112static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 113 struct fs *, ufs_lbn_t, int); 114static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 115 struct fs *, ufs_lbn_t, int); 116#endif /* !defined(FFS_NO_SNAPSHOT) */ 117 118static int ffs_copyonwrite(void *, struct buf *, bool); 119static int snapblkaddr(struct vnode *, daddr_t, daddr_t *); 120static int rwfsblk(struct vnode *, int, void *, ufs2_daddr_t); 121static int wrsnapblk(struct vnode *, void *, ufs2_daddr_t); 122static inline ufs2_daddr_t db_get(struct inode *, int); 123static inline void db_assign(struct inode *, int, ufs2_daddr_t); 124static inline ufs2_daddr_t idb_get(struct inode *, void *, int); 125static inline void idb_assign(struct inode *, void *, int, ufs2_daddr_t); 126 127struct snap_info { 128 kmutex_t si_lock; /* Lock this snapinfo */ 129 struct vnlock si_vnlock; /* Snapshot vnode common lock */ 130 TAILQ_HEAD(inodelst, inode) si_snapshots; /* List of active snapshots */ 131 daddr_t *si_snapblklist; /* Snapshot block hints list */ 132 uint32_t si_gen; /* Incremented on change */ 133}; 134 135#ifdef DEBUG 136static int snapdebug = 0; 137#endif 138 139int 140ffs_snapshot_init(struct ufsmount 
*ump) 141{ 142 struct snap_info *si; 143 144 si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP); 145 if (si == NULL) 146 return ENOMEM; 147 148 TAILQ_INIT(&si->si_snapshots); 149 mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE); 150 rw_init(&si->si_vnlock.vl_lock); 151 si->si_vnlock.vl_canrecurse = 1; 152 si->si_vnlock.vl_recursecnt = 0; 153 si->si_gen = 0; 154 si->si_snapblklist = NULL; 155 156 return 0; 157} 158 159void 160ffs_snapshot_fini(struct ufsmount *ump) 161{ 162 struct snap_info *si; 163 164 si = ump->um_snapinfo; 165 ump->um_snapinfo = NULL; 166 167 KASSERT(TAILQ_EMPTY(&si->si_snapshots)); 168 mutex_destroy(&si->si_lock); 169 rw_destroy(&si->si_vnlock.vl_lock); 170 KASSERT(si->si_snapblklist == NULL); 171 kmem_free(si, sizeof(*si)); 172} 173 174/* 175 * Create a snapshot file and initialize it for the filesystem. 176 * Vnode is locked on entry and return. 177 */ 178int 179ffs_snapshot(struct mount *mp, struct vnode *vp, 180 struct timespec *ctime) 181{ 182#if defined(FFS_NO_SNAPSHOT) 183 return EOPNOTSUPP; 184} 185#else /* defined(FFS_NO_SNAPSHOT) */ 186 ufs2_daddr_t numblks, blkno, *blkp, snaplistsize = 0, *snapblklist; 187 int error, ns, cg, snaploc; 188 int i, size, len, loc; 189 int flag = mp->mnt_flag; 190 struct timeval starttime; 191#ifdef DEBUG 192 struct timeval endtime; 193#endif 194 struct timespec ts; 195 long redo = 0; 196 int32_t *lp; 197 void *space; 198 void *sbbuf = NULL; 199 struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs; 200 struct lwp *l = curlwp; 201 struct inode *ip, *xp; 202 struct buf *bp, *ibp, *nbp; 203 struct vattr vat; 204 struct vnode *xvp, *mvp, *devvp; 205 struct snap_info *si; 206 207 ns = UFS_FSNEEDSWAP(fs); 208 si = VFSTOUFS(mp)->um_snapinfo; 209 210 /* Snapshots do not work yet with WAPBL. */ 211 if ((mp->mnt_flag & MNT_LOG)) 212 return EOPNOTSUPP; 213 /* 214 * Need to serialize access to snapshot code per filesystem. 215 */ 216 /* 217 * If the vnode already is a snapshot, return. 
218 */ 219 if (VTOI(vp)->i_flags & SF_SNAPSHOT) { 220 if (ctime) { 221 ctime->tv_sec = DIP(VTOI(vp), mtime); 222 ctime->tv_nsec = DIP(VTOI(vp), mtimensec); 223 } 224 return 0; 225 } 226 /* 227 * Check mount, exclusive reference and owner. 228 */ 229 if (vp->v_mount != mp) 230 return EXDEV; 231 if (vp->v_usecount != 1 || vp->v_writecount != 0) 232 return EBUSY; 233 if (kauth_authorize_generic(l->l_cred, KAUTH_GENERIC_ISSUSER, 234 NULL) != 0 && 235 VTOI(vp)->i_uid != kauth_cred_geteuid(l->l_cred)) 236 return EACCES; 237 238 if (vp->v_size != 0) { 239 error = ffs_truncate(vp, 0, 0, NOCRED); 240 if (error) 241 return error; 242 } 243 /* 244 * Assign a snapshot slot in the superblock. 245 */ 246 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 247 if (fs->fs_snapinum[snaploc] == 0) 248 break; 249 if (snaploc == FSMAXSNAP) 250 return (ENOSPC); 251 ip = VTOI(vp); 252 devvp = ip->i_devvp; 253 /* 254 * Write an empty list of preallocated blocks to the end of 255 * the snapshot to set size to at least that of the filesystem. 256 */ 257 numblks = howmany(fs->fs_size, fs->fs_frag); 258 blkno = 1; 259 blkno = ufs_rw64(blkno, ns); 260 error = vn_rdwr(UIO_WRITE, vp, 261 (void *)&blkno, sizeof(blkno), lblktosize(fs, (off_t)numblks), 262 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL); 263 if (error) 264 goto out; 265 /* 266 * Preallocate critical data structures so that we can copy 267 * them in without further allocation after we suspend all 268 * operations on the filesystem. We would like to just release 269 * the allocated buffers without writing them since they will 270 * be filled in below once we are ready to go, but this upsets 271 * the soft update code, so we go ahead and write the new buffers. 272 * 273 * Allocate all indirect blocks and mark all of them as not 274 * needing to be copied. 
275 */ 276 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 277 error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno), 278 fs->fs_bsize, l->l_cred, B_METAONLY, &ibp); 279 if (error) 280 goto out; 281 if (DOINGSOFTDEP(vp)) 282 bawrite(ibp); 283 else 284 brelse(ibp, 0); 285 } 286 /* 287 * Allocate copies for the superblock and its summary information. 288 */ 289 error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 290 0, &nbp); 291 if (error) 292 goto out; 293 bawrite(nbp); 294 blkno = fragstoblks(fs, fs->fs_csaddr); 295 len = howmany(fs->fs_cssize, fs->fs_bsize); 296 for (loc = 0; loc < len; loc++) { 297 error = ffs_balloc(vp, lblktosize(fs, (off_t)(blkno + loc)), 298 fs->fs_bsize, KERNCRED, 0, &nbp); 299 if (error) 300 goto out; 301 bawrite(nbp); 302 } 303 /* 304 * Copy all the cylinder group maps. Although the 305 * filesystem is still active, we hope that only a few 306 * cylinder groups will change between now and when we 307 * suspend operations. Thus, we will be able to quickly 308 * touch up the few cylinder groups that changed during 309 * the suspension period. 310 */ 311 len = howmany(fs->fs_ncg, NBBY); 312 fs->fs_active = malloc(len, M_DEVBUF, M_WAITOK | M_ZERO); 313 for (cg = 0; cg < fs->fs_ncg; cg++) { 314 if ((error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)), 315 fs->fs_bsize, KERNCRED, 0, &nbp)) != 0) 316 goto out; 317 error = cgaccount(cg, vp, nbp->b_data, 1); 318 bawrite(nbp); 319 if (error) 320 goto out; 321 } 322 /* 323 * Change inode to snapshot type file. 324 */ 325 ip->i_flags |= SF_SNAPSHOT; 326 DIP_ASSIGN(ip, flags, ip->i_flags); 327 ip->i_flag |= IN_CHANGE | IN_UPDATE; 328 /* 329 * Ensure that the snapshot is completely on disk. 330 * Since we have marked it as a snapshot it is safe to 331 * unlock it as no process will be allowed to write to it. 
332 */ 333 if ((error = VOP_FSYNC(vp, KERNCRED, FSYNC_WAIT, 0, 0)) != 0) 334 goto out; 335 VOP_UNLOCK(vp, 0); 336 /* 337 * All allocations are done, so we can now snapshot the system. 338 * 339 * Suspend operation on filesystem. 340 */ 341 if ((error = vfs_suspend(vp->v_mount, 0)) != 0) { 342 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 343 goto out; 344 } 345 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 346 getmicrotime(&starttime); 347 /* 348 * First, copy all the cylinder group maps that have changed. 349 */ 350 for (cg = 0; cg < fs->fs_ncg; cg++) { 351 if (ACTIVECG_ISSET(fs, cg)) 352 continue; 353 redo++; 354 if ((error = ffs_balloc(vp, lfragtosize(fs, cgtod(fs, cg)), 355 fs->fs_bsize, KERNCRED, 0, &nbp)) != 0) 356 goto out1; 357 error = cgaccount(cg, vp, nbp->b_data, 2); 358 bawrite(nbp); 359 if (error) 360 goto out1; 361 } 362 /* 363 * Grab a copy of the superblock and its summary information. 364 * We delay writing it until the suspension is released below. 365 */ 366 sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 367 loc = blkoff(fs, fs->fs_sblockloc); 368 if (loc > 0) 369 memset(sbbuf, 0, loc); 370 copy_fs = (struct fs *)((char *)sbbuf + loc); 371 bcopy(fs, copy_fs, fs->fs_sbsize); 372 size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; 373 if (fs->fs_sbsize < size) 374 memset((char *)sbbuf + loc + fs->fs_sbsize, 0, 375 size - fs->fs_sbsize); 376 size = blkroundup(fs, fs->fs_cssize); 377 if (fs->fs_contigsumsize > 0) 378 size += fs->fs_ncg * sizeof(int32_t); 379 space = malloc((u_long)size, M_UFSMNT, M_WAITOK); 380 copy_fs->fs_csp = space; 381 bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); 382 space = (char *)space + fs->fs_cssize; 383 loc = howmany(fs->fs_cssize, fs->fs_fsize); 384 i = fs->fs_frag - loc % fs->fs_frag; 385 len = (i == fs->fs_frag) ? 
0 : i * fs->fs_fsize; 386 if (len > 0) { 387 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), 388 len, KERNCRED, 0, &bp)) != 0) { 389 brelse(bp, 0); 390 free(copy_fs->fs_csp, M_UFSMNT); 391 goto out1; 392 } 393 bcopy(bp->b_data, space, (u_int)len); 394 space = (char *)space + len; 395 brelse(bp, BC_INVAL | BC_NOCACHE); 396 } 397 if (fs->fs_contigsumsize > 0) { 398 copy_fs->fs_maxcluster = lp = space; 399 for (i = 0; i < fs->fs_ncg; i++) 400 *lp++ = fs->fs_contigsumsize; 401 } 402 /* 403 * We must check for active files that have been unlinked 404 * (e.g., with a zero link count). We have to expunge all 405 * trace of these files from the snapshot so that they are 406 * not reclaimed prematurely by fsck or unnecessarily dumped. 407 * We turn off the MNTK_SUSPENDED flag to avoid a panic from 408 * spec_strategy about writing on a suspended filesystem. 409 * Note that we skip unlinked snapshot files as they will 410 * be handled separately below. 411 * 412 * We also calculate the needed size for the snapshot list. 413 */ 414 snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + 415 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; 416 /* Allocate a marker vnode */ 417 if ((mvp = vnalloc(mp)) == NULL) { 418 error = ENOMEM; 419 goto out1; 420 } 421 MNT_ILOCK(mp); 422 /* 423 * NOTE: not using the TAILQ_FOREACH here since in this loop vgone() 424 * and vclean() can be called indirectly 425 */ 426 for (xvp = TAILQ_FIRST(&mp->mnt_vnodelist); xvp; xvp = vunmark(mvp)) { 427 vmark(mvp, xvp); 428 /* 429 * Make sure this vnode wasn't reclaimed in getnewvnode(). 430 * Start over if it has (it won't be on the list anymore). 
431 */ 432 if (xvp->v_mount != mp || vismarker(xvp)) 433 continue; 434 VI_LOCK(xvp); 435 if ((xvp->v_iflag & VI_XLOCK) || 436 xvp->v_usecount == 0 || xvp->v_type == VNON || 437 VTOI(xvp) == NULL || 438 (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { 439 VI_UNLOCK(xvp); 440 continue; 441 } 442 MNT_IUNLOCK(mp); 443 /* 444 * XXXAD should increase vnode ref count to prevent it 445 * disappearing or being recycled. 446 */ 447 VI_UNLOCK(xvp); 448#ifdef DEBUG 449 if (snapdebug) 450 vprint("ffs_snapshot: busy vnode", xvp); 451#endif 452 if (VOP_GETATTR(xvp, &vat, l->l_cred) == 0 && 453 vat.va_nlink > 0) { 454 MNT_ILOCK(mp); 455 continue; 456 } 457 xp = VTOI(xvp); 458 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { 459 MNT_ILOCK(mp); 460 continue; 461 } 462 /* 463 * If there is a fragment, clear it here. 464 */ 465 blkno = 0; 466 loc = howmany(xp->i_size, fs->fs_bsize) - 1; 467 if (loc < NDADDR) { 468 len = fragroundup(fs, blkoff(fs, xp->i_size)); 469 if (len > 0 && len < fs->fs_bsize) { 470 ffs_blkfree(copy_fs, vp, db_get(xp, loc), 471 len, xp->i_number); 472 blkno = db_get(xp, loc); 473 db_assign(xp, loc, 0); 474 } 475 } 476 snaplistsize += 1; 477 if (xp->i_ump->um_fstype == UFS1) 478 error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, 479 BLK_NOCOPY); 480 else 481 error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, 482 BLK_NOCOPY); 483 if (blkno) 484 db_assign(xp, loc, blkno); 485 if (!error) 486 error = ffs_freefile(copy_fs, vp, xp->i_number, 487 xp->i_mode); 488 if (error) { 489 free(copy_fs->fs_csp, M_UFSMNT); 490 (void)vunmark(mvp); 491 goto out1; 492 } 493 MNT_ILOCK(mp); 494 } 495 MNT_IUNLOCK(mp); 496 vnfree(mvp); 497 /* 498 * Acquire the snapshot lock and give up our original private lock. 
499 */ 500 VI_LOCK(vp); 501 vp->v_vnlock = &si->si_vnlock; 502 vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY); 503 vlockmgr(&vp->v_lock, LK_RELEASE); 504 /* 505 * If this is the first snapshot on this filesystem, then we need 506 * to allocate the space for the list of preallocated snapshot blocks. 507 * This list will be refined below, but this preliminary one will 508 * keep us out of deadlock until the full one is ready. 509 */ 510 mutex_enter(&si->si_lock); 511 if ((xp = TAILQ_FIRST(&si->si_snapshots)) == NULL) { 512 mutex_exit(&si->si_lock); 513 snapblklist = malloc( 514 snaplistsize * sizeof(ufs2_daddr_t), M_UFSMNT, M_WAITOK); 515 blkp = &snapblklist[1]; 516 *blkp++ = lblkno(fs, fs->fs_sblockloc); 517 blkno = fragstoblks(fs, fs->fs_csaddr); 518 for (cg = 0; cg < fs->fs_ncg; cg++) { 519 if (fragstoblks(fs, cgtod(fs, cg)) > blkno) 520 break; 521 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 522 } 523 len = howmany(fs->fs_cssize, fs->fs_bsize); 524 for (loc = 0; loc < len; loc++) 525 *blkp++ = blkno + loc; 526 for (; cg < fs->fs_ncg; cg++) 527 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 528 snapblklist[0] = blkp - snapblklist; 529 mutex_enter(&si->si_lock); 530 if (si->si_snapblklist != NULL) 531 panic("ffs_snapshot: non-empty list"); 532 si->si_snapblklist = snapblklist; 533 } 534 /* 535 * Record snapshot inode. Since this is the newest snapshot, 536 * it must be placed at the end of the list. 537 */ 538 fs->fs_snapinum[snaploc] = ip->i_number; 539 if (ip->i_nextsnap.tqe_prev != 0) 540 panic("ffs_snapshot: %llu already on list", 541 (unsigned long long)ip->i_number); 542 TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap); 543 if (xp == NULL) 544 fscow_establish(mp, ffs_copyonwrite, devvp); 545 si->si_gen++; 546 mutex_exit(&si->si_lock); 547 vp->v_vflag |= VV_SYSTEM; 548out1: 549 /* 550 * Resume operation on filesystem. 551 */ 552 vfs_resume(vp->v_mount); 553 /* 554 * Set the mtime to the time the snapshot has been taken. 
555 */ 556 TIMEVAL_TO_TIMESPEC(&starttime, &ts); 557 if (ctime) 558 *ctime = ts; 559 DIP_ASSIGN(ip, mtime, ts.tv_sec); 560 DIP_ASSIGN(ip, mtimensec, ts.tv_nsec); 561 ip->i_flag |= IN_CHANGE | IN_UPDATE; 562 563#ifdef DEBUG 564 if (starttime.tv_sec > 0) { 565 getmicrotime(&endtime); 566 timersub(&endtime, &starttime, &endtime); 567 printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", 568 vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, 569 endtime.tv_usec / 1000, redo, fs->fs_ncg); 570 } 571#endif 572 if (error) 573 goto out; 574 /* 575 * Copy allocation information from all the snapshots in 576 * this snapshot and then expunge them from its view. 577 */ 578 TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) { 579 if (xp == ip) 580 break; 581 if (xp->i_ump->um_fstype == UFS1) 582 error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, 583 BLK_SNAP); 584 else 585 error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, 586 BLK_SNAP); 587 if (error == 0 && xp->i_ffs_effnlink == 0) 588 error = ffs_freefile(copy_fs, vp, 589 xp->i_number, xp->i_mode); 590 if (error) { 591 fs->fs_snapinum[snaploc] = 0; 592 goto done; 593 } 594 } 595 /* 596 * Allocate space for the full list of preallocated snapshot blocks. 597 */ 598 snapblklist = malloc(snaplistsize * sizeof(ufs2_daddr_t), 599 M_UFSMNT, M_WAITOK); 600 ip->i_snapblklist = &snapblklist[1]; 601 /* 602 * Expunge the blocks used by the snapshots from the set of 603 * blocks marked as used in the snapshot bitmaps. Also, collect 604 * the list of allocated blocks in i_snapblklist. 
605 */ 606 if (ip->i_ump->um_fstype == UFS1) 607 error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP); 608 else 609 error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP); 610 if (error) { 611 fs->fs_snapinum[snaploc] = 0; 612 FREE(snapblklist, M_UFSMNT); 613 goto done; 614 } 615 if (snaplistsize < ip->i_snapblklist - snapblklist) 616 panic("ffs_snapshot: list too small"); 617 snaplistsize = ip->i_snapblklist - snapblklist; 618 snapblklist[0] = snaplistsize; 619 ip->i_snapblklist = &snapblklist[0]; 620 /* 621 * Write out the list of allocated blocks to the end of the snapshot. 622 */ 623 for (i = 0; i < snaplistsize; i++) 624 snapblklist[i] = ufs_rw64(snapblklist[i], ns); 625 error = vn_rdwr(UIO_WRITE, vp, (void *)snapblklist, 626 snaplistsize*sizeof(ufs2_daddr_t), lblktosize(fs, (off_t)numblks), 627 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL); 628 for (i = 0; i < snaplistsize; i++) 629 snapblklist[i] = ufs_rw64(snapblklist[i], ns); 630 if (error) { 631 fs->fs_snapinum[snaploc] = 0; 632 FREE(snapblklist, M_UFSMNT); 633 goto done; 634 } 635 /* 636 * Write the superblock and its summary information 637 * to the snapshot. 638 */ 639 blkno = fragstoblks(fs, fs->fs_csaddr); 640 len = howmany(fs->fs_cssize, fs->fs_bsize); 641 space = copy_fs->fs_csp; 642#ifdef FFS_EI 643 if (ns) { 644 ffs_sb_swap(copy_fs, copy_fs); 645 ffs_csum_swap(space, space, fs->fs_cssize); 646 } 647#endif 648 for (loc = 0; loc < len; loc++) { 649 error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, 0, &nbp); 650 if (error) { 651 brelse(nbp, 0); 652 fs->fs_snapinum[snaploc] = 0; 653 FREE(snapblklist, M_UFSMNT); 654 goto done; 655 } 656 bcopy(space, nbp->b_data, fs->fs_bsize); 657 space = (char *)space + fs->fs_bsize; 658 bawrite(nbp); 659 } 660 /* 661 * As this is the newest list, it is the most inclusive, so 662 * should replace the previous list. If this is the first snapshot 663 * free the preliminary list. 
664 */ 665 mutex_enter(&si->si_lock); 666 space = si->si_snapblklist; 667 si->si_snapblklist = snapblklist; 668 if (TAILQ_FIRST(&si->si_snapshots) == ip) 669 FREE(space, M_UFSMNT); 670 si->si_gen++; 671 mutex_exit(&si->si_lock); 672done: 673 free(copy_fs->fs_csp, M_UFSMNT); 674 if (!error) { 675 error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, 676 KERNCRED, 0, &nbp); 677 if (error) { 678 brelse(nbp, 0); 679 fs->fs_snapinum[snaploc] = 0; 680 } 681 bcopy(sbbuf, nbp->b_data, fs->fs_bsize); 682 bawrite(nbp); 683 } 684out: 685 /* 686 * Invalidate and free all pages on the snapshot vnode. 687 * All data has been written through the buffer cache. 688 * Clean all dirty buffers now to avoid UBC inconsistencies. 689 */ 690 if (!error) { 691 mutex_enter(&vp->v_interlock); 692 error = VOP_PUTPAGES(vp, 0, 0, 693 PGO_ALLPAGES|PGO_CLEANIT|PGO_SYNCIO|PGO_FREE); 694 } 695 if (!error) { 696 mutex_enter(&bufcache_lock); 697 for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { 698 nbp = LIST_NEXT(bp, b_vnbufs); 699 if (bp->b_lblkno < 0) 700 continue; 701 KASSERT((bp->b_cflags & BC_BUSY) == 0); 702 bp->b_cflags |= BC_BUSY; 703 mutex_exit(&bufcache_lock); 704 if (DOINGSOFTDEP(vp)) { 705 bp->b_cflags |= BC_VFLUSH | BC_NOCACHE; 706 error = bwrite(bp); 707 } else { 708 error = rwfsblk(vp, B_WRITE, bp->b_data, 709 fragstoblks(fs, dbtofsb(fs, bp->b_blkno))); 710 brelse(bp, BC_INVAL | BC_VFLUSH); 711 } 712 mutex_enter(&bufcache_lock); 713 if (error) 714 break; 715 nbp = LIST_FIRST(&vp->v_dirtyblkhd); 716 } 717 mutex_exit(&bufcache_lock); 718 } 719 if (sbbuf) 720 free(sbbuf, M_UFSMNT); 721 if (fs->fs_active != 0) { 722 FREE(fs->fs_active, M_DEVBUF); 723 fs->fs_active = 0; 724 } 725 mp->mnt_flag = flag; 726 if (error) 727 (void) ffs_truncate(vp, (off_t)0, 0, NOCRED); 728 else 729 vref(vp); 730 return (error); 731} 732 733/* 734 * Copy a cylinder group map. 
All the unallocated blocks are marked 735 * BLK_NOCOPY so that the snapshot knows that it need not copy them 736 * if they are later written. If passno is one, then this is a first 737 * pass, so only setting needs to be done. If passno is 2, then this 738 * is a revision to a previous pass which must be undone as the 739 * replacement pass is done. 740 */ 741static int 742cgaccount(int cg, struct vnode *vp, void *data, int passno) 743{ 744 struct buf *bp, *ibp; 745 struct inode *ip; 746 struct cg *cgp; 747 struct fs *fs; 748 ufs2_daddr_t base, numblks; 749 int error, len, loc, ns, indiroff; 750 751 ip = VTOI(vp); 752 fs = ip->i_fs; 753 ns = UFS_FSNEEDSWAP(fs); 754 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), 755 (int)fs->fs_cgsize, KERNCRED, 0, &bp); 756 if (error) { 757 brelse(bp, 0); 758 return (error); 759 } 760 cgp = (struct cg *)bp->b_data; 761 if (!cg_chkmagic(cgp, ns)) { 762 brelse(bp, 0); 763 return (EIO); 764 } 765 ACTIVECG_SET(fs, cg); 766 767 bcopy(bp->b_data, data, fs->fs_cgsize); 768 brelse(bp, 0); 769 if (fs->fs_cgsize < fs->fs_bsize) 770 memset((char *)data + fs->fs_cgsize, 0, 771 fs->fs_bsize - fs->fs_cgsize); 772 numblks = howmany(fs->fs_size, fs->fs_frag); 773 len = howmany(fs->fs_fpg, fs->fs_frag); 774 base = cg * fs->fs_fpg / fs->fs_frag; 775 if (base + len >= numblks) 776 len = numblks - base - 1; 777 loc = 0; 778 if (base < NDADDR) { 779 for ( ; loc < NDADDR; loc++) { 780 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 781 db_assign(ip, loc, BLK_NOCOPY); 782 else if (db_get(ip, loc) == BLK_NOCOPY) { 783 if (passno == 2) 784 db_assign(ip, loc, 0); 785 else if (passno == 1) 786 panic("ffs_snapshot: lost direct block"); 787 } 788 } 789 } 790 if ((error = ffs_balloc(vp, lblktosize(fs, (off_t)(base + loc)), 791 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0) 792 return (error); 793 indiroff = (base + loc - NDADDR) % NINDIR(fs); 794 for ( ; loc < len; loc++, indiroff++) { 795 if (indiroff >= NINDIR(fs)) { 796 bawrite(ibp); 797 if ((error 
= ffs_balloc(vp, 798 lblktosize(fs, (off_t)(base + loc)), 799 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp)) != 0) 800 return (error); 801 indiroff = 0; 802 } 803 if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc)) 804 idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY); 805 else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) { 806 if (passno == 2) 807 idb_assign(ip, ibp->b_data, indiroff, 0); 808 else if (passno == 1) 809 panic("ffs_snapshot: lost indirect block"); 810 } 811 } 812 bdwrite(ibp); 813 return (0); 814} 815 816/* 817 * Before expunging a snapshot inode, note all the 818 * blocks that it claims with BLK_SNAP so that fsck will 819 * be able to account for those blocks properly and so 820 * that this snapshot knows that it need not copy them 821 * if the other snapshot holding them is freed. This code 822 * is reproduced once each for UFS1 and UFS2. 823 */ 824static int 825expunge_ufs1(struct vnode *snapvp, struct inode *cancelip, struct fs *fs, 826 int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 827 struct fs *, ufs_lbn_t, int), 828 int expungetype) 829{ 830 int i, error, ns; 831 ufs_lbn_t lbn, rlbn; 832 ufs2_daddr_t len, blkno, numblks, blksperindir; 833 struct ufs1_dinode *dip; 834 struct buf *bp; 835 836 ns = UFS_FSNEEDSWAP(fs); 837 /* 838 * Prepare to expunge the inode. If its inode block has not 839 * yet been copied, then allocate and fill the copy. 840 */ 841 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 842 error = snapblkaddr(snapvp, lbn, &blkno); 843 if (error) 844 return error; 845 if (blkno != 0) { 846 error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, 847 B_MODIFY, &bp); 848 } else { 849 error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn), 850 fs->fs_bsize, KERNCRED, 0, &bp); 851 if (! error) 852 error = rwfsblk(snapvp, B_READ, bp->b_data, lbn); 853 } 854 if (error) 855 return error; 856 /* 857 * Set a snapshot inode to be a zero length file, regular files 858 * or unlinked snapshots to be completely unallocated. 
859 */ 860 dip = (struct ufs1_dinode *)bp->b_data + 861 ino_to_fsbo(fs, cancelip->i_number); 862 if (expungetype == BLK_NOCOPY || cancelip->i_ffs_effnlink == 0) 863 dip->di_mode = 0; 864 dip->di_size = 0; 865 dip->di_blocks = 0; 866 dip->di_flags = 867 ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns); 868 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t)); 869 bdwrite(bp); 870 /* 871 * Now go through and expunge all the blocks in the file 872 * using the function requested. 873 */ 874 numblks = howmany(cancelip->i_size, fs->fs_bsize); 875 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_db[0], 876 &cancelip->i_ffs1_db[NDADDR], fs, 0, expungetype))) 877 return (error); 878 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs1_ib[0], 879 &cancelip->i_ffs1_ib[NIADDR], fs, -1, expungetype))) 880 return (error); 881 blksperindir = 1; 882 lbn = -NDADDR; 883 len = numblks - NDADDR; 884 rlbn = NDADDR; 885 for (i = 0; len > 0 && i < NIADDR; i++) { 886 error = indiracct_ufs1(snapvp, ITOV(cancelip), i, 887 ufs_rw32(cancelip->i_ffs1_ib[i], ns), lbn, rlbn, len, 888 blksperindir, fs, acctfunc, expungetype); 889 if (error) 890 return (error); 891 blksperindir *= NINDIR(fs); 892 lbn -= blksperindir + 1; 893 len -= blksperindir; 894 rlbn += blksperindir; 895 } 896 return (0); 897} 898 899/* 900 * Descend an indirect block chain for vnode cancelvp accounting for all 901 * its indirect blocks in snapvp. 
902 */ 903static int 904indiracct_ufs1(struct vnode *snapvp, struct vnode *cancelvp, int level, 905 ufs1_daddr_t blkno, ufs_lbn_t lbn, ufs_lbn_t rlbn, ufs_lbn_t remblks, 906 ufs_lbn_t blksperindir, struct fs *fs, 907 int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 908 struct fs *, ufs_lbn_t, int), 909 int expungetype) 910{ 911 int error, ns, num, i; 912 ufs_lbn_t subblksperindir; 913 struct indir indirs[NIADDR + 2]; 914 ufs1_daddr_t last, *bap; 915 struct buf *bp; 916 917 ns = UFS_FSNEEDSWAP(fs); 918 919 if (blkno == 0) { 920 if (expungetype == BLK_NOCOPY) 921 return (0); 922 panic("indiracct_ufs1: missing indir"); 923 } 924 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 925 return (error); 926 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 927 panic("indiracct_ufs1: botched params"); 928 /* 929 * We have to expand bread here since it will deadlock looking 930 * up the block number for any blocks that are not in the cache. 931 */ 932 error = ffs_getblk(cancelvp, lbn, fsbtodb(fs, blkno), fs->fs_bsize, 933 false, &bp); 934 if (error) 935 return error; 936 if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error = 937 rwfsblk(bp->b_vp, B_READ, bp->b_data, fragstoblks(fs, blkno)))) { 938 brelse(bp, 0); 939 return (error); 940 } 941 /* 942 * Account for the block pointers in this indirect block. 943 */ 944 last = howmany(remblks, blksperindir); 945 if (last > NINDIR(fs)) 946 last = NINDIR(fs); 947 bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); 948 bcopy(bp->b_data, (void *)bap, fs->fs_bsize); 949 brelse(bp, 0); 950 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 951 level == 0 ? rlbn : -1, expungetype); 952 if (error || level == 0) 953 goto out; 954 /* 955 * Account for the block pointers in each of the indirect blocks 956 * in the levels below us. 
957 */ 958 subblksperindir = blksperindir / NINDIR(fs); 959 for (lbn++, level--, i = 0; i < last; i++) { 960 error = indiracct_ufs1(snapvp, cancelvp, level, 961 ufs_rw32(bap[i], ns), lbn, rlbn, remblks, subblksperindir, 962 fs, acctfunc, expungetype); 963 if (error) 964 goto out; 965 rlbn += blksperindir; 966 lbn -= blksperindir; 967 remblks -= blksperindir; 968 } 969out: 970 FREE(bap, M_DEVBUF); 971 return (error); 972} 973 974/* 975 * Do both snap accounting and map accounting. 976 */ 977static int 978fullacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp, 979 struct fs *fs, ufs_lbn_t lblkno, 980 int exptype /* BLK_SNAP or BLK_NOCOPY */) 981{ 982 int error; 983 984 if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 985 return (error); 986 return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 987} 988 989/* 990 * Identify a set of blocks allocated in a snapshot inode. 991 */ 992static int 993snapacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp, 994 struct fs *fs, ufs_lbn_t lblkno, 995 int expungetype /* BLK_SNAP or BLK_NOCOPY */) 996{ 997 struct inode *ip = VTOI(vp); 998 ufs1_daddr_t blkno, *blkp; 999 ufs_lbn_t lbn; 1000 struct buf *ibp; 1001 int error, ns; 1002 1003 ns = UFS_FSNEEDSWAP(fs); 1004 1005 for ( ; oldblkp < lastblkp; oldblkp++) { 1006 blkno = ufs_rw32(*oldblkp, ns); 1007 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 1008 continue; 1009 lbn = fragstoblks(fs, blkno); 1010 if (lbn < NDADDR) { 1011 blkp = &ip->i_ffs1_db[lbn]; 1012 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1013 } else { 1014 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), 1015 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 1016 if (error) 1017 return (error); 1018 blkp = &((ufs1_daddr_t *)(ibp->b_data)) 1019 [(lbn - NDADDR) % NINDIR(fs)]; 1020 } 1021 /* 1022 * If we are expunging a snapshot vnode and we 1023 * find a block marked BLK_NOCOPY, then it is 1024 * one that has been allocated to this 
snapshot after 1025 * we took our current snapshot and can be ignored. 1026 */ 1027 blkno = ufs_rw32(*blkp, ns); 1028 if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) { 1029 if (lbn >= NDADDR) 1030 brelse(ibp, 0); 1031 } else { 1032 if (blkno != 0) 1033 panic("snapacct_ufs1: bad block"); 1034 *blkp = ufs_rw32(expungetype, ns); 1035 if (lbn >= NDADDR) 1036 bdwrite(ibp); 1037 } 1038 } 1039 return (0); 1040} 1041 1042/* 1043 * Account for a set of blocks allocated in a snapshot inode. 1044 */ 1045static int 1046mapacct_ufs1(struct vnode *vp, ufs1_daddr_t *oldblkp, ufs1_daddr_t *lastblkp, 1047 struct fs *fs, ufs_lbn_t lblkno, int expungetype) 1048{ 1049 ufs1_daddr_t blkno; 1050 struct inode *ip; 1051 ino_t inum; 1052 int acctit, ns; 1053 1054 ns = UFS_FSNEEDSWAP(fs); 1055 ip = VTOI(vp); 1056 inum = ip->i_number; 1057 if (lblkno == -1) 1058 acctit = 0; 1059 else 1060 acctit = 1; 1061 for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { 1062 blkno = ufs_rw32(*oldblkp, ns); 1063 if (blkno == 0 || blkno == BLK_NOCOPY) 1064 continue; 1065 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) 1066 *ip->i_snapblklist++ = lblkno; 1067 if (blkno == BLK_SNAP) 1068 blkno = blkstofrags(fs, lblkno); 1069 ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum); 1070 } 1071 return (0); 1072} 1073 1074/* 1075 * Before expunging a snapshot inode, note all the 1076 * blocks that it claims with BLK_SNAP so that fsck will 1077 * be able to account for those blocks properly and so 1078 * that this snapshot knows that it need not copy them 1079 * if the other snapshot holding them is freed. This code 1080 * is reproduced once each for UFS1 and UFS2. 
1081 */ 1082static int 1083expunge_ufs2(struct vnode *snapvp, struct inode *cancelip, struct fs *fs, 1084 int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 1085 struct fs *, ufs_lbn_t, int), 1086 int expungetype) 1087{ 1088 int i, error, ns; 1089 ufs_lbn_t lbn, rlbn; 1090 ufs2_daddr_t len, blkno, numblks, blksperindir; 1091 struct ufs2_dinode *dip; 1092 struct buf *bp; 1093 1094 ns = UFS_FSNEEDSWAP(fs); 1095 /* 1096 * Prepare to expunge the inode. If its inode block has not 1097 * yet been copied, then allocate and fill the copy. 1098 */ 1099 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 1100 error = snapblkaddr(snapvp, lbn, &blkno); 1101 if (error) 1102 return error; 1103 if (blkno != 0) { 1104 error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, 1105 B_MODIFY, &bp); 1106 } else { 1107 error = ffs_balloc(snapvp, lblktosize(fs, (off_t)lbn), 1108 fs->fs_bsize, KERNCRED, 0, &bp); 1109 if (! error) 1110 error = rwfsblk(snapvp, B_READ, bp->b_data, lbn); 1111 } 1112 if (error) 1113 return error; 1114 /* 1115 * Set a snapshot inode to be a zero length file, regular files 1116 * or unlinked snapshots to be completely unallocated. 1117 */ 1118 dip = (struct ufs2_dinode *)bp->b_data + 1119 ino_to_fsbo(fs, cancelip->i_number); 1120 if (expungetype == BLK_NOCOPY || cancelip->i_ffs_effnlink == 0) 1121 dip->di_mode = 0; 1122 dip->di_size = 0; 1123 dip->di_blocks = 0; 1124 dip->di_flags = 1125 ufs_rw32(ufs_rw32(dip->di_flags, ns) & ~SF_SNAPSHOT, ns); 1126 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t)); 1127 bdwrite(bp); 1128 /* 1129 * Now go through and expunge all the blocks in the file 1130 * using the function requested. 
1131 */ 1132 numblks = howmany(cancelip->i_size, fs->fs_bsize); 1133 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_db[0], 1134 &cancelip->i_ffs2_db[NDADDR], fs, 0, expungetype))) 1135 return (error); 1136 if ((error = (*acctfunc)(snapvp, &cancelip->i_ffs2_ib[0], 1137 &cancelip->i_ffs2_ib[NIADDR], fs, -1, expungetype))) 1138 return (error); 1139 blksperindir = 1; 1140 lbn = -NDADDR; 1141 len = numblks - NDADDR; 1142 rlbn = NDADDR; 1143 for (i = 0; len > 0 && i < NIADDR; i++) { 1144 error = indiracct_ufs2(snapvp, ITOV(cancelip), i, 1145 ufs_rw64(cancelip->i_ffs2_ib[i], ns), lbn, rlbn, len, 1146 blksperindir, fs, acctfunc, expungetype); 1147 if (error) 1148 return (error); 1149 blksperindir *= NINDIR(fs); 1150 lbn -= blksperindir + 1; 1151 len -= blksperindir; 1152 rlbn += blksperindir; 1153 } 1154 return (0); 1155} 1156 1157/* 1158 * Descend an indirect block chain for vnode cancelvp accounting for all 1159 * its indirect blocks in snapvp. 1160 */ 1161static int 1162indiracct_ufs2(struct vnode *snapvp, struct vnode *cancelvp, int level, 1163 ufs2_daddr_t blkno, ufs_lbn_t lbn, ufs_lbn_t rlbn, ufs_lbn_t remblks, 1164 ufs_lbn_t blksperindir, struct fs *fs, 1165 int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 1166 struct fs *, ufs_lbn_t, int), 1167 int expungetype) 1168{ 1169 int error, ns, num, i; 1170 ufs_lbn_t subblksperindir; 1171 struct indir indirs[NIADDR + 2]; 1172 ufs2_daddr_t last, *bap; 1173 struct buf *bp; 1174 1175 ns = UFS_FSNEEDSWAP(fs); 1176 1177 if (blkno == 0) { 1178 if (expungetype == BLK_NOCOPY) 1179 return (0); 1180 panic("indiracct_ufs2: missing indir"); 1181 } 1182 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 1183 return (error); 1184 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 1185 panic("indiracct_ufs2: botched params"); 1186 /* 1187 * We have to expand bread here since it will deadlock looking 1188 * up the block number for any blocks that are not in the cache. 
1189 */ 1190 error = ffs_getblk(cancelvp, lbn, fsbtodb(fs, blkno), fs->fs_bsize, 1191 false, &bp); 1192 if (error) 1193 return error; 1194 if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error = 1195 rwfsblk(bp->b_vp, B_READ, bp->b_data, fragstoblks(fs, blkno)))) { 1196 brelse(bp, 0); 1197 return (error); 1198 } 1199 /* 1200 * Account for the block pointers in this indirect block. 1201 */ 1202 last = howmany(remblks, blksperindir); 1203 if (last > NINDIR(fs)) 1204 last = NINDIR(fs); 1205 bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); 1206 bcopy(bp->b_data, (void *)bap, fs->fs_bsize); 1207 brelse(bp, 0); 1208 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 1209 level == 0 ? rlbn : -1, expungetype); 1210 if (error || level == 0) 1211 goto out; 1212 /* 1213 * Account for the block pointers in each of the indirect blocks 1214 * in the levels below us. 1215 */ 1216 subblksperindir = blksperindir / NINDIR(fs); 1217 for (lbn++, level--, i = 0; i < last; i++) { 1218 error = indiracct_ufs2(snapvp, cancelvp, level, 1219 ufs_rw64(bap[i], ns), lbn, rlbn, remblks, subblksperindir, 1220 fs, acctfunc, expungetype); 1221 if (error) 1222 goto out; 1223 rlbn += blksperindir; 1224 lbn -= blksperindir; 1225 remblks -= blksperindir; 1226 } 1227out: 1228 FREE(bap, M_DEVBUF); 1229 return (error); 1230} 1231 1232/* 1233 * Do both snap accounting and map accounting. 1234 */ 1235static int 1236fullacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp, 1237 struct fs *fs, ufs_lbn_t lblkno, 1238 int exptype /* BLK_SNAP or BLK_NOCOPY */) 1239{ 1240 int error; 1241 1242 if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 1243 return (error); 1244 return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 1245} 1246 1247/* 1248 * Identify a set of blocks allocated in a snapshot inode. 
1249 */ 1250static int 1251snapacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp, 1252 struct fs *fs, ufs_lbn_t lblkno, 1253 int expungetype /* BLK_SNAP or BLK_NOCOPY */) 1254{ 1255 struct inode *ip = VTOI(vp); 1256 ufs2_daddr_t blkno, *blkp; 1257 ufs_lbn_t lbn; 1258 struct buf *ibp; 1259 int error, ns; 1260 1261 ns = UFS_FSNEEDSWAP(fs); 1262 1263 for ( ; oldblkp < lastblkp; oldblkp++) { 1264 blkno = ufs_rw64(*oldblkp, ns); 1265 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 1266 continue; 1267 lbn = fragstoblks(fs, blkno); 1268 if (lbn < NDADDR) { 1269 blkp = &ip->i_ffs2_db[lbn]; 1270 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1271 } else { 1272 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), 1273 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 1274 if (error) 1275 return (error); 1276 blkp = &((ufs2_daddr_t *)(ibp->b_data)) 1277 [(lbn - NDADDR) % NINDIR(fs)]; 1278 } 1279 /* 1280 * If we are expunging a snapshot vnode and we 1281 * find a block marked BLK_NOCOPY, then it is 1282 * one that has been allocated to this snapshot after 1283 * we took our current snapshot and can be ignored. 1284 */ 1285 blkno = ufs_rw64(*blkp, ns); 1286 if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) { 1287 if (lbn >= NDADDR) 1288 brelse(ibp, 0); 1289 } else { 1290 if (blkno != 0) 1291 panic("snapacct_ufs2: bad block"); 1292 *blkp = ufs_rw64(expungetype, ns); 1293 if (lbn >= NDADDR) 1294 bdwrite(ibp); 1295 } 1296 } 1297 return (0); 1298} 1299 1300/* 1301 * Account for a set of blocks allocated in a snapshot inode. 
 */
/*
 * mapacct_ufs2() details:
 *
 * Walks the block pointers in [oldblkp, lastblkp), incrementing lblkno
 * for each one.  Entries that are 0 or BLK_NOCOPY are skipped.  If
 * lblkno was not -1 on entry, expungetype is BLK_SNAP and the entry is
 * not BLK_SNAP, the current lblkno is appended to ip->i_snapblklist.
 * A BLK_SNAP entry is replaced by blkstofrags(fs, lblkno).  The
 * resulting block is then passed to ffs_blkfree().
 *
 * Always returns 0.
 */
static int
mapacct_ufs2(struct vnode *vp, ufs2_daddr_t *oldblkp, ufs2_daddr_t *lastblkp,
    struct fs *fs, ufs_lbn_t lblkno, int expungetype)
{
	ufs2_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit, ns;

	ns = UFS_FSNEEDSWAP(fs);
	ip = VTOI(vp);
	inum = ip->i_number;
	/* A starting lblkno of -1 disables recording in i_snapblklist. */
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = ufs_rw64(*oldblkp, ns);
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}
#endif /* defined(FFS_NO_SNAPSHOT) */

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 *
 * If ip is found on the in-core snapshot list its vnode is released
 * with vrele().  In either case the inode number is removed from
 * fs->fs_snapinum[] (later entries are shifted down) and the snapshot
 * generation counter si_gen is bumped.
 */
void
ffs_snapgone(struct inode *ip)
{
	struct mount *mp = ip->i_devvp->v_specmountpoint;
	struct inode *xp;
	struct fs *fs;
	struct snap_info *si;
	int snaploc;

	si = VFSTOUFS(mp)->um_snapinfo;

	/*
	 * Find snapshot in incore list.
	 */
	mutex_enter(&si->si_lock);
	TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	mutex_exit(&si->si_lock);
	if (xp != NULL)
		vrele(ITOV(ip));
#ifdef DEBUG
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %llu\n",
		    (unsigned long long)ip->i_number);
#endif
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	mutex_enter(&si->si_lock);
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		/* Shift the following entries down and zero the last one. */
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
	si->si_gen++;
	mutex_exit(&si->si_lock);
}

/*
 * Prepare a snapshot file for being removed.
 *
 * Takes the inode off the in-core snapshot list (if it is on it),
 * switches the vnode back to its private lock, clears BLK_NOCOPY and
 * BLK_SNAP markers from the block pointers, offers claimed blocks to
 * the remaining snapshots through ffs_snapblkfree(), and finally
 * clears SF_SNAPSHOT on the inode.
 */
void
ffs_snapremove(struct vnode *vp)
{
	struct inode *ip = VTOI(vp), *xp;
	struct vnode *devvp = ip->i_devvp;
	struct fs *fs = ip->i_fs;
	struct mount *mp = devvp->v_specmountpoint;
	struct vnlock *lkp;
	struct buf *ibp;
	struct snap_info *si;
	ufs2_daddr_t numblks, blkno, dblk;
	int error, ns, loc, last;

	si = VFSTOUFS(mp)->um_snapinfo;
	ns = UFS_FSNEEDSWAP(fs);
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		mutex_enter(&si->si_lock);
		vlockmgr(&vp->v_lock, LK_EXCLUSIVE);
		TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		/* Swap the shared snapshot lock for the vnode's own lock. */
		lkp = vp->v_vnlock;
		KASSERT(lkp == &si->si_vnlock);
		vp->v_vnlock = &vp->v_lock;
		vlockmgr(lkp, LK_RELEASE);
		if (TAILQ_FIRST(&si->si_snapshots) != 0) {
			/* Roll back the list of preallocated blocks. */
			xp = TAILQ_LAST(&si->si_snapshots, inodelst);
			si->si_snapblklist = xp->i_snapblklist;
		} else {
			si->si_snapblklist = 0;
			si->si_gen++;
			/* si_lock is dropped around fscow_disestablish(). */
			mutex_exit(&si->si_lock);
			fscow_disestablish(mp, ffs_copyonwrite, devvp);
			mutex_enter(&si->si_lock);
		}
		si->si_gen++;
		mutex_exit(&si->si_lock);
		FREE(ip->i_snapblklist, M_UFSMNT);
		ip->i_snapblklist = NULL;
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	/* The loop starts at 1; direct block 0 is not examined here. */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = db_get(ip, blkno);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			db_assign(ip, blkno, 0);
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
			db_assign(ip, blkno, 0);
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = ffs_balloc(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
		/* An indirect block that cannot be obtained is skipped. */
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			dblk = idb_get(ip, ibp->b_data, loc);
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				idb_assign(ip, ibp->b_data, loc, 0);
			else if (dblk == blkstofrags(fs, blkno) &&
			    ffs_snapblkfree(fs, ip->i_devvp, dblk,
			    fs->fs_bsize, ip->i_number)) {
				DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
				idb_assign(ip, ibp->b_data, loc, 0);
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP_ASSIGN(ip, flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}

/*
 * Notification that a block is being freed.
 * Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
1496 */ 1497int 1498ffs_snapblkfree(struct fs *fs, struct vnode *devvp, ufs2_daddr_t bno, 1499 long size, ino_t inum) 1500{ 1501 struct mount *mp = devvp->v_specmountpoint; 1502 struct buf *ibp; 1503 struct inode *ip; 1504 struct vnode *vp = NULL; 1505 struct snap_info *si; 1506 void *saved_data = NULL; 1507 ufs_lbn_t lbn; 1508 ufs2_daddr_t blkno; 1509 uint32_t gen; 1510 int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0; 1511 1512 si = VFSTOUFS(mp)->um_snapinfo; 1513 lbn = fragstoblks(fs, bno); 1514 mutex_enter(&si->si_lock); 1515retry: 1516 gen = si->si_gen; 1517 TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) { 1518 vp = ITOV(ip); 1519 if (snapshot_locked == 0) { 1520 if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 1521 mutex_exit(&si->si_lock); 1522 kpause("snaplock", false, 1, NULL); 1523 mutex_enter(&si->si_lock); 1524 goto retry; 1525 } 1526 snapshot_locked = 1; 1527 if (gen != si->si_gen) 1528 goto retry; 1529 } 1530 /* 1531 * Lookup block being written. 1532 */ 1533 if (lbn < NDADDR) { 1534 blkno = db_get(ip, lbn); 1535 } else { 1536 mutex_exit(&si->si_lock); 1537 error = ffs_balloc(vp, lblktosize(fs, (off_t)lbn), 1538 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 1539 if (error) { 1540 mutex_enter(&si->si_lock); 1541 break; 1542 } 1543 indiroff = (lbn - NDADDR) % NINDIR(fs); 1544 blkno = idb_get(ip, ibp->b_data, indiroff); 1545 mutex_enter(&si->si_lock); 1546 if (gen != si->si_gen) { 1547 brelse(ibp, 0); 1548 goto retry; 1549 } 1550 } 1551 /* 1552 * Check to see if block needs to be copied. 1553 */ 1554 if (blkno == 0) { 1555 /* 1556 * A block that we map is being freed. If it has not 1557 * been claimed yet, we will claim or copy it (below). 1558 */ 1559 claimedblk = 1; 1560 } else if (blkno == BLK_SNAP) { 1561 /* 1562 * No previous snapshot claimed the block, 1563 * so it will be freed and become a BLK_NOCOPY 1564 * (don't care) for us. 
1565 */ 1566 if (claimedblk) 1567 panic("snapblkfree: inconsistent block type"); 1568 if (lbn < NDADDR) { 1569 db_assign(ip, lbn, BLK_NOCOPY); 1570 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1571 } else { 1572 idb_assign(ip, ibp->b_data, indiroff, 1573 BLK_NOCOPY); 1574 mutex_exit(&si->si_lock); 1575 if (ip->i_ffs_effnlink > 0) 1576 bwrite(ibp); 1577 else 1578 bdwrite(ibp); 1579 mutex_enter(&si->si_lock); 1580 if (gen != si->si_gen) 1581 goto retry; 1582 } 1583 continue; 1584 } else /* BLK_NOCOPY or default */ { 1585 /* 1586 * If the snapshot has already copied the block 1587 * (default), or does not care about the block, 1588 * it is not needed. 1589 */ 1590 if (lbn >= NDADDR) 1591 brelse(ibp, 0); 1592 continue; 1593 } 1594 /* 1595 * If this is a full size block, we will just grab it 1596 * and assign it to the snapshot inode. Otherwise we 1597 * will proceed to copy it. See explanation for this 1598 * routine as to why only a single snapshot needs to 1599 * claim this block. 1600 */ 1601 if (size == fs->fs_bsize) { 1602#ifdef DEBUG 1603 if (snapdebug) 1604 printf("%s %llu lbn %" PRId64 1605 "from inum %llu\n", 1606 "Grabonremove: snapino", 1607 (unsigned long long)ip->i_number, 1608 lbn, (unsigned long long)inum); 1609#endif 1610 mutex_exit(&si->si_lock); 1611 if (lbn < NDADDR) { 1612 db_assign(ip, lbn, bno); 1613 } else { 1614 idb_assign(ip, ibp->b_data, indiroff, bno); 1615 if (ip->i_ffs_effnlink > 0) 1616 bwrite(ibp); 1617 else 1618 bdwrite(ibp); 1619 } 1620 DIP_ADD(ip, blocks, btodb(size)); 1621 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1622 VOP_UNLOCK(vp, 0); 1623 return (1); 1624 } 1625 if (lbn >= NDADDR) 1626 brelse(ibp, 0); 1627#ifdef DEBUG 1628 if (snapdebug) 1629 printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n", 1630 "Copyonremove: snapino ", 1631 (unsigned long long)ip->i_number, 1632 lbn, "for inum", (unsigned long long)inum, size); 1633#endif 1634 /* 1635 * If we have already read the old block contents, then 1636 * simply copy them to the new block. 
Note that we need 1637 * to synchronously write snapshots that have not been 1638 * unlinked, and hence will be visible after a crash, 1639 * to ensure their integrity. 1640 */ 1641 mutex_exit(&si->si_lock); 1642 if (saved_data == NULL) { 1643 saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK); 1644 error = rwfsblk(vp, B_READ, saved_data, lbn); 1645 if (error) { 1646 free(saved_data, M_UFSMNT); 1647 saved_data = NULL; 1648 mutex_enter(&si->si_lock); 1649 break; 1650 } 1651 } 1652 error = wrsnapblk(vp, saved_data, lbn); 1653 mutex_enter(&si->si_lock); 1654 if (error) 1655 break; 1656 if (gen != si->si_gen) 1657 goto retry; 1658 } 1659 mutex_exit(&si->si_lock); 1660 if (saved_data) 1661 free(saved_data, M_UFSMNT); 1662 /* 1663 * If we have been unable to allocate a block in which to do 1664 * the copy, then return non-zero so that the fragment will 1665 * not be freed. Although space will be lost, the snapshot 1666 * will stay consistent. 1667 */ 1668 if (snapshot_locked) 1669 VOP_UNLOCK(vp, 0); 1670 return (error); 1671} 1672 1673/* 1674 * Associate snapshot files when mounting. 1675 */ 1676void 1677ffs_snapshot_mount(struct mount *mp) 1678{ 1679 struct vnode *devvp = VFSTOUFS(mp)->um_devvp; 1680 struct fs *fs = VFSTOUFS(mp)->um_fs; 1681 struct lwp *l = curlwp; 1682 struct vnode *vp; 1683 struct inode *ip, *xp; 1684 struct snap_info *si; 1685 ufs2_daddr_t snaplistsize, *snapblklist; 1686 int i, error, ns, snaploc, loc; 1687 1688 /* 1689 * No persistent snapshots on apple ufs file systems. 1690 */ 1691 if (UFS_MPISAPPLEUFS(VFSTOUFS(mp))) 1692 return; 1693 1694 si = VFSTOUFS(mp)->um_snapinfo; 1695 ns = UFS_FSNEEDSWAP(fs); 1696 /* 1697 * XXX The following needs to be set before ffs_truncate or 1698 * VOP_READ can be called. 1699 */ 1700 mp->mnt_stat.f_iosize = fs->fs_bsize; 1701 /* 1702 * Process each snapshot listed in the superblock. 
1703 */ 1704 vp = NULL; 1705 mutex_enter(&si->si_lock); 1706 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { 1707 if (fs->fs_snapinum[snaploc] == 0) 1708 break; 1709 if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], 1710 &vp)) != 0) { 1711 printf("ffs_snapshot_mount: vget failed %d\n", error); 1712 continue; 1713 } 1714 ip = VTOI(vp); 1715 if ((ip->i_flags & SF_SNAPSHOT) == 0) { 1716 printf("ffs_snapshot_mount: non-snapshot inode %d\n", 1717 fs->fs_snapinum[snaploc]); 1718 vput(vp); 1719 vp = NULL; 1720 for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { 1721 if (fs->fs_snapinum[loc] == 0) 1722 break; 1723 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; 1724 } 1725 fs->fs_snapinum[loc - 1] = 0; 1726 snaploc--; 1727 continue; 1728 } 1729 1730 /* 1731 * Read the block hints list. Use an empty list on 1732 * read errors. 1733 */ 1734 error = vn_rdwr(UIO_READ, vp, 1735 (void *)&snaplistsize, sizeof(snaplistsize), 1736 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), 1737 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, 1738 l->l_cred, NULL, NULL); 1739 if (error) { 1740 printf("ffs_snapshot_mount: read_1 failed %d\n", error); 1741 snaplistsize = 1; 1742 } else 1743 snaplistsize = ufs_rw64(snaplistsize, ns); 1744 snapblklist = malloc( 1745 snaplistsize * sizeof(ufs2_daddr_t), M_UFSMNT, M_WAITOK); 1746 if (error) 1747 snapblklist[0] = 1; 1748 else { 1749 error = vn_rdwr(UIO_READ, vp, (void *)snapblklist, 1750 snaplistsize * sizeof(ufs2_daddr_t), 1751 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)), 1752 UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, 1753 l->l_cred, NULL, NULL); 1754 for (i = 0; i < snaplistsize; i++) 1755 snapblklist[i] = ufs_rw64(snapblklist[i], ns); 1756 if (error) { 1757 printf("ffs_snapshot_mount: read_2 failed %d\n", 1758 error); 1759 snapblklist[0] = 1; 1760 } 1761 } 1762 ip->i_snapblklist = &snapblklist[0]; 1763 1764 /* 1765 * Acquire the snapshot lock and give up our original 1766 * private lock. 
1767 */ 1768 VI_LOCK(vp); 1769 vp->v_vnlock = &si->si_vnlock; 1770 vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY); 1771 vlockmgr(&vp->v_lock, LK_RELEASE); 1772 /* 1773 * Link it onto the active snapshot list. 1774 */ 1775 if (ip->i_nextsnap.tqe_prev != 0) 1776 panic("ffs_snapshot_mount: %llu already on list", 1777 (unsigned long long)ip->i_number); 1778 else 1779 TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap); 1780 vp->v_vflag |= VV_SYSTEM; 1781 VOP_UNLOCK(vp, 0); 1782 } 1783 /* 1784 * No usable snapshots found. 1785 */ 1786 if (vp == NULL) { 1787 mutex_exit(&si->si_lock); 1788 return; 1789 } 1790 /* 1791 * Attach the block hints list. We always want to 1792 * use the list from the newest snapshot. 1793 */ 1794 xp = TAILQ_LAST(&si->si_snapshots, inodelst); 1795 si->si_snapblklist = xp->i_snapblklist; 1796 fscow_establish(mp, ffs_copyonwrite, devvp); 1797 si->si_gen++; 1798 mutex_exit(&si->si_lock); 1799} 1800 1801/* 1802 * Disassociate snapshot files when unmounting. 1803 */ 1804void 1805ffs_snapshot_unmount(struct mount *mp) 1806{ 1807 struct vnode *devvp = VFSTOUFS(mp)->um_devvp; 1808 struct inode *xp; 1809 struct vnode *vp = NULL; 1810 struct snap_info *si; 1811 1812 si = VFSTOUFS(mp)->um_snapinfo; 1813 mutex_enter(&si->si_lock); 1814 while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) { 1815 vp = ITOV(xp); 1816 vp->v_vnlock = &vp->v_lock; 1817 TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap); 1818 xp->i_nextsnap.tqe_prev = 0; 1819 if (xp->i_snapblklist == si->si_snapblklist) 1820 si->si_snapblklist = NULL; 1821 FREE(xp->i_snapblklist, M_UFSMNT); 1822 if (xp->i_ffs_effnlink > 0) { 1823 si->si_gen++; 1824 mutex_exit(&si->si_lock); 1825 vrele(vp); 1826 mutex_enter(&si->si_lock); 1827 } 1828 } 1829 if (vp) 1830 fscow_disestablish(mp, ffs_copyonwrite, devvp); 1831 si->si_gen++; 1832 mutex_exit(&si->si_lock); 1833} 1834 1835/* 1836 * Lookup a snapshots data block address. 1837 * Simpler than UFS_BALLOC() as we know all metadata is already allocated. 
 */
/*
 * snapblkaddr() details:
 *
 *	vp	snapshot vnode
 *	lbn	logical block number, must be >= 0
 *	res	receives the block address stored for lbn
 *
 * Direct blocks are read from the in-core inode.  For other blocks the
 * last indirect block on the path computed by ufs_getlbns() is read
 * with bread() and the entry at in_off is returned.
 *
 * Returns 0, or the error from ufs_getlbns() or bread(); *res is only
 * written on success.
 */
static int
snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res)
{
	struct indir indirs[NIADDR + 2];
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	struct buf *bp;
	int error, num;

	KASSERT(lbn >= 0);

	if (lbn < NDADDR) {
		*res = db_get(ip, lbn);
		return 0;
	}
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return error;
	error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, NOCRED, 0, &bp);
	if (error == 0)
		*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
	/* The buffer is released whether or not bread() succeeded. */
	brelse(bp, 0);

	return error;
}

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 *
 *	v		device vnode (registered as the callback cookie)
 *	bp		buffer about to be written
 *	data_valid	if true and bp covers a full block, bp->b_data is
 *			used as the data to copy instead of reading the
 *			block from disk
 *
 * Returns 0, or an error from snapblkaddr(), rwfsblk() or wrsnapblk().
 *
 * si_lock is dropped around every operation that may sleep.  Whenever
 * it is re-taken, si_gen is compared with the value sampled at `retry'
 * and the scan of the snapshot list is restarted if it changed.
 *
 * Note: ns is assigned but not used in this function.
 */
static int
ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
{
	struct fs *fs;
	struct inode *ip;
	struct vnode *devvp = v, *vp = NULL;
	struct mount *mp = devvp->v_specmountpoint;
	struct snap_info *si;
	void *saved_data = NULL;
	ufs2_daddr_t lbn, blkno, *snapblklist;
	uint32_t gen;
	int lower, upper, mid, ns, snapshot_locked = 0, error = 0;

	/*
	 * Check for valid snapshots.
	 */
	si = VFSTOUFS(mp)->um_snapinfo;
	mutex_enter(&si->si_lock);
	ip = TAILQ_FIRST(&si->si_snapshots);
	if (ip == NULL) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * First check to see if it is in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
	 */
	fs = ip->i_fs;
	ns = UFS_FSNEEDSWAP(fs);
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	snapblklist = si->si_snapblklist;
	/*
	 * Binary search over entries 1 .. list[0] - 1; element 0 of the
	 * list provides the upper bound.
	 */
	upper = si->si_snapblklist[0] - 1;
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	/* lower <= upper here means the loop ended on a match. */
	if (lower <= upper) {
		mutex_exit(&si->si_lock);
		return 0;
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
	if (data_valid && bp->b_bcount == fs->fs_bsize)
		saved_data = bp->b_data;
retry:
	gen = si->si_gen;
	TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in ffs_balloc.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied.
		 */
		if (lbn < NDADDR) {
			blkno = db_get(ip, lbn);
		} else {
			mutex_exit(&si->si_lock);
			if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
				mutex_enter(&si->si_lock);
				break;
			}
			mutex_enter(&si->si_lock);
			if (gen != si->si_gen)
				goto retry;
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		if (blkno != 0)
			continue;

		/*
		 * Take the vnode lock once, without sleeping while
		 * si_lock is held; on contention back off and rescan.
		 */
		if (snapshot_locked == 0) {
			if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
				mutex_exit(&si->si_lock);
				kpause("snaplock", false, 1, NULL);
				mutex_enter(&si->si_lock);
				goto retry;
			}
			snapshot_locked = 1;
			if (gen != si->si_gen)
				goto retry;

			/* Check again if block still needs to be copied */
			if (lbn < NDADDR) {
				blkno = db_get(ip, lbn);
			} else {
				mutex_exit(&si->si_lock);
				if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
					mutex_enter(&si->si_lock);
					break;
				}
				mutex_enter(&si->si_lock);
				if (gen != si->si_gen)
					goto retry;
			}

			if (blkno != 0)
				continue;
		}
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
			    (unsigned long long)ip->i_number, lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %llu", (unsigned long long)
				    VTOI(bp->b_vp)->i_number);
			printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		mutex_exit(&si->si_lock);
		if (saved_data == NULL) {
			saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
			error = rwfsblk(vp, B_READ, saved_data, lbn);
			if (error) {
				free(saved_data, M_UFSMNT);
				saved_data = NULL;
				mutex_enter(&si->si_lock);
				break;
			}
		}
		error = wrsnapblk(vp, saved_data, lbn);
		mutex_enter(&si->si_lock);
		if (error)
			break;
		if (gen != si->si_gen)
			goto retry;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	mutex_exit(&si->si_lock);
	/* Only free the copy buffer if it is not the caller's data. */
	if (saved_data && saved_data != bp->b_data)
		free(saved_data, M_UFSMNT);
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0);
	return error;
}

/*
 * Read or write the specified block of the filesystem vp resides on
 * from or to the disk bypassing UBC and the buffer cache.
2044 */ 2045static int 2046rwfsblk(struct vnode *vp, int flags, void *data, ufs2_daddr_t lbn) 2047{ 2048 int error; 2049 struct inode *ip = VTOI(vp); 2050 struct fs *fs = ip->i_fs; 2051 struct buf *nbp; 2052 2053 nbp = getiobuf(NULL, true); 2054 nbp->b_flags = flags; 2055 nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize; 2056 nbp->b_error = 0; 2057 nbp->b_data = data; 2058 nbp->b_blkno = nbp->b_rawblkno = fsbtodb(fs, blkstofrags(fs, lbn)); 2059 nbp->b_proc = NULL; 2060 nbp->b_dev = ip->i_devvp->v_rdev; 2061 SET(nbp->b_cflags, BC_BUSY); /* mark buffer busy */ 2062 2063 bdev_strategy(nbp); 2064 2065 error = biowait(nbp); 2066 2067 putiobuf(nbp); 2068 2069 return error; 2070} 2071 2072/* 2073 * Write the specified block to a snapshot. 2074 */ 2075static int 2076wrsnapblk(struct vnode *vp, void *data, ufs2_daddr_t lbn) 2077{ 2078 struct inode *ip = VTOI(vp); 2079 struct fs *fs = ip->i_fs; 2080 off_t offset; 2081 int error, flags; 2082 2083 offset = lblktosize(fs, (off_t)lbn); 2084 flags = IO_NODELOCKED|IO_UNIT; 2085 if (ip->i_ffs_effnlink > 0) 2086 flags |= IO_SYNC; 2087 error = vn_rdwr(UIO_WRITE, vp, data, fs->fs_bsize, offset, 2088 UIO_SYSSPACE, flags, curlwp->l_cred, NULL, NULL); 2089 2090 if (!error && curlwp == uvm.pagedaemon_lwp) { 2091 mutex_enter(&vp->v_interlock); 2092 error = VOP_PUTPAGES(vp, 2093 trunc_page(offset), round_page(offset+fs->fs_bsize), 2094 PGO_CLEANIT|PGO_SYNCIO|PGO_FREE); 2095 } 2096 2097 return error; 2098} 2099 2100/* 2101 * Get/Put direct block from inode or buffer containing disk addresses. Take 2102 * care for fs type (UFS1/UFS2) and byte swapping. These functions should go 2103 * into a global include. 
2104 */ 2105static inline ufs2_daddr_t 2106db_get(struct inode *ip, int loc) 2107{ 2108 if (ip->i_ump->um_fstype == UFS1) 2109 return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip)); 2110 else 2111 return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip)); 2112} 2113 2114static inline void 2115db_assign(struct inode *ip, int loc, ufs2_daddr_t val) 2116{ 2117 if (ip->i_ump->um_fstype == UFS1) 2118 ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2119 else 2120 ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2121} 2122 2123static inline ufs2_daddr_t 2124idb_get(struct inode *ip, void *bf, int loc) 2125{ 2126 if (ip->i_ump->um_fstype == UFS1) 2127 return ufs_rw32(((ufs1_daddr_t *)(bf))[loc], 2128 UFS_IPNEEDSWAP(ip)); 2129 else 2130 return ufs_rw64(((ufs2_daddr_t *)(bf))[loc], 2131 UFS_IPNEEDSWAP(ip)); 2132} 2133 2134static inline void 2135idb_assign(struct inode *ip, void *bf, int loc, ufs2_daddr_t val) 2136{ 2137 if (ip->i_ump->um_fstype == UFS1) 2138 ((ufs1_daddr_t *)(bf))[loc] = 2139 ufs_rw32(val, UFS_IPNEEDSWAP(ip)); 2140 else 2141 ((ufs2_daddr_t *)(bf))[loc] = 2142 ufs_rw64(val, UFS_IPNEEDSWAP(ip)); 2143} 2144